# 데이터 전처리
* 본 데이터는 plant의 info가 파일명에 기입되어 있다. 
    * plant의 info만을 가지고 있는 pandas dataframe 형식의 변수를 만들어준다.
* 본 데이터는 plant 별 타임시리즈 형태의 발전량 데이터가 csv 파일로 저장되어 있다. 
    * 백개가 넘는 개별 csv 파일로 관리하기 힘들기 때문에 하나의 파일로 변경해준다.
    * 이 때, plant의 id별로 구별되어야 한다.

# Data preprocessing
* This data contains plant information in the file name.
    * It creates a pandas dataframe type variable that has only plant information.
* In this data, power generation data in the form of time series for each plant are stored as csv files.
    * Since it is difficult to manage with more than a hundred individual csv files, it is changed to a single file.
    * At this time, plants must be distinguished as plant id.

In [1]:
import pandas as pd

In [2]:
import os

## load file names (파일명 불러오기)
* 단, 리파지토리에 5분단위 데이터와 60분 단위 데이터가 섞여있기 때문에 이를 분리시켜준다.

In [16]:
def search(dirname):
    filenames = os.listdir(dirname)
    full_filenames = []
    for filename in filenames:
        full_filename = os.path.join(dirname, filename)
#         print(full_filename)
        full_filenames.append(full_filename)
        
    return full_filenames

In [4]:
pwd

'/Users/hanhyeseung/hshan/basic_work/0_data/3_pandas_dataframe'

In [5]:
filenames = search('/Users/hanhyeseung/hshan/basic_work/0_data/raw_data/az-pv-2006/')

In [6]:
hourly_full_names = []
for fullname in filenames:
    if fullname.split('/')[-1].split('_')[0] == 'HA4':
        hourly_full_names.append(fullname)

In [13]:
hourly_full_names[:5]

['/Users/hanhyeseung/hshan/basic_work/0_data/raw_data/az-pv-2006/HA4_32.55_-112.05_2006_UPV_100MW_60_Min.csv',
 '/Users/hanhyeseung/hshan/basic_work/0_data/raw_data/az-pv-2006/HA4_33.85_-112.55_2006_UPV_28MW_60_Min.csv',
 '/Users/hanhyeseung/hshan/basic_work/0_data/raw_data/az-pv-2006/HA4_33.95_-114.25_2006_UPV_75MW_60_Min.csv',
 '/Users/hanhyeseung/hshan/basic_work/0_data/raw_data/az-pv-2006/HA4_35.05_-114.45_2006_UPV_100MW_60_Min.csv',
 '/Users/hanhyeseung/hshan/basic_work/0_data/raw_data/az-pv-2006/HA4_34.65_-114.15_2006_UPV_150MW_60_Min.csv']

## make plant info dataframe

In [25]:
plant_info = pd.DataFrame()
for idx, file in enumerate(hourly_full_names):
    pid = idx +1
    lat = hourly_full_names[idx].split('/')[-1].split('_')[1]
    lng = hourly_full_names[idx].split('/')[-1].split('_')[2]
    cap = hourly_full_names[idx].split('/')[-1].split('_')[5][:-2]
    
    plant_info = plant_info.append(pd.Series([pid, lat, lng, cap]), ignore_index=True)
#     print(pid, lat, lng, cap)
plant_info.columns=['plant_id', 'latitude', 'longitude', 'capacity']

In [26]:
plant_info.head()

Unnamed: 0,plant_id,latitude,longitude,capacity
0,1.0,32.55,-112.05,100
1,2.0,33.85,-112.55,28
2,3.0,33.95,-114.25,75
3,4.0,35.05,-114.45,100
4,5.0,34.65,-114.15,150


## save pv power as a single csv

In [27]:
yield_data = pd.DataFrame()
for idx, file in enumerate(hourly_full_names):
    pid = idx +1
    
    data = pd.read_csv(file)
    data['plant_id'] = pid
    
    yield_data = yield_data.append(data)
yield_data['dt'] = pd.to_datetime(yield_data['LocalTime']).dt.tz_localize(tz='Asia/Seoul')
yield_data.set_index('dt', inplace=True)
yield_data.drop(['LocalTime'], axis=1, inplace=True)

# yield_data.to_csv('/Users/hanhyeseung/hshan/basic_work/0_data/data/arizona_pv_data.csv')

In [39]:
yield_data.head()

Unnamed: 0_level_0,Power(MW),plant_id
dt,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-01-01 00:00:00+09:00,0.0,1
2006-01-01 01:00:00+09:00,0.0,1
2006-01-01 02:00:00+09:00,0.0,1
2006-01-01 03:00:00+09:00,0.0,1
2006-01-01 04:00:00+09:00,0.0,1
