### 사용하는 데이터 
- daily-total-female-births-in-cal.csv
- daily-minimum-temperatures-in-me.csv

In [10]:
import pandas as pd
from pandas import Series, DataFrame


# 1. Load and Explore Data

## 1-1. Load

In [2]:
from pandas import Series
# Series.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True mirrors from_csv's defaults;
# squeezing the single remaining column turns the DataFrame into a Series.
series = pd.read_csv(
    'daily-total-female-births-in-cal.csv',
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze('columns')
print(series.head())

Date
1959-01-01    35
1959-01-02    32
1959-01-03    30
1959-01-04    31
1959-01-05    44
Name: Daily total female births in California, 1959, dtype: int64


  infer_datetime_format=infer_datetime_format)


In [35]:
# header=0: the first row holds the column names.
# index_col=0: the first column (Date) becomes the index. parse_dates is not
# passed, so the index is not parsed as dates at this point.
# The `squeeze=True` keyword was deprecated in pandas 1.4 and removed in 2.0;
# calling .squeeze('columns') on the one-column DataFrame returns a Series
# instead of a DataFrame.
series = pd.read_csv(
    'daily-total-female-births-in-cal.csv',
    header=0,
    index_col=0,
).squeeze('columns')
print(series.head())

Date
1959-01-01    35
1959-01-02    32
1959-01-03    30
1959-01-04    31
1959-01-05    44
Name: Daily total female births in California, 1959, dtype: int64


In [36]:
# Inspect the last three entries to check how the file ends.
print(series.tail(3))

Date
1959-12-30                                   55
1959-12-31                                   50
Daily total female births in California    1959
Name: Daily total female births in California, 1959, dtype: int64


## 1-2. Explore

1) 마지막 row 제거

In [37]:
# Number of entries currently held in the series.
print(series.size)

366


In [38]:
# Drop the final entry by position, keeping everything before it.
series = series.iloc[:-1]

In [39]:
# Re-check the end of the series and its length after removing the last row.
print(series.tail(3))
print(series.size)

Date
1959-12-29    48
1959-12-30    55
1959-12-31    50
Name: Daily total female births in California, 1959, dtype: int64
365


2) Query By Time

In [46]:
# Convert the index to datetime so the series can be queried by time.
series.index = pd.to_datetime(series.index)

In [47]:
# Partial-string lookup on the datetime index: every entry in January 1959.
print(series.loc['1959-01'])

Date
1959-01-01    35
1959-01-02    32
1959-01-03    30
1959-01-04    31
1959-01-05    44
1959-01-06    29
1959-01-07    45
1959-01-08    43
1959-01-09    38
1959-01-10    27
1959-01-11    38
1959-01-12    33
1959-01-13    55
1959-01-14    47
1959-01-15    45
1959-01-16    37
1959-01-17    50
1959-01-18    43
1959-01-19    41
1959-01-20    52
1959-01-21    34
1959-01-22    53
1959-01-23    39
1959-01-24    32
1959-01-25    37
1959-01-26    43
1959-01-27    39
1959-01-28    35
1959-01-29    44
1959-01-30    38
1959-01-31    24
Name: Daily total female births in California, 1959, dtype: int64


3) 요약 통계량

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the series.
series.describe()

count    365.000000
mean      41.980822
std        7.348257
min       23.000000
25%       37.000000
50%       42.000000
75%       46.000000
max       73.000000
Name: Daily total female births in California, 1959, dtype: float64

# 2. Basic Feature Engineering

### 1. Date Time Features

In [58]:
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines='warn' keeps the same intent: skip a malformed row and report it.
# `squeeze=True` was removed in 2.0 as well, so the one-column DataFrame is
# squeezed into a Series after loading.
series = pd.read_csv(
    'daily-minimum-temperatures-in-me.csv',
    header=0,
    index_col=0,
    parse_dates=[0],
    on_bad_lines='warn',
).squeeze('columns')

b'Skipping line 3653: expected 2 fields, saw 3\n'


In [61]:
# Type conversion: make sure the index holds datetime values.
series.index = pd.to_datetime(series.index)

In [64]:
# Look at both ends of the series and its length.
print(series.head())
print(series.tail())
print(series.size)

Date
1981-01-01    20.7
1981-01-02    17.9
1981-01-03    18.8
1981-01-04    14.6
1981-01-05    15.8
Name: Daily minimum temperatures in Melbourne, Australia, 1981-1990, dtype: object
Date
1990-12-27    14.0
1990-12-28    13.6
1990-12-29    13.5
1990-12-30    15.7
1990-12-31    13.0
Name: Daily minimum temperatures in Melbourne, Australia, 1981-1990, dtype: object
3650


In [70]:
# Date-time features: keep the observation as 'temp' and derive the calendar
# month and day-of-month from the datetime index.
# Building the frame directly from the values avoids creating the original
# long-named column only to delete it again, and the vectorized
# .month / .day accessors replace the per-row Python loops.
dataframe = DataFrame({'temp': series.values}, index=series.index)
dataframe['month'] = series.index.month
dataframe['day'] = series.index.day
dataframe.head()

Unnamed: 0_level_0,temp,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1981-01-01,20.7,1,1
1981-01-02,17.9,1,2
1981-01-03,18.8,1,3
1981-01-04,14.6,1,4
1981-01-05,15.8,1,5


- Minutes elapsed for day
- Hour of day
- Business hours or not
- Weekend or not
- Season of the year
- Business quarter of the year
- Daylight savings or not
- Public holiday or not
- Leap year or not

등의 정보를 추출할 수 있다.

### 2. Lag Features

In [79]:
# Wrap the series in a one-column DataFrame for the lag-feature examples.
temps = DataFrame(series)

In [83]:
# One-step lag: column 't' is the previous row's value, 't+1' the current one.
# A shift of 0 leaves the values where they are.
dataframe = pd.concat([temps.shift(lag) for lag in (1, 0)], axis=1)
dataframe.columns = ['t', 't+1']
print(dataframe.head())

               t   t+1
Date                  
1981-01-01   NaN  20.7
1981-01-02  20.7  17.9
1981-01-03  17.9  18.8
1981-01-04  18.8  14.6
1981-01-05  14.6  15.8


In [87]:
# Three lagged copies (shifted by 3, 2 and 1 rows) next to the unshifted
# values, ordered from the oldest lag to the current observation.
dataframe = pd.concat([temps.shift(lag) for lag in (3, 2, 1, 0)], axis=1)
dataframe.columns = ['t-2', 't-1', 't', 't+1']
print(dataframe.head())

             t-2   t-1     t   t+1
Date                              
1981-01-01   NaN   NaN   NaN  20.7
1981-01-02   NaN   NaN  20.7  17.9
1981-01-03   NaN  20.7  17.9  18.8
1981-01-04  20.7  17.9  18.8  14.6
1981-01-05  17.9  18.8  14.6  15.8


### 3. Rolling Window Statistics

- Rolling: Rolling 하면서 해당하는 요약 통계량을 리턴한다.

In [217]:
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines='warn' keeps the same intent: skip a malformed row and report it.
df = pd.read_csv(
    'daily-minimum-temperatures-in-me.csv',
    header=0,
    index_col=0,
    parse_dates=[0],
    on_bad_lines='warn',
)
# Make sure the index holds datetime values.
df.index = pd.to_datetime(df.index)

b'Skipping line 3653: expected 2 fields, saw 3\n'


- Garbage 제거

In [218]:
# Give the single data column a short name.
df.columns = ['temps']

In [219]:
def clean(x):
    """Convert *x* to ``float``, or return ``None`` if it cannot be converted.

    Args:
        x: Any value accepted by ``float()``, e.g. a number or numeric string.

    Returns:
        The value as a ``float``, or ``None`` when ``float(x)`` raises a
        conversion error (non-numeric text, unsupported type, or an integer
        too large for a float).
    """
    try:
        return float(x)
    # Catch only conversion failures: a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return None

In [220]:
# Run clean() on every entry of the column; entries it cannot convert come
# back as missing values.
df['temps'] = df['temps'].apply(clean)

In [221]:
# Forward-fill missing values: each gap takes the last valid value before it.
# fillna(method='ffill') is deprecated in recent pandas releases; ffill() is
# the direct equivalent and is also available in older versions.
df.ffill(inplace=True)

In [223]:
# Rolling-window feature: the mean of the two observations preceding each row.
# Shifting by one first keeps the current value out of its own window.
shifted = df['temps'].shift(1)
window = shifted.rolling(window=2)
means = window.mean()
dataframe = pd.concat([means, df], axis=1)
# Without explicit labels both columns would be called 'temps'; name them the
# same way the lag-feature frames are labelled.
dataframe.columns = ['mean(t-2,t-1)', 't+1']

In [225]:
# Preview the first rows; the rolling mean is NaN until its window is full.
dataframe.head()

Unnamed: 0_level_0,temps,temps
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1981-01-01,,20.7
1981-01-02,,17.9
1981-01-03,19.3,18.8
1981-01-04,18.35,14.6
1981-01-05,16.7,15.8


- Expanding: 이전 모든 데이터에 대해서 요약 통계량을 구한다.

In [226]:
# Work with the cleaned temperature column as a Series.
temps = df['temps']

In [228]:
# Expanding-window features: running min / mean / max over all values seen so
# far, placed beside the next row's value ('t+1').
window = temps.expanding()
dataframe = DataFrame({
    'min': window.min(),
    'mean': window.mean(),
    'max': window.max(),
    't+1': temps.shift(-1),
})
dataframe.head(10)

Unnamed: 0_level_0,min,mean,max,t+1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1981-01-01,20.7,20.7,20.7,17.9
1981-01-02,17.9,19.3,20.7,18.8
1981-01-03,17.9,19.133333,20.7,14.6
1981-01-04,14.6,18.0,20.7,15.8
1981-01-05,14.6,17.56,20.7,15.8
1981-01-06,14.6,17.266667,20.7,15.8
1981-01-07,14.6,17.057143,20.7,17.4
1981-01-08,14.6,17.1,20.7,21.8
1981-01-09,14.6,17.622222,21.8,20.0
1981-01-10,14.6,17.86,21.8,16.2
