In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import csv
import pprint

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Others
import math
from datetime import datetime, timedelta

# Configure visualisations
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8

In [150]:
weather_train = pd.read_csv(
    'datas/train/weather_(table_7)_training_update.csv'
)
weather_test = pd.read_csv('datas/test/weather_(table_7)_test1.csv')

weather_full = pd.concat([weather_train, weather_test]).reset_index()
weather_full.precipitation = weather_full.precipitation * 4

In [151]:
def get_precipitation_level(precipitation):
    if precipitation == 0.0:
        return 'Sunny'
    if precipitation < 5.0:
        return 'Light rain'
    if 5.0 <= precipitation < 10.0:
        return 'Rain'
    if 10.0 <= precipitation:
        return 'Heavy rain'

In [152]:
weather_full['precipitation_level'] = weather_full.precipitation.map(
    get_precipitation_level)

In [153]:
weather_full.precipitation_level.value_counts()

Sunny         797
Light rain     78
Heavy rain     22
Rain           21
Name: precipitation_level, dtype: int64

In [154]:
pl_dummies = pd.get_dummies(weather_full.precipitation_level)

In [155]:
weather_feature = weather_full[['date', 'hour']].join(pl_dummies)

In [1]:
def get_py_datetime(row):
    date_str = str(row[0]) + ' ' + str(row[1])
    return datetime.strptime(date_str,"%Y-%m-%d %H")

# 升采样并前向填充

In [157]:
weather_feature.index = weather_feature.apply(get_py_datetime, axis=1)
weather_feature = weather_feature.resample('20min').first()
weather_feature = weather_feature.fillna(method='ffill')
weather_feature['diff'] = abs(weather_feature.index.map(
    lambda dt: dt.hour) - weather_feature.hour)
weather_feature[weather_feature['diff']>=3] = np.nan
del weather_feature['diff']

In [158]:
weather_feature.to_pickle('weather_feature.pkl')

In [159]:
full_task2 = pd.read_pickle('task2_df.pkl').join(weather_feature)

In [160]:
del full_task2['hour']
del full_task2['date']

In [162]:
full_task2.columns = ['T1D0', 'T1D1', 'T2D0', 'T3D0', 'T3D1', 'DayOfWeek', 'Holiday',
                      'WorkingDay', 'Weekend', 'HeavyRain', 'LightRain', 'Rain', 'Sunny']

In [163]:
holiday_col = ['DayOfWeek', 'Holiday', 'WorkingDay', 'Weekend']
weather_col = ['HeavyRain', 'LightRain', 'Rain', 'Sunny']

In [164]:
full_task2[holiday_col] = full_task2[holiday_col].applymap(float)

In [166]:
full_task2.to_pickle('full_task2_df.pkl')

In [169]:
weather_feature[weather_feature == np.nan]

Unnamed: 0,date,hour,Heavy rain,Light rain,Rain,Sunny
2016-07-01 00:00:00,,,,,,
2016-07-01 00:20:00,,,,,,
2016-07-01 00:40:00,,,,,,
2016-07-01 01:00:00,,,,,,
2016-07-01 01:20:00,,,,,,
2016-07-01 01:40:00,,,,,,
2016-07-01 02:00:00,,,,,,
2016-07-01 02:20:00,,,,,,
2016-07-01 02:40:00,,,,,,
2016-07-01 03:00:00,,,,,,
