In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [2]:
# Read the raw table from the CSV file, then print its column summary.
train_test_df = pd.read_csv(filepath_or_buffer="./B02.csv")
train_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 12 columns):
datetime         247 non-null object
y                207 non-null float64
week             247 non-null object
soldout          247 non-null int64
name             247 non-null object
kcal             202 non-null float64
remarks          28 non-null object
event            17 non-null object
payday           12 non-null float64
weather          247 non-null object
precipitation    247 non-null object
temperature      247 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 23.2+ KB


In [3]:
# Number of missing values in each column.
train_test_df.isna().sum()

datetime           0
y                 40
week               0
soldout            0
name               0
kcal              45
remarks          219
event            230
payday           235
weather            0
precipitation      0
temperature        0
dtype: int64

In [4]:
from my_modules.my_encoder import CategoryValueEncoder as CVE
from my_modules.my_encoder import TextValueEncoder as TVE
from my_modules.my_encoder import DateValueEncoder as DVE
from my_modules.my_encoder import NumericValueEncoder as NVE

# Build the encoded feature table `encode_df` from the raw `train_test_df`.
# NOTE(review): NVE(...).normalize() and CVE(...).to_label_encoding() come from the
# project-local my_encoder module; their exact scaling / labelling rules are not
# visible in this notebook -- confirm against that module.
encode_df = pd.DataFrame()

# Temporary variables
datetime = pd.to_datetime(train_test_df['datetime'])
# ISO week number. Series.dt.week is deprecated and was removed in pandas 2.0, so
# use dt.isocalendar() when it exists and fall back to dt.week on older pandas.
if hasattr(datetime.dt, 'isocalendar'):
    week = datetime.dt.isocalendar().week.astype(int)
else:
    week = datetime.dt.week

# Submission columns
encode_df['datetime'] = train_test_df['datetime']
encode_df['y'] = train_test_df['y']

# Date-derived training columns
# Elapsed time since the first row, expressed in days (1-based). Dividing by a
# one-day Timedelta avoids casting to raw nanosecond integers.
date_interval = datetime - datetime.iloc[0]
encode_df['dateinterval'] = NVE(date_interval / pd.Timedelta(days=1) + 1).normalize()
encode_df['dayofmonth'] = NVE(datetime.dt.day).normalize()
# Weeks numbered below 47 are shifted by 52. The first row (2013-11-18) falls in
# ISO week 47, so the shift keeps the value increasing across the year boundary.
# NOTE(review): rows in week >= 47 of the following year are not shifted -- confirm
# this is intended.
encode_df['weekofyear'] = NVE(week.apply(lambda x: x + 52 if x < 47 else x)).normalize()
encode_df['dayofweek'] = NVE(datetime.dt.dayofweek).normalize()

# Other training columns
# NOTE(review): the small constant added to the flag/label columns below has no
# stated purpose in this notebook (possibly to avoid exact zeros) -- confirm.
flag_offset = 0.001
kcal = train_test_df['kcal']
encode_df['soldout'] = train_test_df['soldout'] + flag_offset
# Missing kcal values are filled with the column median / mean, computed over all rows.
encode_df['kcal_fill_median'] = NVE(kcal.fillna(kcal.median())).normalize()
encode_df['kcal_fill_mean'] = NVE(kcal.fillna(kcal.mean())).normalize()
# remarks / event are label-encoded via CVE rather than reduced to a present/absent flag.
encode_df['remarks'] = CVE(train_test_df['remarks']).to_label_encoding() + flag_offset
encode_df['event'] = CVE(train_test_df['event']).to_label_encoding() + flag_offset
encode_df['payday'] = train_test_df['payday'].fillna(0.0) + flag_offset

# Weather training columns
# Hand-assigned numeric score per weather label; labels missing from this mapping
# become NaN (Series.map behaviour).
weather = train_test_df['weather'].map( {'快晴': 1.0, '晴れ': 2.0,
                                         '薄曇': 3.0, '曇': 4.0,
                                         '雪': 5.0, '雨': 8.0, '雷電': 10.0
                                        } ).astype(float)
encode_df['weather'] = NVE(weather).normalize()
# '--' entries in precipitation are replaced by '0' before the float conversion.
encode_df['precipitation'] = NVE(train_test_df['precipitation'].replace('--', '0').astype(float)).normalize()
encode_df['temperature'] = NVE(train_test_df['temperature']).normalize()

In [5]:
# Print the column summary (non-null counts and dtypes) of the encoded frame.
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 15 columns):
datetime            247 non-null object
y                   207 non-null float64
dateinterval        247 non-null float64
dayofmonth          247 non-null float64
weekofyear          247 non-null float64
dayofweek           247 non-null float64
soldout             247 non-null float64
kcal_fill_median    247 non-null float64
kcal_fill_mean      247 non-null float64
remarks             247 non-null float64
event               247 non-null float64
payday              247 non-null float64
weather             247 non-null float64
precipitation       247 non-null float64
temperature         247 non-null float64
dtypes: float64(14), object(1)
memory usage: 29.0+ KB


In [6]:
# Preview only the first rows; a 200-row dump floods the notebook output.
encode_df.head(10)

Unnamed: 0,datetime,y,dateinterval,dayofmonth,weekofyear,dayofweek,soldout,kcal_fill_median,kcal_fill_mean,remarks,event,payday,weather,precipitation,temperature
0,2013-11-18,90.0,0.32,0.52,0.34,0.35,0.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.51
1,2013-11-19,101.0,0.33,0.53,0.34,0.42,1.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.47
2,2013-11-20,118.0,0.33,0.54,0.34,0.50,0.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.45
3,2013-11-21,120.0,0.33,0.56,0.34,0.57,1.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.45
4,2013-11-22,130.0,0.33,0.57,0.34,0.64,1.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.46
5,2013-11-25,135.0,0.33,0.60,0.34,0.35,1.001,0.51,0.50,0.001,0.001,0.001,0.54,0.48,0.44
6,2013-11-26,145.0,0.33,0.61,0.34,0.42,0.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.48
7,2013-11-27,140.0,0.33,0.62,0.34,0.50,1.001,0.51,0.50,0.001,0.001,0.001,0.45,0.48,0.44
8,2013-11-28,151.0,0.33,0.64,0.34,0.57,0.001,0.51,0.50,0.001,0.001,0.001,0.49,0.48,0.48
9,2013-11-29,116.0,0.34,0.65,0.34,0.64,0.001,0.51,0.50,0.001,0.001,0.001,0.40,0.48,0.41


In [7]:
# Inspect the rows at positions 190 up to (not including) 210.
encode_df.iloc[190:210]

Unnamed: 0,datetime,y,dateinterval,dayofmonth,weekofyear,dayofweek,soldout,kcal_fill_median,kcal_fill_mean,remarks,event,payday,weather,precipitation,temperature
190,2014-9-4,54.0,0.59,0.36,0.6,0.57,1.001,0.49,0.49,0.001,0.001,0.001,0.54,0.48,0.59
191,2014-9-5,43.0,0.59,0.37,0.6,0.64,0.001,0.39,0.39,0.001,0.001,0.001,0.45,0.48,0.64
192,2014-9-8,68.0,0.6,0.41,0.61,0.35,1.001,0.41,0.41,0.001,0.001,0.001,0.54,0.48,0.55
193,2014-9-9,63.0,0.6,0.42,0.61,0.42,0.001,0.7,0.7,0.001,0.001,0.001,0.45,0.48,0.59
194,2014-9-10,54.0,0.6,0.43,0.61,0.5,0.001,0.66,0.67,0.001,0.001,1.001,0.54,0.48,0.57
195,2014-9-11,53.0,0.6,0.44,0.61,0.57,0.001,0.41,0.41,0.001,0.001,0.001,0.54,0.48,0.53
196,2014-9-12,115.0,0.6,0.45,0.61,0.64,0.001,0.51,0.5,3.001,0.001,0.001,0.45,0.48,0.6
197,2014-9-16,56.0,0.6,0.5,0.62,0.42,0.001,0.62,0.62,0.001,0.001,0.001,0.45,0.48,0.62
198,2014-9-17,49.0,0.6,0.51,0.62,0.5,0.001,0.58,0.59,4.001,1.001,0.001,0.54,0.48,0.57
199,2014-9-18,46.0,0.61,0.52,0.62,0.57,0.001,0.45,0.45,0.001,0.001,0.001,0.54,0.48,0.56


In [8]:
# Training split: the first 207 rows by position.
# NOTE(review): 207 equals the non-null count of `y` reported by info();
# presumably these are exactly the labelled rows -- verify.
train_df = encode_df.iloc[:207]

In [9]:
# Write the training split to CSV without the index column.
train_df.to_csv(path_or_buf="C01_train_encoded.csv", index=False)

In [10]:
# Test split: every row from position 207 onward.
test_df = encode_df.iloc[207:]

In [11]:
# Write the test split to CSV without the index column.
test_df.to_csv(path_or_buf="C01_test_encoded.csv", index=False)