In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# データの読み込み

In [10]:
# Load the training and test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# header=None: the first row of sample.csv is read as data, not as column names,
# so its columns get integer labels. This frame later receives the predictions
# and is written out as the submission file.
sample = pd.read_csv("sample.csv", header = None)

# データの型の確認

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 12 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
dtypes: float64(3), int64(2), object(7)
memory usage: 19.5+ KB


# 各列に欠損値が1つ以上あるか確認

In [12]:
train.isnull().any()

datetime         False
y                False
week             False
soldout          False
name             False
kcal              True
remarks           True
event             True
payday            True
weather          False
precipitation    False
temperature      False
dtype: bool

# 各列に欠損値が幾つあるか

In [13]:
train.isnull().sum()

datetime           0
y                  0
week               0
soldout            0
name               0
kcal              41
remarks          186
event            193
payday           197
weather            0
precipitation      0
temperature        0
dtype: int64

# 欠損値の処理

In [14]:
# Display a copy of train in which every missing value is replaced by 0.
# NOTE(review): the result is not assigned, so train itself still contains
# its NaN values after this cell (later outputs of train show them).
train.fillna(0)

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature
0,2013-11-18,90,月,0,厚切りイカフライ,0.0,0,0,0.0,快晴,--,19.8
1,2013-11-19,101,火,1,手作りヒレカツ,0.0,0,0,0.0,快晴,--,17.0
2,2013-11-20,118,水,0,白身魚唐揚げ野菜あん,0.0,0,0,0.0,快晴,--,15.5
3,2013-11-21,120,木,1,若鶏ピリ辛焼,0.0,0,0,0.0,快晴,--,15.2
4,2013-11-22,130,金,1,ビッグメンチカツ,0.0,0,0,0.0,快晴,--,16.1
5,2013-11-25,135,月,1,鶏の唐揚,0.0,0,0,0.0,曇,--,14.6
6,2013-11-26,145,火,0,豚のスタミナ炒め,0.0,0,0,0.0,快晴,--,17.9
7,2013-11-27,140,水,1,ボローニャ風カツ,0.0,0,0,0.0,晴れ,--,14.7
8,2013-11-28,151,木,0,ハンバーグ,0.0,0,0,0.0,薄曇,--,17.7
9,2013-11-29,116,金,0,タルタルinソーセージカツ,0.0,0,0,0.0,快晴,--,12.1


# trainのweekの各値がいくつあるか？

In [15]:
train["week"].value_counts()

水    43
木    43
金    41
火    41
月    39
Name: week, dtype: int64

# trainのweekをダミー変数に変換

In [16]:
pd.get_dummies(train["week"])

Unnamed: 0,月,木,水,火,金
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,0,0,1
5,1,0,0,0,0
6,0,0,0,1,0
7,0,0,1,0,0
8,0,1,0,0,0
9,0,0,0,0,1


# trainのweatherをダミー変数に変換

In [17]:
pd.get_dummies(train["weather"])

Unnamed: 0,快晴,晴れ,曇,薄曇,雨,雪,雷電
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
5,0,0,1,0,0,0,0
6,1,0,0,0,0,0,0
7,0,1,0,0,0,0,0
8,0,0,0,1,0,0,0
9,1,0,0,0,0,0,0


# trainからweekとtemperatureとweatherを抜き出し、ダミー変数化したものを変数trainXに代入

In [18]:
# Pick the two string columns (week, weather) together with temperature, then
# one-hot encode: get_dummies expands the string columns into week_* / weather_*
# indicator columns and leaves the numeric temperature column untouched.
weekday_weather = train[["week", "temperature", "weather"]]
trainX = pd.get_dummies(weekday_weather)

# trainXの中身の確認

In [19]:
trainX.head()

Unnamed: 0,temperature,week_月,week_木,week_水,week_火,week_金,weather_快晴,weather_晴れ,weather_曇,weather_薄曇,weather_雨,weather_雪,weather_雷電
0,19.8,1,0,0,0,0,1,0,0,0,0,0,0
1,17.0,0,0,0,1,0,1,0,0,0,0,0,0
2,15.5,0,0,1,0,0,1,0,0,0,0,0,0
3,15.2,0,1,0,0,0,1,0,0,0,0,0,0
4,16.1,0,0,0,0,1,1,0,0,0,0,0,0


# trainのdatetimeから年と月のデータを取り出し、trainの新たな列として追加

In [20]:
# datetime holds "YYYY-M-D" strings. Split them on "-" once with the vectorized
# .str accessor instead of running a Python lambda per row; element 0 is the
# year and element 1 the month. Both new columns still contain strings here and
# are cast to integers in a later cell.
train_date_parts = train["datetime"].str.split("-")
train["year"] = train_date_parts.str[0]
train["month"] = train_date_parts.str[1]

# testのdatetimeから年と月のデータを取り出し、testの新たな列として追加

In [21]:
# Same extraction as for train: split the "YYYY-M-D" strings on "-" with the
# vectorized .str accessor (no per-row lambda); element 0 is the year and
# element 1 the month. The resulting columns are still strings at this point.
test_date_parts = test["datetime"].str.split("-")
test["year"] = test_date_parts.str[0]
test["month"] = test_date_parts.str[1]

# trainのデータの型を調べる

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 14 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
year             207 non-null object
month            207 non-null object
dtypes: float64(3), int64(2), object(9)
memory usage: 22.7+ KB


# train,testのyearとmonthの型を整数(int)に変換

In [23]:
# Cast the string year/month columns to integers so they can be used as numeric
# model features. The builtin int is used because the np.int alias (which was
# just another name for int) was deprecated in NumPy 1.20 and removed in 1.24,
# where it raises AttributeError.
train["year"] = train["year"].astype(int)
train["month"] = train["month"].astype(int)
test["year"] = test["year"].astype(int)
test["month"] = test["month"].astype(int)

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 14 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
year             207 non-null int64
month            207 non-null int64
dtypes: float64(3), int64(4), object(7)
memory usage: 22.7+ KB


# train,testからyearとmonthを取り出し、変数trainX,testXに代入

In [25]:
# Baseline feature matrices: only the calendar columns derived from datetime.
# One shared list keeps the column order identical for train and test.
date_cols = ["year", "month"]
trainX = train[date_cols]
testX = test[date_cols]

# trainXの中身の確認

In [26]:
trainX.head()

Unnamed: 0,year,month
0,2013,11
1,2013,11
2,2013,11
3,2013,11
4,2013,11


# trainからyを取り出し、変数yに代入

In [27]:
y = train["y"]

In [28]:
from sklearn.linear_model import LinearRegression as LR

In [29]:
model1 = LR()

In [30]:
model1.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [31]:
model1.coef_

array([-104.0107109 ,   -7.41004428])

# 追加する特徴量検討の為に、trainXに対する予測値を変数predに代入

In [32]:
pred = model1.predict(trainX)

# predをtrainの新たな列predとして代入

In [33]:
train["pred"] =pred

# trainのyとpredを引き算した結果をtrainの新たな列resとして代入

In [34]:
# Residual of the baseline model: actual y minus predicted value, so a negative
# res means the model over-predicted that row.
train["res"] = train["y"].sub(train["pred"])

In [35]:
train.sort_values(by = "res")

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,year,month,pred,res
27,2013-12-26,80,木,0,酢豚,,,,,曇,--,7.3,2013,12,131.674984,-51.674984
0,2013-11-18,90,月,0,厚切りイカフライ,,,,,快晴,--,19.8,2013,11,139.085028,-49.085028
81,2014-3-26,51,水,0,肉団子クリームシチュー,392.0,,,,曇,--,18.7,2014,3,94.354672,-43.354672
82,2014-3-27,55,木,1,ロース甘味噌焼き,315.0,,,,曇,1.5,12.4,2014,3,94.354672,-39.354672
1,2013-11-19,101,火,1,手作りヒレカツ,,,,,快晴,--,17.0,2013,11,139.085028,-38.085028
104,2014-4-28,54,月,0,タンドリーチキン,342.0,,,,晴れ,--,22.7,2014,4,86.944628,-32.944628
64,2014-2-28,69,金,0,手作りひれかつ,428.0,,,,晴れ,--,17.7,2014,2,101.764716,-32.764716
107,2014-5-2,47,金,0,鶏の天ぷら,420.0,,,,晴れ,--,23.3,2014,5,79.534583,-32.534583
102,2014-4-24,56,木,1,鶏の照り焼きマヨ,360.0,,,,晴れ,--,19.9,2014,4,86.944628,-30.944628
98,2014-4-18,56,金,1,鶏の味噌漬け焼き,325.0,,ママの会,,曇,0,11.4,2014,4,86.944628,-30.944628


In [36]:
train.head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,year,month,pred,res
0,2013-11-18,90,月,0,厚切りイカフライ,,,,,快晴,--,19.8,2013,11,139.085028,-49.085028
1,2013-11-19,101,火,1,手作りヒレカツ,,,,,快晴,--,17.0,2013,11,139.085028,-38.085028
2,2013-11-20,118,水,0,白身魚唐揚げ野菜あん,,,,,快晴,--,15.5,2013,11,139.085028,-21.085028
3,2013-11-21,120,木,1,若鶏ピリ辛焼,,,,,快晴,--,15.2,2013,11,139.085028,-19.085028
4,2013-11-22,130,金,1,ビッグメンチカツ,,,,,快晴,--,16.1,2013,11,139.085028,-9.085028


# 値が「お楽しみメニュー」であれば１、そうでなければ０とする自作関数を作る

In [37]:
def jisaku(x):
    """Return 1 if ``x`` is exactly the string "お楽しみメニュー", otherwise 0.

    Any other value yields 0, including missing values (NaN / None), because
    they never compare equal to the target string.
    """
    return 1 if x == "お楽しみメニュー" else 0

# jisaku関数とapply関数を使って、trainとtestの新たな列funを作る

In [38]:
# Build the binary fun column for both frames by running jisaku on every value
# of remarks; the function is passed directly, no wrapper lambda is needed.
train["fun"] = train["remarks"].apply(jisaku)
test["fun"] = test["remarks"].apply(jisaku)

# train,testからyearとmonth、funとtemperatureとweatherの5カラムを取り出し、変数trainX,testXに代入

In [39]:
# Candidate feature set that adds the raw weather column.
# NOTE: weather holds strings (e.g. '快晴'), so fitting LinearRegression on this
# selection fails with the ValueError shown below; the column would have to be
# encoded numerically (e.g. with pd.get_dummies) before it can be used.
trainX = train[["year", "month", "fun", "temperature", "weather"]]
testX = test[["year", "month", "fun", "temperature", "weather"]]

In [40]:
model2 = LR()

In [41]:
model2.fit(trainX, y)

ValueError: could not convert string to float: '快晴'

# train,testからyearとmonth、funとtemperatureとresの5カラムを取り出し、変数trainX,testXに代入

In [42]:
# Candidate feature set that adds the residual column res.
# NOTE: res was created on train only (train["y"] - train["pred"]), so the
# selection on test raises the KeyError shown below. res is also computed
# directly from the target y.
trainX = train[["year", "month", "fun", "temperature", "res"]]
testX = test[["year", "month", "fun", "temperature", "res"]]

KeyError: "['res'] not in index"

# 回答

## train,testからyearとmonth、funとtemperatureの4カラムを取り出し、変数trainX,testXに代入

In [43]:
# Final feature matrices: four numeric columns taken from both train and test.
# One shared list keeps the column order identical for fitting and prediction.
final_cols = ["year", "month", "fun", "temperature"]
trainX = train[final_cols]
testX = test[final_cols]

In [44]:
model2 = LR()

In [45]:
model2.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [46]:
model2.coef_ #傾き

array([-76.3765687 ,  -4.20489757,  44.63348171,  -1.25463854])

In [47]:
model2.intercept_ #切片

153945.2541175833

In [48]:
pred2 = model2.predict(testX)

In [49]:
# Store the test-set predictions in the column labelled 1 of sample
# (sample was read with header=None, so its columns have integer labels).
sample[1] = pred2

In [50]:
# Write the submission file without the row index and without a header row.
# index and header are boolean options, so False is passed explicitly instead
# of relying on None happening to be falsy.
sample.to_csv("submit5.csv", index=False, header=False)