In [103]:
# utf-8
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# デｰタの読み込み

In [104]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample.csv", header = None, delim_whitespace=True, decimal=',')

# trainデータとtestデータを結合

In [105]:
# フラグを立てる
train["flg"] = 1
test["flg"] = 0
# trainデータとtestデータを結合
all_data = pd.concat([train,test], axis = 0, sort = False)

# データの型の確認

In [106]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247 entries, 0 to 39
Data columns (total 13 columns):
datetime         247 non-null object
y                207 non-null float64
week             247 non-null object
soldout          247 non-null int64
name             247 non-null object
kcal             202 non-null float64
remarks          28 non-null object
event            17 non-null object
payday           12 non-null float64
weather          247 non-null object
precipitation    247 non-null object
temperature      247 non-null float64
flg              247 non-null int64
dtypes: float64(4), int64(2), object(7)
memory usage: 27.0+ KB


In [107]:
# 各カラムの統計量を見る

In [108]:
all_data.describe()

Unnamed: 0,y,soldout,kcal,payday,temperature,flg
count,207.0,247.0,202.0,12.0,247.0,247.0
mean,86.623188,0.445344,407.381188,1.0,19.157085,0.838057
std,32.882448,0.498013,28.396942,0.0,8.07568,0.369147
min,29.0,0.0,315.0,1.0,1.2,0.0
25%,57.0,0.0,395.0,1.0,13.65,1.0
50%,78.0,0.0,412.0,1.0,19.4,1.0
75%,113.0,1.0,427.0,1.0,25.45,1.0
max,171.0,1.0,462.0,1.0,34.6,1.0


# 各列に欠損値が幾つあるか

In [109]:
train.isnull().sum()

datetime           0
y                  0
week               0
soldout            0
name               0
kcal              41
remarks          186
event            193
payday           197
weather            0
precipitation      0
temperature        0
flg                0
dtype: int64

In [110]:
test.isnull().sum()

datetime          0
week              0
soldout           0
name              0
kcal              4
remarks          33
event            37
payday           38
weather           0
precipitation     0
temperature       0
flg               0
dtype: int64

# 欠損値の処理

In [111]:
# kcal

In [112]:
all_data["kcal"].describe()

count    202.000000
mean     407.381188
std       28.396942
min      315.000000
25%      395.000000
50%      412.000000
75%      427.000000
max      462.000000
Name: kcal, dtype: float64

In [113]:
all_data["kcal"] = all_data["kcal"].fillna(train["kcal"].mean()) # 大きくはずれた値がないので平均値で埋める
all_data["kcal"].head()

0    404.409639
1    404.409639
2    404.409639
3    404.409639
4    404.409639
Name: kcal, dtype: float64

In [114]:
# 給料日 #給料日は1日なので、nanは0で埋める

In [115]:
all_data["payday"] = all_data["payday"].fillna(0)
all_data["payday"].unique()

array([0., 1.])

In [116]:
# event

In [117]:
all_data["event"] = all_data["event"].fillna("なし")
all_data["event"].unique()

array(['なし', 'ママの会', 'キャリアアップ支援セミナー'], dtype=object)

In [118]:
# remark #お楽しみメニューでないときは"なし"で埋める

In [119]:
all_data["remarks"] = all_data["remarks"].fillna("なし")
all_data["remarks"].unique()

array(['なし', '鶏のレモンペッパー焼（50食）、カレー（42食）', '酢豚（28食）、カレー（85食）', 'お楽しみメニュー',
       '料理長のこだわりメニュー', '手作りの味', 'スペシャルメニュー（800円）', '近隣に飲食店複合ビルオープン'],
      dtype=object)

In [120]:
# 欠損が埋まったか確認

In [121]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247 entries, 0 to 39
Data columns (total 13 columns):
datetime         247 non-null object
y                207 non-null float64
week             247 non-null object
soldout          247 non-null int64
name             247 non-null object
kcal             247 non-null float64
remarks          247 non-null object
event            247 non-null object
payday           247 non-null float64
weather          247 non-null object
precipitation    247 non-null object
temperature      247 non-null float64
flg              247 non-null int64
dtypes: float64(4), int64(2), object(7)
memory usage: 27.0+ KB


# trainのweekの各値がいくつあるか？

In [122]:
train["week"].value_counts()

木    43
水    43
金    41
火    41
月    39
Name: week, dtype: int64

# trainのweekにダミ－変数を代入

In [123]:
pd.get_dummies(train["week"]).head()

Unnamed: 0,月,木,水,火,金
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,0,0,1


# trainのweatrherにダミ－変数を代入

In [124]:
pd.get_dummies(train["weather"]).head()

Unnamed: 0,快晴,晴れ,曇,薄曇,雨,雪,雷電
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [125]:
pd.get_dummies(train["weather"]).tail()

Unnamed: 0,快晴,晴れ,曇,薄曇,雨,雪,雷電
202,0,0,1,0,0,0,0
203,0,0,1,0,0,0,0
204,0,1,0,0,0,0,0
205,1,0,0,0,0,0,0
206,1,0,0,0,0,0,0


# trainからweekとteperatureとweatherを抜き出し、ダミ－変数化したものを変数trainXに代入

In [126]:
trainX = pd.get_dummies(train[["week", "temperature", "weather"]])

# trainXの中身の確認

In [127]:
trainX.head()

Unnamed: 0,temperature,week_月,week_木,week_水,week_火,week_金,weather_快晴,weather_晴れ,weather_曇,weather_薄曇,weather_雨,weather_雪,weather_雷電
0,19.8,1,0,0,0,0,1,0,0,0,0,0,0
1,17.0,0,0,0,1,0,1,0,0,0,0,0,0
2,15.5,0,0,1,0,0,1,0,0,0,0,0,0
3,15.2,0,1,0,0,0,1,0,0,0,0,0,0
4,16.1,0,0,0,0,1,1,0,0,0,0,0,0


# trainのdatetimeから年と月のデ－タを取り出し、trainの新たな列として追加

In [128]:
train["year"] = train["datetime"].apply(lambda x :x.split("-")[0])
train["month"] = train["datetime"].apply(lambda x :x.split("-")[1])

# testのdatetimeから年と月のデ－タを取り出し、testの新たな列として追加

In [129]:
test["year"] = test["datetime"].apply(lambda x :x.split("-")[0])
test["month"] = test["datetime"].apply(lambda x :x.split("-")[1])

# trainのデ－タの型を調べる

In [130]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 15 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
flg              207 non-null int64
year             207 non-null object
month            207 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 24.3+ KB


# train,testのデ－タの型を整数(int)に変換

In [131]:
train["year"] = train["year"].astype(np.int)
train["month"] = train["month"].astype(np.int)
test["year"] =test["year"].astype(np.int)
test["month"] = test["month"].astype(np.int)


In [132]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 15 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
flg              207 non-null int64
year             207 non-null int64
month            207 non-null int64
dtypes: float64(3), int64(5), object(7)
memory usage: 24.3+ KB


# train,testからyearとmonthを取り出し、変数trainX,testXに代入

In [133]:
trainX = train[["year", "month"]]
testX = test[["year", "month"]]

# trainXの中身の確認

In [134]:
trainX.head()

Unnamed: 0,year,month
0,2013,11
1,2013,11
2,2013,11
3,2013,11
4,2013,11


# trainからyを取り出し、変数yに代入

In [135]:
y = train["y"]

In [136]:
from sklearn.linear_model import LinearRegression as LR

In [137]:
model1 = LR()

In [138]:
model1.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [139]:
model1.coef_

array([-104.0107109 ,   -7.41004428])

# 追加する特徴量検討の為に、trainXに対する予測値を変数predに代入

In [140]:
pred = model1.predict(trainX)

# predをtrainの新たな列predとして代入

In [141]:
train["pred"] =pred

# trainのyとpredを引き算した結果をtrainの新たな列resとして代入

In [142]:
train["res"] = train["y"] - train["pred"]

In [143]:
train.sort_values(by = "res").head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,flg,year,month,pred,res
27,2013-12-26,80,木,0,酢豚,,,,,曇,--,7.3,1,2013,12,131.674984,-51.674984
0,2013-11-18,90,月,0,厚切りイカフライ,,,,,快晴,--,19.8,1,2013,11,139.085028,-49.085028
81,2014-3-26,51,水,0,肉団子クリームシチュー,392.0,,,,曇,--,18.7,1,2014,3,94.354672,-43.354672
82,2014-3-27,55,木,1,ロース甘味噌焼き,315.0,,,,曇,1.5,12.4,1,2014,3,94.354672,-39.354672
1,2013-11-19,101,火,1,手作りヒレカツ,,,,,快晴,--,17.0,1,2013,11,139.085028,-38.085028


In [144]:
train.head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,flg,year,month,pred,res
0,2013-11-18,90,月,0,厚切りイカフライ,,,,,快晴,--,19.8,1,2013,11,139.085028,-49.085028
1,2013-11-19,101,火,1,手作りヒレカツ,,,,,快晴,--,17.0,1,2013,11,139.085028,-38.085028
2,2013-11-20,118,水,0,白身魚唐揚げ野菜あん,,,,,快晴,--,15.5,1,2013,11,139.085028,-21.085028
3,2013-11-21,120,木,1,若鶏ピリ辛焼,,,,,快晴,--,15.2,1,2013,11,139.085028,-19.085028
4,2013-11-22,130,金,1,ビッグメンチカツ,,,,,快晴,--,16.1,1,2013,11,139.085028,-9.085028


In [145]:
from sklearn.linear_model import LinearRegression as LR

In [146]:
trainX = train[["year", "month", "temperature", "res"]]
testX = test[["year", "month", "temperature", "res"]]

KeyError: "['res'] not in index"

# 値が「お楽しみメニュー」であれば１、そうでなければ０とする自作関数を作る

In [147]:
def jisaku(x):
    if x == "お楽しみメニュー":
        return 1
    else:
        return 0    

# jisaku1関数とapply関数を使って、trainとtestの新たな列funを作る

In [148]:
train["fun"] = train["remarks"].apply(lambda x : jisaku(x))
test["fun"] = test["remarks"].apply(lambda x : jisaku(x))

In [149]:
train[train["remarks"]=="お楽しみメニュー"].head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,flg,year,month,pred,res,fun
83,2014-3-28,106,金,0,キーマカレー,,お楽しみメニュー,,,快晴,--,18.5,1,2014,3,94.354672,11.645328,1
93,2014-4-11,128,金,1,チキンカレー,,お楽しみメニュー,,,快晴,--,16.5,1,2014,4,86.944628,41.055372,1
103,2014-4-25,80,金,0,中華丼,,お楽しみメニュー,,,晴れ,--,20.8,1,2014,4,86.944628,-6.944628,1
115,2014-5-16,126,金,0,ポークカレー,,お楽しみメニュー,ママの会,,快晴,--,23.8,1,2014,5,79.534583,46.465417,1
125,2014-5-30,119,金,0,チキンカレー,,お楽しみメニュー,,,薄曇,--,26.9,1,2014,5,79.534583,39.465417,1


In [150]:
train["curry"] = train["name"].apply(lambda x : 1 if x.find("カレー")>=0 else 0)

# train,testからyearとmonth、temperatureとfunの4カラムを取り出し、変数trainX,testXに代入

In [151]:
trainX = train[["year", "month", "temperature", "fun"]]
testX = test[["year", "month", "fun", "temperature"]]

In [152]:
from sklearn.linear_model import LinearRegression as LR

In [153]:
model3 = LR()

In [154]:
model3.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [155]:
model3.coef_ #傾き

array([-76.3765687 ,  -4.20489757,  -1.25463854,  44.63348171])

In [156]:
model3.intercept_ #切片

153945.25411758374

In [157]:
pred3 = model3.predict(testX)

# train,testからyearとmonth、funとtemperatureとresの5カラムを取り出し、変数trainX,testXに代入

In [158]:
train["res"] = res

NameError: name 'res' is not defined

In [159]:
trainX = train[["year", "month", "fun", "temperature", "res"]]
testX = test[["year", "month", "fun", "temperature", "res"]]

KeyError: "['res'] not in index"

In [160]:
train.corr()

Unnamed: 0,y,soldout,kcal,payday,temperature,flg,year,month,pred,res,fun,curry
y,1.0,0.098308,0.147269,,-0.655332,,-0.5750986,0.01043481,0.7848262,0.619716,0.167332,0.223895
soldout,0.098308,1.0,0.048633,,-0.156752,,-0.01193493,-0.1028842,0.1132355,0.01522918,-0.14094,-0.093303
kcal,0.147269,0.048633,1.0,,-0.046191,,,-0.04786353,0.04786353,0.182769,,0.035995
payday,,,,,,,,,,,,
temperature,-0.655332,-0.156752,-0.046191,,1.0,,0.301331,0.3705917,-0.764838,-0.08885892,0.215071,0.02563
flg,,,,,,,,,,,,
year,-0.575099,-0.011935,,,0.301331,,1.0,-0.6901569,-0.7327719,-1.557998e-13,0.098113,0.118304
month,0.010435,-0.102884,-0.047864,,0.370592,,-0.6901569,1.0,0.0132957,1.418271e-13,0.004402,-0.134725
pred,0.784826,0.113235,0.047864,,-0.764838,,-0.7327719,0.0132957,1.0,1.116817e-13,-0.139706,-0.036781
res,0.619716,0.015229,0.182769,,-0.088859,,-1.557998e-13,1.418271e-13,1.116817e-13,1.0,0.446942,0.407867


# 回答

## train,testからyearとmonth、funとtemperatureの4カラムを取り出し、変数trainX,testXに代入

In [161]:
trainX = train[["year", "month", "fun", "temperature"]]
testX = test[["year", "month", "fun", "temperature"]]

In [162]:
model2 = LR()

In [163]:
model2.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [164]:
model2.coef_ #傾き

array([-76.3765687 ,  -4.20489757,  44.63348171,  -1.25463854])

In [165]:
model2.intercept_ #切片

153945.2541175833

In [166]:
pred2 = model2.predict(testX)

In [167]:
sample[1] = pred2

In [168]:
sample.to_csv("submit5.csv", index = None, header = None)

# 回答２

In [169]:
# train,testからyearとmonthとtemperatureの３カラムを取り出し、変数trainX,testXに代入

In [170]:
trainX = train[["year", "month",  "temperature"]]
testX = test[["year", "month", "temperature"]]

In [171]:
model3 = LR()

In [172]:
model3.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [173]:
model3.coef_ #傾き

array([-83.20224329,  -5.14775581,  -0.77874006])

In [174]:
model3.intercept_ #切片

167690.2673254738

In [175]:
pred2 = model3.predict(testX)

In [176]:
sample[1] = pred2

In [177]:
sample.to_csv("submit6.csv", index = None, header = None)

# 回答３

In [178]:
# trainのweatherから月曜日を選ぶ

In [179]:
def monday(x):
    if x == "月":
        return 1
    else:
        return 0    

In [180]:
train["月"] = train["remarks"].apply(lambda x : monday(x))
test["月"] = test["remarks"].apply(lambda x : monday(x))

In [181]:
trainX = train[["year", "month",  "temperature", "月"]]
testX = test[["year", "month", "temperature", "月"]]

In [182]:
model4 = LR()

In [183]:
model4.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [184]:
model4.coef_ #傾き

array([-83.20224329,  -5.14775581,  -0.77874006,   0.        ])

In [185]:
model4.intercept_ #切片

167690.26732547366

In [186]:
pred2 = model4.predict(testX)

In [187]:
sample[1] = pred2

In [188]:
sample.to_csv("submit7.csv", index = None, header = None)

# 回答４

In [189]:
def kumori(x):
    if x == "晴天":
        return 1
    else:
        return 0  

In [190]:
train["晴天"] = train["remarks"].apply(lambda x : kumori(x))
test["晴天"] = test["remarks"].apply(lambda x : kumori(x))

In [191]:
trainX = train[["year", "month",  "temperature", "晴天"]]
testX = test[["year", "month", "temperature", "晴天"]]

In [192]:
model5 = LR()

In [193]:
model5.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [194]:
model5.coef_ #傾き

array([-83.20224329,  -5.14775581,  -0.77874006,   0.        ])

In [195]:
model5.intercept_ #切片

167690.26732547366

In [196]:
trainX = train[["year", "month","temperature", "curry"]]
testX = test[["year", "month", "temperature", "curry"]]

KeyError: "['curry'] not in index"

In [197]:
trainX = train[["year", "month",  "temperature", "week"]]
testX = test[["year", "month", "temperature", "week"]]

In [198]:
model6 = LR()

In [199]:
model6.fit(trainX, y)

ValueError: could not convert string to float: '火'

In [200]:
model6.coef_ #傾き

AttributeError: 'LinearRegression' object has no attribute 'coef_'

In [201]:
model6.intercept_ #切片

AttributeError: 'LinearRegression' object has no attribute 'intercept_'

In [202]:
pred2 = model6.predict(testX)

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [203]:
sample[1] = pred2

In [None]:
sample.to_csv("submit7.csv", index = None, header = None)