In [2]:
# utf-8
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression as LR

# デｰタの読み込み

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample.csv", header = None)
#sample = pd.read_csv("sample.csv", header = None, delim_whitespace=True, decimal=',')


# データの型の確認

In [440]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 12 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
dtypes: float64(3), int64(2), object(7)
memory usage: 19.5+ KB


# 各列に欠損値が1つ以上あるか確認

In [441]:
train.isnull().any()

datetime         False
y                False
week             False
soldout          False
name             False
kcal              True
remarks           True
event             True
payday            True
weather          False
precipitation    False
temperature      False
dtype: bool

# 各列に欠損値が幾つあるか

In [442]:
train.isnull().sum()

datetime           0
y                  0
week               0
soldout            0
name               0
kcal              41
remarks          186
event            193
payday           197
weather            0
precipitation      0
temperature        0
dtype: int64

# 欠損値の処理

In [443]:
train.fillna(0)
#train["precipitation"] = train["precipitation"].replace("--", "0"))
#train["payday"] = train["payday"].fillna(0)
#train["precipitation"] = train["precipitation"].apply(lambda x : -1 if x == "--" else float(x))
#train["event"] = train["event"].fillna("なし")
#train["remarks"] = train["remarks"].fillna("なし")
#train["month"] = train["datetime"].apply(lambda x : int(x.split("-")[1]))                                          

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature
0,2013-11-18,90,月,0,厚切りイカフライ,0.0,0,0,0.0,快晴,--,19.8
1,2013-11-19,101,火,1,手作りヒレカツ,0.0,0,0,0.0,快晴,--,17.0
2,2013-11-20,118,水,0,白身魚唐揚げ野菜あん,0.0,0,0,0.0,快晴,--,15.5
3,2013-11-21,120,木,1,若鶏ピリ辛焼,0.0,0,0,0.0,快晴,--,15.2
4,2013-11-22,130,金,1,ビッグメンチカツ,0.0,0,0,0.0,快晴,--,16.1
5,2013-11-25,135,月,1,鶏の唐揚,0.0,0,0,0.0,曇,--,14.6
6,2013-11-26,145,火,0,豚のスタミナ炒め,0.0,0,0,0.0,快晴,--,17.9
7,2013-11-27,140,水,1,ボローニャ風カツ,0.0,0,0,0.0,晴れ,--,14.7
8,2013-11-28,151,木,0,ハンバーグ,0.0,0,0,0.0,薄曇,--,17.7
9,2013-11-29,116,金,0,タルタルinソーセージカツ,0.0,0,0,0.0,快晴,--,12.1


# trainのweekの各値がいくつあるか？

In [444]:
train["week"].value_counts()

木    43
水    43
火    41
金    41
月    39
Name: week, dtype: int64

# trainのweekにダミ－変数を代入

In [445]:
pd.get_dummies(train["week"]).head()

Unnamed: 0,月,木,水,火,金
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,0,0,1


# trainのweatrherにダミ－変数を代入

In [446]:
pd.get_dummies(train["weather"]).head()

Unnamed: 0,快晴,晴れ,曇,薄曇,雨,雪,雷電
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [447]:
pd.get_dummies(train["weather"]).tail()

Unnamed: 0,快晴,晴れ,曇,薄曇,雨,雪,雷電
202,0,0,1,0,0,0,0
203,0,0,1,0,0,0,0
204,0,1,0,0,0,0,0
205,1,0,0,0,0,0,0
206,1,0,0,0,0,0,0


# trainからweekとteperatureとweatherを抜き出し、ダミ－変数化したものを変数trainXに代入

In [448]:
trainX = pd.get_dummies(train[["week", "temperature", "weather"]])

# trainXの中身の確認

In [449]:
trainX.head()

Unnamed: 0,temperature,week_月,week_木,week_水,week_火,week_金,weather_快晴,weather_晴れ,weather_曇,weather_薄曇,weather_雨,weather_雪,weather_雷電
0,19.8,1,0,0,0,0,1,0,0,0,0,0,0
1,17.0,0,0,0,1,0,1,0,0,0,0,0,0
2,15.5,0,0,1,0,0,1,0,0,0,0,0,0
3,15.2,0,1,0,0,0,1,0,0,0,0,0,0
4,16.1,0,0,0,0,1,1,0,0,0,0,0,0


# trainのdatetimeから年と月のデ－タを取り出し、trainの新たな列として追加

In [450]:
train["year"] = train["datetime"].apply(lambda x :x.split("-")[0])
train["month"] = train["datetime"].apply(lambda x :x.split("-")[1])

# testのdatetimeから年と月のデ－タを取り出し、testの新たな列として追加

In [451]:
test["year"] = test["datetime"].apply(lambda x :x.split("-")[0])
test["month"] = test["datetime"].apply(lambda x :x.split("-")[1])

# trainのデ－タの型を調べる

In [452]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 14 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
year             207 non-null object
month            207 non-null object
dtypes: float64(3), int64(2), object(9)
memory usage: 22.7+ KB


# train,testのデ－タの型を整数(int)に変換

In [453]:
train["year"] = train["year"].astype(np.int)
train["month"] = train["month"].astype(np.int)
test["year"] =test["year"].astype(np.int)
test["month"] = test["month"].astype(np.int)


In [454]:
test.head()

Unnamed: 0,datetime,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,year,month
0,2014-10-1,水,1,メンチカツ,420.0,,,,雨,0,20.2,2014,10
1,2014-10-2,木,0,バーベキューチキン,415.0,,,,曇,--,23.9,2014,10
2,2014-10-3,金,0,豚肉のマスタード焼き,405.0,,,,晴れ,--,28.7,2014,10
3,2014-10-6,月,1,麻婆春雨,400.0,,,,雨,0.5,21.5,2014,10
4,2014-10-7,火,0,厚揚げ肉みそ炒め,430.0,,,,晴れ,--,22.1,2014,10


In [455]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 14 columns):
datetime         207 non-null object
y                207 non-null int64
week             207 non-null object
soldout          207 non-null int64
name             207 non-null object
kcal             166 non-null float64
remarks          21 non-null object
event            14 non-null object
payday           10 non-null float64
weather          207 non-null object
precipitation    207 non-null object
temperature      207 non-null float64
year             207 non-null int64
month            207 non-null int64
dtypes: float64(3), int64(4), object(7)
memory usage: 22.7+ KB


# train,testからyearとmonthを取り出し、変数trainX,testXに代入

In [456]:
trainX = train[["year", "month"]]
testX = test[["year", "month"]]

# trainXの中身の確認

In [457]:
trainX.head()

Unnamed: 0,year,month
0,2013,11
1,2013,11
2,2013,11
3,2013,11
4,2013,11


# trainからyを取り出し、変数yに代入

In [458]:
y = train["y"]

In [459]:
model1 = LR()

In [460]:
model1.fit(trainX, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [461]:
model1.coef_

array([-104.0107109 ,   -7.41004428])

# 追加する特徴量検討の為に、trainXに対する予測値を変数predに代入

In [462]:
pred = model1.predict(trainX)

# predをtrainの新たな列predとして代入

In [463]:
train["pred"] =pred

# trainのyとpredを引き算した結果をtrainの新たな列resとして代入

In [464]:
train["res"] = train["y"] - train["pred"]

In [465]:
train.sort_values(by = "res").head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,year,month,pred,res
27,2013-12-26,80,木,0,酢豚,,,,,曇,--,7.3,2013,12,131.674984,-51.674984
0,2013-11-18,90,月,0,厚切りイカフライ,,,,,快晴,--,19.8,2013,11,139.085028,-49.085028
81,2014-3-26,51,水,0,肉団子クリームシチュー,392.0,,,,曇,--,18.7,2014,3,94.354672,-43.354672
82,2014-3-27,55,木,1,ロース甘味噌焼き,315.0,,,,曇,1.5,12.4,2014,3,94.354672,-39.354672
1,2013-11-19,101,火,1,手作りヒレカツ,,,,,快晴,--,17.0,2013,11,139.085028,-38.085028


In [466]:
trainX = train[["year", "month", "temperature", "res"]]
testX = test[["year", "month", "temperature", "res"]]

KeyError: "['res'] not in index"

# 値が「お楽しみメニュー」であれば１、そうでなければ０とする自作関数を作る

In [467]:
def jisaku(x):
    if x == "お楽しみメニュー":
        return 1
    else:
        return 0    

# jisaku1関数とapply関数を使って、trainとtestの新たな列funを作る

In [468]:
train["fun"] = train["remarks"].apply(lambda x : jisaku(x))
test["fun"] = test["remarks"].apply(lambda x : jisaku(x))

In [469]:
train[train["remarks"]=="お楽しみメニュー"]

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,year,month,pred,res,fun
83,2014-3-28,106,金,0,キーマカレー,,お楽しみメニュー,,,快晴,--,18.5,2014,3,94.354672,11.645328,1
93,2014-4-11,128,金,1,チキンカレー,,お楽しみメニュー,,,快晴,--,16.5,2014,4,86.944628,41.055372,1
103,2014-4-25,80,金,0,中華丼,,お楽しみメニュー,,,晴れ,--,20.8,2014,4,86.944628,-6.944628,1
115,2014-5-16,126,金,0,ポークカレー,,お楽しみメニュー,ママの会,,快晴,--,23.8,2014,5,79.534583,46.465417,1
125,2014-5-30,119,金,0,チキンカレー,,お楽しみメニュー,,,薄曇,--,26.9,2014,5,79.534583,39.465417,1
135,2014-6-13,121,金,0,キーマカレー,,お楽しみメニュー,,,晴れ,--,29.5,2014,6,72.124539,48.875461,1
145,2014-6-27,74,金,0,牛丼,,お楽しみメニュー,,,雨,0,25.4,2014,6,72.124539,1.875461,1
155,2014-7-11,124,金,0,ポークカレー,,お楽しみメニュー,,,晴れ,--,33.9,2014,7,64.714495,59.285505,1
164,2014-7-25,83,金,0,ひやしたぬきうどん・炊き込みご飯,,お楽しみメニュー,,,晴れ,--,33.6,2014,7,64.714495,18.285505,1
174,2014-8-8,129,金,0,チキンカレー,,お楽しみメニュー,,1.0,曇,--,31.1,2014,8,57.30445,71.69555,1


In [470]:
# カレーが重要そうである

In [471]:
train["curry"] = train["name"].apply(lambda x : 1 if x.find("カレー")>=0 else 0)

# train,testからyearとmonth、funとtemperatureとweatherの5カラムを取り出し、変数trainX,testXに代入

In [478]:
pd.get_dummies(train["weather"]).head()

Unnamed: 0,快晴,晴れ,曇,薄曇,雨,雪,雷電
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [479]:
trainX = train[["year", "month", "temperature", "fun", "weather"]]
testX = test[["year", "month",  "temperature","fun", "weather"]]

In [480]:
from sklearn.linear_model import LinearRegression as LR

In [481]:
model7 = LR()

In [482]:
model7.fit(trainX, y)

ValueError: could not convert string to float: '快晴'

In [477]:
model7.coef_ #傾き

AttributeError: 'LinearRegression' object has no attribute 'coef_'

In [None]:
model7.intercept_ #切片

In [None]:
model7.intercept_ #切片

In [None]:
pred4 = model7.predict(testX)

In [None]:
sample[3] = pred4

In [None]:
sample.to_csv("submit11.csv", index = None, header = None)

# 回答

## train,testからyearとmonth、funとtemperatureの4カラムを取り出し、変数trainX,testXに代入

In [None]:
trainX = train[["year", "month", "fun", "temperature"]]
testX = test[["year", "month", "fun", "temperature"]]

In [None]:
model2 = LR()

In [None]:
model2.fit(trainX, y)

In [None]:
model2.coef_ #傾き

In [None]:
model2.intercept_ #切片

In [None]:
pred2 = model2.predict(testX)

In [None]:
sample[1] = pred2

In [None]:
sample.to_csv("submit5.csv", index = None, header = None)

# 回答２

In [None]:
# train,testからyearとmonthとtemperatureの３カラムを取り出し、変数trainX,testXに代入

In [None]:
trainX = train[["year", "month",  "temperature"]]
testX = test[["year", "month", "temperature"]]

In [None]:
model3 = LR()

In [None]:
model3.fit(trainX, y)

In [None]:
model3.coef_ #傾き

In [None]:
model3.intercept_ #切片

In [None]:
pred2 = model3.predict(testX)

In [None]:
sample[1] = pred2

In [None]:
sample.to_csv("submit6.csv", index = None, header = None)

# 回答

In [None]:
trainX = train[["year", "month",  "temperature", "week"]]
testX = test[["year", "month", "temperature","week"]]

In [None]:
model3 = LR()

In [None]:
model3.fit(trainX, y)

In [None]:
model3.coef_ #傾き

In [None]:
model3.intercept_ #切片

In [None]:
pred2 = model3.predict(testX)

In [None]:
sample[1] = pred2

# 回答３

In [None]:
#train["月”] = train["week"].apply(lambda x : x.split(","))

In [None]:
def monday(x):
    if x == "月":
        return 1
    else:
        return 0    

In [None]:
# 引数をxとして関数mondayを定義する。月曜日があったら１、無かったら０を返す

In [None]:
train["月"] = train["remarks"].apply(lambda x : monday(x))
test["月"] = test["remarks"].apply(lambda x : monday(x))

In [None]:
#  apply関数　＝　各値に数式や関数を適用するための関数
# lambda 　引数：処理内容
# カラム"remarks"にapply関数を使ってmonday関数を適用する。train,testの新たなカラム「月」を作る。
# monday関数を適用しているのでカラム「月」に、月曜日なら１がたつ

In [None]:
trainX = train[["year", "month",  "temperature", "月"]]
testX = test[["year", "month", "temperature", "月"]]

In [None]:
# train,testからyear,month,temperature,月の４カラムを取り出し変数trainX,testXに代入

In [None]:
model4 = LR()

In [None]:
# 線形回帰(LR)を変数model4とおく
# LR ＝　LinearRegression

In [None]:
model4.fit(trainX, y)

In [None]:
# 説明変数trainXと目的変数y（販売数）で線形モデルの重みを学習し、線形回帰モデルを構築する
# 目的変数　＝　予測する対象
# 説明変数　＝　目的を予測するためのデータ

In [None]:
model4.coef_ #傾き

In [None]:
# 作ったモデルの傾きを求める

In [None]:
model4.intercept_ #切片

In [None]:
# 作ったモデル（回帰直線）の切片を求める

In [None]:
pred2 = model4.predict(testX)

In [None]:
# 線形モデルから目的変数（販売数）を予測

In [None]:
sample[1] = pred2

In [None]:
# pred2をsample[1]に代入

In [None]:
sample.to_csv("submit7.csv", index = None, header = None)

In [None]:
# 答え（CSVファイル）を作成

# 回答４

In [None]:
def hare(x):
    if x == "晴":
        return 1
    else:
        return 0 

In [None]:
# 引数をxとして関数mondayを定義する。晴があったら１、無かったら０を返す

In [None]:
train["晴"] = train["remarks"].apply(lambda x : hare(x))
test["晴"] = test["remarks"].apply(lambda x : hare(x))

In [None]:
# カラム"remarks"にapply関数を使ってhare関数を適用する。train,testの新たなカラム「晴」を作る。
# hare関数を適用しているのでカラム「晴」に、晴なら１がたつ

In [None]:
trainX = train[["year", "month",  "temperature", "晴"]]
testX = test[["year", "month", "temperature", "晴"]]

In [None]:
# train,testからyear,month,temperature,晴の４カラムを取り出し変数trainX,testXに代入

In [None]:
model5 = LR()

In [None]:
# 線形回帰(LR)を変数model５とおく

In [None]:
model5.fit(trainX, y)

In [None]:
# 説明変数trainXと目的変数y（販売数）で線形モデルの重みを学習し、線形回帰モデルを構築する

In [None]:
model5.coef_ #傾き

In [None]:
# 作ったモデルの傾きを求める

In [None]:
model5.intercept_ #切片

In [None]:
# 作ったモデル（回帰直線）の切片を求める

# 回答５

In [None]:
def monday(x):
    if x == "月":
        return 1
    else:
        return 0

In [None]:
# 引数をxとして関数mondayを定義する。月曜日があったら１、無かったら０を返す

In [None]:
train["月"] = train["week"].apply(lambda x : monday(x))
test["月"] = test["week"].apply(lambda x : monday(x))

In [None]:
# カラム"remarks"にapply関数を使ってweek関数を適用する。train,testの新たなカラム「月」を作る。
# week関数を適用しているのでカラム「月」に、月曜日なら１がたつ

In [None]:
trainX = train[["year", "month", "temperature","月"]]
testX = test[["year", "month", "temperature","月"]]

In [None]:
# train,testからyear,month,temperature,月の４カラムを取り出し変数trainX,testXに代入

In [None]:
model6 = LR()

In [None]:
# 線形回帰(LR)を変数model５とおく

In [None]:
model6.fit(trainX, y)

In [None]:
# 説明変数trainXと目的変数y（販売数）で線形モデルの重みを学習し、線形回帰モデルを構築する

In [None]:
model6.coef_ #傾き

In [None]:
# 作ったモデルの傾きを求める

In [None]:
model6.intercept_ #切片

In [None]:
# 作ったモデル（回帰直線）の切片を求める

In [None]:
pred2 = model6.predict(testX)

In [None]:
# 線形モデルから目的変数（販売数）を予測

In [None]:
sample[1] = pred2

In [None]:
# pred2をsample[1]に代入

In [None]:
sample.head()

In [None]:
sample.to_csv("submit9.csv", index = None, header = None)

In [None]:
# 答え（CSVファイル）を作成