In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
df_train = pd.read_csv('train.csv', delimiter='\t')
df_test = pd.read_csv('test.csv', delimiter='\t')

In [3]:
df_train.head()

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,0,29.0,4,135.0,84.0,2525.0,16.0,82,1,dodge aries se
1,3,31.9,4,89.0,71.0,1925.0,14.0,79,2,vw rabbit custom
2,9,19.0,6,156.0,108.0,2930.0,15.5,76,3,toyota mark ii
3,11,28.0,4,90.0,75.0,2125.0,14.5,74,1,dodge colt
4,13,37.7,4,89.0,62.0,2050.0,17.3,81,3,toyota tercel


In [4]:
df_test.head()

Unnamed: 0,id,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,1,6,145.0,76.0,3160.0,19.6,81,2,volvo diesel
1,2,6,250.0,98.0,3525.0,19.0,77,1,ford granada
2,4,4,119.0,92.0,2434.0,15.0,80,3,datsun 510 hatchback
3,5,6,258.0,110.0,2962.0,13.5,71,1,amc hornet sportabout (sw)
4,6,4,97.0,88.0,2100.0,16.5,72,3,toyota corolla 1600 (sw)


In [5]:
print(df_train.isnull().sum())
print(df_test.isnull().sum())
print(df_train.shape)
print(df_test.shape)
print(df_train.info())
print(df_test.info())

id              0
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64
id              0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64
(199, 10)
(199, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 10 columns):
id              199 non-null int64
mpg             199 non-null float64
cylinders       199 non-null int64
displacement    199 non-null float64
horsepower      199 non-null object
weight          199 non-null float64
acceleration    199 non-null float64
model year      199 non-null int64
origin          199 non-null int64
car name        199 non-null object
dtypes: float64(4), int64(4), object(2)
memory usage: 15.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data col

In [6]:
# car nameの種類を確認
df_train['car name'].value_counts()

ford pinto                         4
chevrolet impala                   4
amc hornet                         3
amc gremlin                        3
chevrolet caprice classic          3
volkswagen dasher                  3
chevrolet nova                     2
chevrolet chevette                 2
toyota corolla 1200                2
toyota corona                      2
chevrolet vega                     2
dodge colt                         2
opel 1900                          2
ford galaxie 500                   2
buick century                      2
chevrolet chevelle malibu          2
vw rabbit                          2
toyota mark ii                     2
plymouth reliant                   2
plymouth fury iii                  2
datsun 210                         2
chevrolet citation                 2
dodge aspen                        2
amc matador                        2
dodge st. regis                    1
oldsmobile omega                   1
toyota starlet                     1
a

In [7]:
# car name =167種類あるのでダミー変数には置き換えることが難しい。文字列型のため削除することにする。
# drop()を使うとき、axis=1を忘れやすいので注意
df_train1 = df_train.drop(['car name'], axis=1)
df_test1 = df_test.drop(['car name'], axis=1)

In [8]:
df_train1.head()

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,0,29.0,4,135.0,84.0,2525.0,16.0,82,1
1,3,31.9,4,89.0,71.0,1925.0,14.0,79,2
2,9,19.0,6,156.0,108.0,2930.0,15.5,76,3
3,11,28.0,4,90.0,75.0,2125.0,14.5,74,1
4,13,37.7,4,89.0,62.0,2050.0,17.3,81,3


In [9]:
df_test1.head()

Unnamed: 0,id,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,1,6,145.0,76.0,3160.0,19.6,81,2
1,2,6,250.0,98.0,3525.0,19.0,77,1
2,4,4,119.0,92.0,2434.0,15.0,80,3
3,5,6,258.0,110.0,2962.0,13.5,71,1
4,6,4,97.0,88.0,2100.0,16.5,72,3


In [10]:
df_train1.corr()

Unnamed: 0,id,mpg,cylinders,displacement,weight,acceleration,model year,origin
id,1.0,-0.052688,0.103419,0.098416,0.070563,-0.087649,-0.093272,-0.031421
mpg,-0.052688,1.0,-0.77016,-0.804635,-0.82066,0.379721,0.568471,0.533671
cylinders,0.103419,-0.77016,1.0,0.9506,0.893256,-0.479561,-0.303462,-0.497373
displacement,0.098416,-0.804635,0.9506,1.0,0.933038,-0.523955,-0.329817,-0.579805
weight,0.070563,-0.82066,0.893256,0.933038,1.0,-0.401757,-0.265562,-0.535893
acceleration,-0.087649,0.379721,-0.479561,-0.523955,-0.401757,1.0,0.194854,0.189193
model year,-0.093272,0.568471,-0.303462,-0.329817,-0.265562,0.194854,1.0,0.124279
origin,-0.031421,0.533671,-0.497373,-0.579805,-0.535893,0.189193,0.124279,1.0


In [11]:
# mpgと相関が高いのは順にweight、displacement、cylinders、この３つが相関係数0.77以上　
# 相関係数=0.5以上のmodel year、originも特徴量として選択
# 前回特徴量を2つ増やして精度が上がったので今回はaccelerationも特徴量として選択

In [12]:
trainX = df_train1[['cylinders', 'displacement', 'weight', 'model year', 'origin', 'acceleration']]

In [13]:
trainY = df_train1['mpg']

In [14]:
# モデルはランダムフォレストを選択、インスタンスの作成
# 今回はn_estimators = 80 に
# model = RandomForestClassifier(n_estimators = 80, random_state=0)
model = RandomForestRegressor()

In [15]:
#学習
model.fit(trainX, trainY)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [None]:
# Unknown label type: 'continuous' というエラー
# 分類器はラベルをYに取ります。
# ラベルは整数である必要があります。
# エラーは連続変数がYに渡されたので、分類できないことを知らせています。
# ClassifierではなくRegressorにするか、連続変数を整数に射影して使ってください。
# https://teratail.com/questions/108034

In [16]:
testX = df_test1[['cylinders', 'displacement', 'weight', 'model year', 'origin', 'acceleration']]

In [17]:
testY = model.predict(testX)

In [18]:
output = pd.read_csv('test.csv', delimiter='\t')

In [19]:
output['mpg'] = testY

In [20]:
output[['id', 'mpg']] .to_csv('third_submission.csv', header=False, index=False)

In [None]:
# RMSE = 2.8737350723982296