In [280]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.model_selection import cross_val_score
import lightgbm as LGBM

In [281]:
class Preprocess():
    """Fill missing ``horsepower`` values using a k-nearest-neighbours regressor.

    The regressor is fitted once, in ``__init__``, on the rows of the given
    frame that contain no missing value.  ``pre1`` then uses it to fill the
    ``horsepower`` entries that are missing (``"?"`` or NaN) in any frame that
    has the same feature columns.
    """

    # Columns fed to the regressor, and the column it predicts.
    FEATURES = ["cylinders", "displacement", "weight", "acceleration", "model year", "origin"]
    TARGET = "horsepower"

    def __init__(self, data):
        """Fit the imputation model on the complete rows of ``data``.

        Args:
            data: DataFrame containing ``FEATURES`` and ``TARGET``; missing
                values may be written as ``"?"`` or NaN.
        """
        data = self.pre0(data)
        x = data.loc[:, self.FEATURES]
        y = data.loc[:, self.TARGET]

        self.model = KNeighborsRegressor(n_neighbors=3).fit(x, y)

    def pick_na_line(self, data):
        """Return the rows of ``data`` whose ``horsepower`` is missing.

        ``"?"`` is treated as missing.  The returned frame is built from a
        copy in which every ``"?"`` has been replaced by NaN; ``data`` itself
        is not modified.
        """
        na_data = data.replace({"?": np.nan})
        return na_data[na_data[self.TARGET].isnull()]

    def pre0(self, data):
        """Return a new frame with only the complete rows and float ``horsepower``.

        Rows holding ``"?"`` or NaN in any column are dropped.  ``data`` itself
        is not modified.
        """
        complete = data.replace({"?": np.nan}).dropna()
        # assign() builds a new frame instead of writing into the dropna()
        # result, which avoids pandas' SettingWithCopyWarning.
        return complete.assign(**{self.TARGET: complete[self.TARGET].astype("float")})

    def pre1(self, data):
        """Fill the missing ``horsepower`` values of ``data`` with model predictions.

        ``data`` is modified in place (its ``horsepower`` column is replaced by
        a float column) and the same object is returned.  Rows are matched by
        index label, so the index is assumed to hold unique labels.

        Args:
            data: DataFrame containing ``FEATURES`` and ``TARGET``.

        Returns:
            ``data``, with a float ``horsepower`` column.
        """
        na_data = self.pick_na_line(data)
        horsepower = data[self.TARGET].replace({"?": np.nan}).astype("float")
        # Only predict when something is missing: scikit-learn estimators
        # raise on a zero-row input.
        if not na_data.empty:
            x = na_data.loc[:, self.FEATURES]
            horsepower.loc[na_data.index] = self.model.predict(x)
        data[self.TARGET] = horsepower
        return data

# Read the tab-separated training and test sets.
train, test = (pd.read_csv(path, delimiter='\t') for path in ("train.tsv", "test.tsv"))

# The imputation model is fitted on the training frame only and then applied
# to both frames, training set first.
preprocess = Preprocess(train)
train, test = (preprocess.pre1(frame) for frame in (train, test))

In [283]:
# Inspect both frames and their distinct car names before deriving the new column.
display(train, test)
display(set(train["car name"]), set(test["car name"]))


def _first_token(name):
    """Return the first whitespace-separated word of ``name``."""
    return name.split()[0]


# "company" holds the first word of each car name.
train["company"] = train["car name"].apply(_first_token)
test["company"] = test["car name"].apply(_first_token)

# NOTE(review): this shows the car-name set again, not the new "company"
# column -- confirm which one was intended.
display(set(test["car name"]))

display(train, test)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,company
0,0,29.0,4,135.0,84.0,2525.0,16.0,82,1,dodge aries se,dodge
1,3,31.9,4,89.0,71.0,1925.0,14.0,79,2,vw rabbit custom,vw
2,9,19.0,6,156.0,108.0,2930.0,15.5,76,3,toyota mark ii,toyota
3,11,28.0,4,90.0,75.0,2125.0,14.5,74,1,dodge colt,dodge
4,13,37.7,4,89.0,62.0,2050.0,17.3,81,3,toyota tercel,toyota
...,...,...,...,...,...,...,...,...,...,...,...
194,384,40.8,4,85.0,65.0,2110.0,19.2,80,3,datsun 210,datsun
195,385,20.2,8,302.0,139.0,3570.0,12.8,78,1,mercury monarch ghia,mercury
196,387,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,amc
197,395,43.4,4,90.0,48.0,2335.0,23.7,80,2,vw dasher (diesel),vw


Unnamed: 0,id,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,company
0,1,6,145.0,76.0,3160.0,19.6,81,2,volvo diesel,volvo
1,2,6,250.0,98.0,3525.0,19.0,77,1,ford granada,ford
2,4,4,119.0,92.0,2434.0,15.0,80,3,datsun 510 hatchback,datsun
3,5,6,258.0,110.0,2962.0,13.5,71,1,amc hornet sportabout (sw),amc
4,6,4,97.0,88.0,2100.0,16.5,72,3,toyota corolla 1600 (sw),toyota
...,...,...,...,...,...,...,...,...,...,...
194,391,4,114.0,91.0,2582.0,14.0,73,2,audi 100ls,audi
195,392,4,156.0,105.0,2800.0,14.4,80,1,dodge colt,dodge
196,393,4,111.0,80.0,2155.0,14.8,77,1,buick opel isuzu deluxe,buick
197,394,8,400.0,180.0,4220.0,11.1,77,1,pontiac grand prix lj,pontiac


{'amc ambassador brougham',
 'amc ambassador dpl',
 'amc concord d/l',
 'amc concord dl',
 'amc gremlin',
 'amc hornet',
 'amc matador',
 'amc pacer',
 'amc pacer d/l',
 'amc rebel sst',
 'amc spirit dl',
 'audi 4000',
 'audi 5000s (diesel)',
 'bmw 2002',
 'bmw 320i',
 'buick century',
 'buick century limited',
 'buick century special',
 'buick lesabre custom',
 'buick regal sport coupe (turbo)',
 'buick skyhawk',
 'buick skylark limited',
 'capri ii',
 'chevroelt chevelle malibu',
 'chevrolet camaro',
 'chevrolet caprice classic',
 'chevrolet cavalier 2-door',
 'chevrolet chevelle malibu',
 'chevrolet chevelle malibu classic',
 'chevrolet chevette',
 'chevrolet citation',
 'chevrolet impala',
 'chevrolet monte carlo landau',
 'chevrolet monte carlo s',
 'chevrolet monza 2+2',
 'chevrolet nova',
 'chevrolet nova custom',
 'chevrolet vega',
 'chevrolet vega (sw)',
 'chevrolet vega 2300',
 'chevrolet woody',
 'chevy c20',
 'chevy s-10',
 'datsun 1200',
 'datsun 200-sx',
 'datsun 200sx',


{'amc ambassador sst',
 'amc concord',
 'amc concord dl 6',
 'amc gremlin',
 'amc hornet',
 'amc hornet sportabout (sw)',
 'amc matador',
 'amc matador (sw)',
 'audi 100 ls',
 'audi 100ls',
 'audi 5000',
 'audi fox',
 'buick century 350',
 'buick century luxus (sw)',
 'buick electra 225 custom',
 'buick estate wagon (sw)',
 'buick opel isuzu deluxe',
 'buick skylark',
 'buick skylark 320',
 'cadillac eldorado',
 'cadillac seville',
 'chevrolet bel air',
 'chevrolet cavalier',
 'chevrolet cavalier wagon',
 'chevrolet chevelle concours (sw)',
 'chevrolet chevelle malibu classic',
 'chevrolet chevette',
 'chevrolet citation',
 'chevrolet concours',
 'chevrolet malibu',
 'chevrolet malibu classic (sw)',
 'chevrolet monte carlo',
 'chevrolet monte carlo landau',
 'chevrolet nova',
 'chevrolet vega',
 'chevy c10',
 'chrysler cordoba',
 'chrysler lebaron medallion',
 'chrysler lebaron salon',
 'chrysler lebaron town @ country (sw)',
 'chrysler new yorker brougham',
 'chrysler newport royal',


{'amc ambassador sst',
 'amc concord',
 'amc concord dl 6',
 'amc gremlin',
 'amc hornet',
 'amc hornet sportabout (sw)',
 'amc matador',
 'amc matador (sw)',
 'audi 100 ls',
 'audi 100ls',
 'audi 5000',
 'audi fox',
 'buick century 350',
 'buick century luxus (sw)',
 'buick electra 225 custom',
 'buick estate wagon (sw)',
 'buick opel isuzu deluxe',
 'buick skylark',
 'buick skylark 320',
 'cadillac eldorado',
 'cadillac seville',
 'chevrolet bel air',
 'chevrolet cavalier',
 'chevrolet cavalier wagon',
 'chevrolet chevelle concours (sw)',
 'chevrolet chevelle malibu classic',
 'chevrolet chevette',
 'chevrolet citation',
 'chevrolet concours',
 'chevrolet malibu',
 'chevrolet malibu classic (sw)',
 'chevrolet monte carlo',
 'chevrolet monte carlo landau',
 'chevrolet nova',
 'chevrolet vega',
 'chevy c10',
 'chrysler cordoba',
 'chrysler lebaron medallion',
 'chrysler lebaron salon',
 'chrysler lebaron town @ country (sw)',
 'chrysler new yorker brougham',
 'chrysler newport royal',


Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,company
0,0,29.0,4,135.0,84.0,2525.0,16.0,82,1,dodge aries se,dodge
1,3,31.9,4,89.0,71.0,1925.0,14.0,79,2,vw rabbit custom,vw
2,9,19.0,6,156.0,108.0,2930.0,15.5,76,3,toyota mark ii,toyota
3,11,28.0,4,90.0,75.0,2125.0,14.5,74,1,dodge colt,dodge
4,13,37.7,4,89.0,62.0,2050.0,17.3,81,3,toyota tercel,toyota
...,...,...,...,...,...,...,...,...,...,...,...
194,384,40.8,4,85.0,65.0,2110.0,19.2,80,3,datsun 210,datsun
195,385,20.2,8,302.0,139.0,3570.0,12.8,78,1,mercury monarch ghia,mercury
196,387,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,amc
197,395,43.4,4,90.0,48.0,2335.0,23.7,80,2,vw dasher (diesel),vw


Unnamed: 0,id,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,company
0,1,6,145.0,76.0,3160.0,19.6,81,2,volvo diesel,volvo
1,2,6,250.0,98.0,3525.0,19.0,77,1,ford granada,ford
2,4,4,119.0,92.0,2434.0,15.0,80,3,datsun 510 hatchback,datsun
3,5,6,258.0,110.0,2962.0,13.5,71,1,amc hornet sportabout (sw),amc
4,6,4,97.0,88.0,2100.0,16.5,72,3,toyota corolla 1600 (sw),toyota
...,...,...,...,...,...,...,...,...,...,...
194,391,4,114.0,91.0,2582.0,14.0,73,2,audi 100ls,audi
195,392,4,156.0,105.0,2800.0,14.4,80,1,dodge colt,dodge
196,393,4,111.0,80.0,2155.0,14.8,77,1,buick opel isuzu deluxe,buick
197,394,8,400.0,180.0,4220.0,11.1,77,1,pontiac grand prix lj,pontiac


In [228]:
# Model inputs are the columns from "cylinders" through "origin"
# (a label slice, so both end columns are included).
feature_columns = slice("cylinders", "origin")

x_train = train.loc[:, feature_columns]
y_train = train["mpg"]

x_test = test.loc[:, feature_columns]
id_test = test["id"]

In [229]:
# Sanity check: print any training rows whose horsepower is still missing.
print(train.loc[train["horsepower"].isnull()])

Empty DataFrame
Columns: [id, mpg, cylinders, displacement, horsepower, weight, acceleration, model year, origin, car name]
Index: []


In [242]:
# LightGBM hyper-parameters, kept in one mapping so they are easy to scan and tweak.
lgbm_params = {
    'bagging_seed': 1,
    'feature_fraction': 0.4,
    'feature_fraction_seed': 23,
    'learning_rate': 0.1,
    'max_bin': 15,
    'min_data_in_leaf': 28,
    'min_sum_hessian_in_leaf': 5,
    'n_estimators': 80,
    'num_leaves': 8,
    'objective': 'regression',
}
lgbm = LGBM.LGBMRegressor(**lgbm_params)

# 5-fold cross-validation with the estimator's default scorer; report the mean over folds.
scores = cross_val_score(lgbm, x_train, y_train, cv=5)
print(f'Train score: {np.mean(scores):.3f}')

Train score: 0.861


In [244]:
# Ensemble members: one linear model, four random forests and the LightGBM
# model configured earlier in the notebook.
# Each forest gets its own fixed random_state: without one the forests are
# re-randomised on every run, so the cross-validation score and the fitted
# ensemble would not be reproducible.  Distinct seeds keep the four forests
# different from one another.
estimator = [('LinearRegression', LinearRegression())]
estimator += [
    ('RandomForestRegressor{}'.format(seed),
     RandomForestRegressor(n_estimators=80, random_state=seed))
    for seed in range(4)
]
estimator.append(('LGBMRegressor0', lgbm))

VR = VotingRegressor(estimator)

# 5-fold cross-validation with the estimator's default scorer; report the mean over folds.
scores = cross_val_score(VR, x_train, y_train, cv=5)
print('Train score: {:.3f}'.format(np.mean(scores)))

# Final fit on the full training set; kept as the last expression so the
# fitted estimator is displayed as the cell output.
VR.fit(x_train, y_train)

Train score: 0.865


VotingRegressor(estimators=[('LinearRegression',
                             LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=None, normalize=False)),
                            ('RandomForestRegressor0',
                             RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   m...
                                        

In [245]:
import csv

# Predict the test set and pair every prediction with its row id.
y = VR.predict(x_test)
submit = [[i, p] for i, p in zip(id_test, y)]

# Write one "id,prediction" row per line, without a header.
# newline="" disables the platform's newline translation (as the csv module
# documentation requires), so every row ends with exactly the "\n" requested
# below.  No encoding is specified, so the platform default is used.
with open("submit.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")  # rows are separated by a bare newline
    writer.writerows(submit)

In [249]:
# Root-mean-square error of the fitted ensemble on the data it was trained on
# (an in-sample figure, not a held-out estimate).
predict = VR.predict(x_train)
residuals = y_train - predict
sum_of_squares = np.sum(np.square(residuals))
estimate = np.sqrt(1/len(predict) * sum_of_squares)
print(estimate)

1.54629539582304
