In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
# Read 02_CarPrice.csv, decoding it as Shift_JIS with pandas' Python parser engine.
df = pd.read_csv(
    '02_CarPrice.csv',
    engine='python',
    encoding='shift_jis',
)

In [None]:
# Preview the first record of the freshly loaded frame.
df.head(n=1)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

# 前処理


### LabelEncoding

In [None]:
# Accumulators for the label-encoding loop that follows:
#   le          -> the encoders, one per encoded column
#   enc_columns -> the class labels found by those encoders, in column order
le, enc_columns = [], []

In [None]:
# Label-encode the first four columns of df in place.
# For each column: append a new LabelEncoder to `le`, fit the encoder stored at
# position i, record its classes in `enc_columns`, and overwrite the column with
# the encoded values.
# NOTE: this cell mutates df and grows le / enc_columns, so re-running it without
# first reloading df and resetting the accumulators re-encodes encoded data.
for i in range(4):
    le = np.append(le, LabelEncoder())
    encoder = le[i]
    column = df.iloc[:, i]
    encoder.fit(column)
    enc_columns = np.append(enc_columns, encoder.classes_)
    df.iloc[:, i] = encoder.transform(column)

In [None]:
# Check the first record after the first four columns were overwritten with encoded values.
df.head(n=1)

### OneHotEncoding

In [None]:
# One-hot encoder created with its default settings; it is fitted in the next cell.
one_hot_encoder = OneHotEncoder()

In [None]:
# Fit the one-hot encoder on the first four (label-encoded) columns.
categorical_block = df.iloc[:, :4]
one_hot_encoder.fit(categorical_block)

In [None]:
# One-hot encode the first four columns, then convert the encoder output
# to a dense array via .toarray().
encoded_matrix = one_hot_encoder.transform(df.iloc[:, :4])
enc_data = encoded_matrix.toarray()

In [None]:
# Wrap the dense one-hot array in a DataFrame (default integer column labels for now).
enc_df = pd.DataFrame(data=enc_data)

In [None]:
# Preview the one-hot frame before its columns are renamed.
enc_df.head(n=1)

In [None]:
# Label the one-hot columns with the class names collected during label encoding.
# NOTE(review): assumes the one-hot output columns follow the same
# column-then-class order as enc_columns — confirm.
enc_df.columns = enc_columns
enc_df.head(n=1)

In [None]:
# Reassemble the frame column-wise: the columns from the fifth up to (but not
# including) the last one, then the one-hot block, then the last column,
# which is later used as the target.
remaining_features = df.iloc[:, 4:-1]
last_column = df.iloc[:, -1]
df_post_enc = pd.concat([remaining_features, enc_df, last_column], axis=1)

In [None]:
# Preview the reassembled frame.
df_post_enc.head(n=1)

### 標準化

In [None]:
# Scaler created with default settings; it is fitted on the feature columns in the next cell.
sc = StandardScaler()

In [None]:
# Fit the scaler on every column except the last one (the target).
feature_block = df_post_enc.iloc[:, :-1]
sc.fit(feature_block)

In [None]:
# Scale every column except the last and wrap the result in a new DataFrame
# (column labels are restored in a later cell).
scaled_values = sc.transform(df_post_enc.iloc[:, :-1])
df_post_sc = pd.DataFrame(scaled_values)

In [None]:
# Append the unscaled last column (the target) to the scaled features.
# NOTE: df_post_sc is both input and output here, so running this cell twice in
# a row appends the target twice; re-run the preceding cell first.
unscaled_target = df_post_enc.iloc[:, -1]
df_post_sc = pd.concat([df_post_sc, unscaled_target], axis=1)

In [None]:
# Preview the scaled frame before its column labels are restored.
df_post_sc.head(n=1)

In [None]:
# Reuse the column labels of the pre-scaling frame for the scaled frame.
df_post_sc.columns = df_post_enc.columns

In [None]:
# Preview the scaled frame with its column labels restored.
df_post_sc.head(n=1)

### 主成分分析

##### 最適な主成分の探索

In [None]:
# State for the component-count search performed in the next cell.
pca = []  # PCA models, appended by the search loop
# Candidate component counts 1 .. (number of columns - 1). One column is the
# target, so the column count itself serves as np.arange's exclusive stop value.
num_pc = np.arange(1, df_post_sc.shape[1])
# One row per candidate: [position in pca, n_components, summed explained variance ratio]
result = np.empty((0, 3))
cnt = 0  # running position of the next model in pca

In [None]:
# Fit one PCA per candidate component count on the feature columns (all but the
# last) and log the total explained-variance ratio each one reaches.
for i in num_pc:
    pca = np.append(pca, PCA(n_components=i))
    model = pca[cnt]
    model.fit(df_post_sc.iloc[:, :-1])
    explained_total = model.explained_variance_ratio_.sum()
    row = np.array([[cnt, i, explained_total]])
    result = np.append(result, row, axis=0)
    cnt += 1

In [None]:
print(result)  # About 21 principal components looks appropriate.

##### 最適な主成分数でデータ変換

In [None]:
# The search stores the model with k components at position k - 1, so position 20
# is the PCA fitted with 21 components (the count judged appropriate above).
chosen_position = 20
best_pca = pca[chosen_position]

In [None]:
# Project the scaled feature columns (all but the last) with the chosen PCA.
projected = best_pca.transform(df_post_sc.iloc[:, :-1])
df_post_pca = pd.DataFrame(projected)

In [None]:
# Append the last column of the scaled frame (the target) to the projected features.
# NOTE: df_post_pca is both input and output here, so running this cell twice in
# a row appends the target twice; re-run the preceding cell first.
target_column = df_post_sc.iloc[:, -1]
df_post_pca = pd.concat([df_post_pca, target_column], axis=1)

In [None]:
# Preview the PCA-transformed frame.
df_post_pca.head(n=1)

# 学習（回帰）～評価のグリッドサーチ

### 特徴変数と目的変数の設定

In [None]:
# Features are every column but the last; the target is the last column.
X, y = df_post_pca.iloc[:, :-1], df_post_pca.iloc[:, -1]

### 探索パラメータの設定 & 手法の読み込み

In [None]:
# Candidate regularisation strengths for the grid search, from 1e-4 up to 1e4
# in factor-of-ten steps.
alpha_candidates = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
param_grid = dict(alpha=alpha_candidates)

In [None]:
# Lasso regressor with default settings; alpha is tuned by the grid search below.
reg_l1 = Lasso()

### グリッドサーチ読み込み、実行

In [None]:
# Grid search over param_grid for the Lasso estimator, scored with negated MSE,
# with cv=10 and n_jobs=-1.
gs = GridSearchCV(
    estimator=reg_l1,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the PCA features and the target.
gs.fit(X, y)

In [None]:
gs.best_score_  # MSE is a lower-is-better metric, so neg_mean_squared_error multiplies it by -1 and the model with the largest score is treated as the best.

In [None]:
# RMSE: the best score is a negated MSE, so take its magnitude before the square root.
best_mse = np.abs(gs.best_score_)
np.sqrt(best_mse)

In [None]:
# Tabulate the grid-search results held in gs.cv_results_.
gs_result = pd.DataFrame(data=gs.cv_results_)

In [None]:
# Show the candidates ordered by their test-score rank (ascending).
gs_result.sort_values('rank_test_score')

In [None]:
# Keep a handle on the estimator the grid search selected as best.
best_model = gs.best_estimator_

In [None]:
# Fit the selected estimator on the full feature matrix and target.
# NOTE(review): GridSearchCV was created without overriding `refit`, so this
# estimator has presumably already been refit on all of X / y — confirm whether
# this extra fit is needed.
best_model.fit(X, y)

# シリアライズ

In [None]:
# Serialize every preprocessing step and the final model, one pickle file each.
# File names, pickle protocol (3) and write order are unchanged.
artifacts = {
    'LabelEncoder.pkl': le,
    'OneHotEncoder.pkl': one_hot_encoder,
    'StandardScaler.pkl': sc,
    'PCA.pkl': best_pca,
    'reg_l1.pkl': best_model,
}
for file_name, artifact in artifacts.items():
    # Open each file in a context manager so the handle is flushed and closed
    # deterministically, even if pickling raises. The bare open(...) calls used
    # before left closing to garbage collection.
    with open(file_name, 'wb') as handle:
        pickle.dump(artifact, handle, protocol=3)