# ０．必要なデータ、パッケージのimport

In [1]:
import pandas as pd
import numpy as np
from numpy import nan
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
from sklearn import preprocessing

In [None]:
all_df= pd.read_csv('train_new.csv')
all_df.head()

# １．データの内容の確認

In [None]:
#columnsの確認
all_df.columns

In [None]:
#trainデータの大きさの確認
print(all_df.shape)

In [None]:
#trainデータの欠損値の確認
all_df.isnull().sum()

In [None]:
#目的変数の分布
plt.hist(all_df["y"],bins=18)

In [None]:
#目的変数の分布(J1,J2別)
j1_df = all_df[all_df["stage"] == "Ｊ１"]
j2_df = all_df[all_df["stage"] == "Ｊ２"]
labels = ['J1', 'J2']
plt.hist([j1_df["y"], j2_df["y"]], histtype="barstacked",bins=18,label=labels)
plt.legend()

→全体としてJ1とJ2でデータが全然違うので後々分けようと思う

# １．とりあえず数値化
→ほとんどのcolumnsが数値ではない。
とりあえず一通り数値にしてみる。
⇒とりあえず提出する

|columns|前処理方法|
|---|---|
|year | そのまま |
|stage| J1→0,J2→1 |
|match| 第〇説かどうかのみ抜き出す|
|gameday|月(month)と曜日(day)を抽出|
|time|〇時(hour)の部分だけ抽出|
|home,away,stadium|マッピング|
|tv |'NHK総合'を含む(0),'NHK'を含む(1),'BS'を含む(2),その他(3)とする|
|   | tvの数も抽出(tv_count)|
|weather|"雨"もしくは"雪"を含む(0),"晴"を含む(2),'屋内'を含む(3),その他(1)とする|
|tempreture|そのまま|
|humidity|〇％のみ抽出|
|capa|そのまま|
|home_score,away_score|そのまま|
|home_team,away_team|マッピング|
|home_01~home_11,away_01~away_11|マッピング|
|address|都道府県のみマッピング|

In [None]:
def preprocessing_1(all_df):
    all_df_mapping = all_df
    #year → そのまま
    #stage → のちにわけるのでマッピングせず
    #match → 第〇説かどうかのみ抜き出す
    all_df_mapping['match'] = all_df['match'].apply(lambda x: x[1:].rsplit('節', 1)[0])
    #gameday → 月(month)と曜日(day)を抽出
    #       → 曜日(day)を月(0),火(1),水(2),木(3),金(4),土(5),日(6),祝休(7)でマッピング
    all_df_mapping['day'] = all_df['gameday'].apply(lambda x: x[:-1].rsplit('(', 1)[-1])
    day_mapping = {'月':0, '火':1, '水':2, '木':3,'金':4,'土':5,'日':6}
    all_df_mapping['day'] = all_df['day'].map(day_mapping)
    all_df_mapping['day'] = all_df["day"].fillna(7)
    all_df_mapping['month'] = all_df['gameday'].apply(lambda x: x[:2])
    #time → 〇時(hour)の部分だけ抽出
    all_df_mapping['time'] =all_df['time'].apply(lambda x: x.split(':', 1)[0])
    #tv → 'NHK総合'を含む(0),'NHK'を含む(1),'BS'を含む(2),その他(3)とする
    #   → tvの数も抽出(tv_count)
    for i in range(len(all_df)):
        all_df_mapping.loc[i,"tv_count"] = all_df.loc[i,"tv"].count("／") + 1
        if 'ＮＨＫ総合' in all_df.loc[i,'tv']:
            all_df_mapping.loc[i,'tv']=0
        elif 'ＮＨＫ' in all_df.loc[i,'tv']:
            all_df_mapping.loc[i,'tv']=1
        elif  'ＢＳ' in all_df.loc[i,'tv']:
            all_df_mapping.loc[i,'tv']=1
        else:
            all_df_mapping.loc[i,'tv']=2
    #weather → "雨"もしくは"雪"を含む(0),"晴"を含む(2),'屋内'を含む(3),その他(1)とする
    for i in range(len(all_df)):
        if '雨' in all_df.loc[i,'weather']:
            all_df_mapping.loc[i,'weather']=0
        elif '雪' in all_df.loc[i,'weather']:
            all_df_mapping.loc[i,'weather']=0
        elif  '晴' in all_df.loc[i,'weather']:
            all_df_mapping.loc[i,'weather']=2
        elif  '屋内' in all_df.loc[i,'weather']:
            all_df_mapping.loc[i,'weather']=3
        else:
            all_df_mapping.loc[i,'weather']=1
    #tempreture → そのまま
    #humidity → 〇％のみ抽出
    all_df_mapping['humidity'] = all_df['humidity'].apply(lambda x: x[:-1].rsplit('%', 1)[-1])
    #capa → そのまま
    #home_score,away_score → そのまま
    #address → 都道府県でマッピング
    all_df_mapping['address'] = all_df['address'].apply(lambda x: x[:2])
    #home_01~home_11,away_01~away_11 → マッピング
    #home,away,stadium → マッピング
    for column in ['home','stage','away','stadium','referee', 'home_01', 'home_02', 'home_03',
           'home_04', 'home_05', 'home_06', 'home_07', 'home_08', 'home_09',
           'home_10', 'home_11', 'away_01', 'away_02', 'away_03',
           'away_04', 'away_05', 'away_06', 'away_07', 'away_08', 'away_09',
           'away_10', 'away_11','address']:
        le = preprocessing.LabelEncoder()
        le.fit(all_df_mapping[column])
        all_df_mapping[column] = le.transform(all_df_mapping[column])
    #数値でない変数、いらない変数の削除
    all_df_mapping = all_df_mapping.drop(['id', 'gameday','home_team', 'away_team'],axis=1)
    return all_df_mapping

In [None]:
all_df_1 = preprocessing_1(all_df)
all_df_1.head()

In [None]:
y=pd.DataFrame(all_df_1, columns=['y'])
y.head()

In [None]:
x=all_df_1.drop('y',axis=1)
x.head()

In [12]:
def linear_regression(x,y):
    global lr,model_quad_2,model_cubic_3,quad,cubic
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    ##1次回帰
    # 重回帰の線形回帰を実行
    lr = LinearRegression()
    #ホールドアウト法で分割
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
    # 重回帰線形モデルの学習
    lr.fit(x_train, y_train)

    ##２次回帰
    # 2次関数の変数
    from sklearn.preprocessing import PolynomialFeatures
    quad = PolynomialFeatures(degree=2)
    # 生成した基底関数で変数変換を実行
    x_quad = quad.fit_transform(x)
    # ホールドアウト法で分割
    x_quad_train, x_quad_test,y_quad_train, y_quad_test = train_test_split(x_quad, y, test_size = 0.3, random_state = 0)
    # 線形回帰による学習
    model_quad_2 = LinearRegression()
    model_quad_2.fit(x_quad_train, y_quad_train)

    ##3次回帰
    # 3次関数の変数変換
    cubic = PolynomialFeatures(degree=3)
    # 生成した基底関数で変数変換を実行
    x_cubic = cubic.fit_transform(x)
    # ホールドアウト法で分割
    x_cubic_train, x_cubic_test,y_cubic_train, y_cubic_test = train_test_split(x_cubic, y, test_size = 0.3, random_state = 0)
    # 線形回帰による学習
    model_cubic_3 = LinearRegression()
    model_cubic_3.fit(x_cubic_train, y_cubic_train)
    
    ## 自由度調整済み決定係数
    # (決定係数, trainまたはtestのサンプル数, 利用した特徴量の数)
    def adjusted(score, n_sample, n_features):
        adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
        return adjusted_score
    print('[自由度調整済み決定係数]')
    #1次関数
    print('1次')
    print('train: %3f' % adjusted(lr.score(x_train, y_train), len(y_train), 2))
    print('test : %3f' % adjusted(lr.score(x_test, y_test), len(y_test), 2))
    # 2次関数
    print('2次')
    print('train: %.3f' % adjusted(model_quad_2.score(x_quad_train, y_quad_train), len(y_quad_train), 3))
    print('test : %.3f' % adjusted(model_quad_2.score(x_quad_test, y_quad_test), len(y_quad_test), 3))
    # 3次関数
    print('3次')
    print('train: %.3f' % adjusted(model_cubic_3.score(x_cubic_train, y_cubic_train), len(y_cubic_train), 3))
    print('test : %.3f' % adjusted(model_cubic_3.score(x_cubic_test, y_cubic_test), len(y_cubic_test), 3))
    print('')
    
    #RMSE
    #MSEを出力する関数を読み込む
    from sklearn.metrics import mean_squared_error as mse
    print('[RMSE]')
    #1次関数
    print('1次')
    print('train: %.3f' % (mse(y_train, lr.predict(x_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_test, lr.predict(x_test)) ** (1/2)))
    #2次関数
    print('2次')
    print('train: %.3f' % (mse(y_quad_train, model_quad_2.predict(x_quad_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_quad_test, model_quad_2.predict(x_quad_test)) ** (1/2)))
    #3次関数
    print('3次')
    print('train: %.3f' % (mse(y_cubic_train, model_cubic_3.predict(x_cubic_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_cubic_test, model_cubic_3.predict(x_cubic_test)) ** (1/2)))

In [None]:
linear_regression(x,y)

In [31]:
def submit(model,test_df,x_test):
    y_pred=model.predict(x_test).reshape(-1)
    submit_file = pd.DataFrame({'id' : test_df['id'], 'y' : y_pred})
    submit_file.to_csv('submit.csv', index = False, header = False)

In [None]:
test_df= pd.read_csv('test_new.csv')
test_df_1 = preprocessing_1(test_df)
submit(lr,test_df,test_df_1)

RMSE→6,024.96845、768位/930人中

# ２．改善１
#### ①意味のないマッピングデータを意味のあるものにする or 削除
選手名→日本代表の数、
チーム→順位を代入、
ホームチーム→ホームの試合での平均観客数を使用、
レフェリー→削除
#### ②処理の改善
ただの重回帰分析を改善する。ポイントは多い変数を選択していくこと。
#### ③その他一部改善
数値の標準化、外れ値の処理

In [2]:
#代表リストを作る
daihyo_df = pd.read_csv('daihyo.csv')
daihyo_df.head()
#初出場と最終出場が2012-2014の間の選手のみ抽出
daihyo_df["初出場"] = daihyo_df["初出場"].apply(lambda x: int(x[:4]))
daihyo_df["最終出場"] = daihyo_df["最終出場"].apply(lambda x: int(x[:4]))
daihyo_df_1 = daihyo_df[daihyo_df.初出場 < 2014]
daihyo_df_1 = daihyo_df_1[daihyo_df.最終出場 > 2012]
daihyo_df_1.index = np.arange(len(daihyo_df_1))
daihyo_df_1.head()
#df→リストにする
daihyo_list = daihyo_df_1["選手名"].values.tolist()

  


In [21]:
#homeとawayそれぞれチームと順位を合わせたDataFrameを作る。
team_df = pd.read_csv('team_1.csv')
home_df = team_df.loc[:,["順位","チーム","年","ステージ"]]
home_df.columns =["home_rank","home","year","stage"]
away_df = team_df.loc[:,["順位","チーム","年","ステージ"]]
away_df.columns =["away_rank","away","year","stage"]
home_df.head()

Unnamed: 0,home_rank,home,year,stage
0,1,ガンバ大阪,2014,Ｊ１
1,2,浦和レッズ,2014,Ｊ１
2,3,鹿島アントラーズ,2014,Ｊ１
3,4,柏レイソル,2014,Ｊ１
4,5,サガン鳥栖,2014,Ｊ１


In [4]:
#平均観客数のDataFrameを作成
attendance_df = pd.read_csv('attendance.csv')
attendance_df.columns =["home","year","attendance"]
attendance_df.head()

Unnamed: 0,home,year,attendance
0,コンサドーレ札幌,2011,10482.0
1,ベガルタ仙台,2011,15656.0
2,モンテディオ山形,2011,9325.0
3,鹿島アントラーズ,2011,16156.0
4,浦和レッズ,2011,33910.0


In [22]:
def preprocessing_2(df):
    df_2 = df
    #名字と名前の間の空白をなくす
    for i in range(1,12):
        df_2["home_{:02}".format(i)] = df_2["home_{:02}".format(i)].apply(lambda x: x.replace("　",""))
        df_2["away_{:02}".format(i)] = df_2["away_{:02}".format(i)].apply(lambda x: x.replace("　",""))
    #home_01~11,away01~11→True,False
    for i in range(len(df_2)):
        for j in range(1,12):
            df_2.loc[i,"home_{:02}".format(j)] = df_2.loc[i,"home_{:02}".format(j)] in daihyo_list
            df_2.loc[i,"away_{:02}".format(j)] = df_2.loc[i,"away_{:02}".format(j)] in daihyo_list
    #daihyo
    df_2["daihyo"] = (df_2==True).sum(axis=1)
    #home_rank,away_rank,difference
    df_2 = pd.merge(df_2,home_df,on=["home","year","stage"],how="left")
    df_2 = pd.merge(df_2,away_df,on=["away","year","stage"],how="left")
    df_2["difference"] = abs(df_2["home_rank"]-df_2["away_rank"])
    #home_rank,away_rank,difference
    df_2 = pd.merge(df_2,attendance_df,on=["home","year"],how="left")
    #year → そのまま
    #match → 第〇説かどうかのみ抜き出す
    df_2['match'] = df_2['match'].apply(lambda x: int(x[1:].rsplit('節', 1)[0]))
    #gameday → 月(month)と曜日(day)を抽出
    #       → 曜日(day)を月(0),火(1),水(2),木(3),金(4),土(5),日(6),祝休(7)でマッピング
    df_2['day'] = df_2['gameday'].apply(lambda x: x[:-1].rsplit('(', 1)[-1])
    day_mapping = {'月':0, '火':1, '水':2, '木':3,'金':4,'土':5,'日':6}
    df_2['day'] = df_2['day'].map(day_mapping)
    df_2['day'] = df_2["day"].fillna(7)
    df_2['month'] = df_2['gameday'].apply(lambda x: int(x[:2]))
    #time → 〇時(hour)の部分だけ抽出
    df_2['time'] = df_2['time'].apply(lambda x: int(x.split(':', 1)[0]))
    #tv → 'NHK総合'を含む(0),'NHK'を含む(1),'BS'を含む(2),その他(3)とする
    #   → tvの数も抽出(tv_count)
    for i in range(len(df_2)):
        df_2.loc[i,"tv_count"] = int(df_2.loc[i,"tv"].count("／") + 1)
        if 'ＮＨＫ総合' in df_2.loc[i,'tv']:
            df_2.loc[i,'tv']=0
        elif 'ＮＨＫ' in df_2.loc[i,'tv']:
            df_2.loc[i,'tv']=1
        elif  'ＢＳ' in df_2.loc[i,'tv']:
            df_2.loc[i,'tv']=1
        else:
            df_2.loc[i,'tv']=2
    #weather → "雨"もしくは"雪"を含む(0),"晴"を含む(2),'屋内'を含む(3),その他(1)とする
    for i in range(len(df_2)):
        if '雨' in df_2.loc[i,'weather']:
            df_2.loc[i,'weather']=0
        elif '雪' in df_2.loc[i,'weather']:
            df_2.loc[i,'weather']=0
        elif  '晴' in df_2.loc[i,'weather']:
            df_2.loc[i,'weather']=2
        elif  '屋内' in df_2.loc[i,'weather']:
            df_2.loc[i,'weather']=3
        else:
            df_2.loc[i,'weather']=1
    #tempreture → そのまま
    #humidity → 〇％のみ抽出
    df_2['humidity'] = df_2['humidity'].apply(lambda x: int(x[:-1].rsplit('%', 1)[-1]))
    #capa → そのまま
    #address → 都道府県でマッピング
    df_2['address'] = df_2['address'].apply(lambda x: x[:2])
    #stadium,stage → マッピング
    for column in ['stage','stadium','address']:
        le = preprocessing.LabelEncoder()
        le.fit(df_2[column])
        df_2[column] = le.transform(df_2[column])
    #もう使わない変数、数値じゃない変数を削除
    df_2 = df_2.drop( ['id', 'gameday', 'home', 'away', 'referee', 'home_team', 'home_01', 'home_02', 'home_03', 'home_04',
       'home_05', 'home_06', 'home_07', 'home_08', 'home_09', 'home_10',
       'home_11','away_team', 'away_01', 'away_02', 'away_03', 'away_04', 'away_05',
       'away_06', 'away_07', 'away_08', 'away_09', 'away_10', 'away_11'],axis=1)
    return(df_2)

In [23]:
all_df= pd.read_csv('train_new.csv')
all_df_2 = preprocessing_2(all_df)
all_df_2.head()

Unnamed: 0,y,year,stage,match,time,stadium,tv,home_score,away_score,weather,...,address,capa,daihyo,home_rank,away_rank,difference,attendance,day,month,tv_count
0,18250,2012,0,1,14,15,0,1,0,0,...,8,19694,2,2,11,9,16600.0,5.0,3,4.0
1,24316,2012,0,1,14,48,1,1,0,3,...,17,40000,2,7,9,2,17155.0,5.0,3,4.0
2,17066,2012,0,1,14,17,1,2,3,0,...,7,21000,4,17,16,1,14778.0,5.0,3,4.0
3,29603,2012,0,1,14,3,1,1,0,0,...,14,50000,9,1,3,2,17721.0,5.0,3,4.0
4,25353,2012,0,1,14,36,1,0,0,3,...,3,39232,3,18,12,6,12008.0,5.0,3,4.0


In [24]:
#外れ値処理
for i in range(len(all_df_2)):
    if all_df_2.loc[i,'y']==0 or all_df_2.loc[i,'y']>=50000:
        all_df_2=all_df_2.drop(i,axis=0)
        print(i)
all_df_2.index = np.arange(len(all_df_2))

298
784
1059
1066
1567
1654


In [25]:
y=pd.DataFrame(all_df_2, columns=['y'])
x=all_df_2.drop('y',axis=1)

In [26]:
#標準化
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_std = ss.fit_transform(x)

In [13]:
linear_regression(x_std,y)

[自由度調整済み決定係数]
1次
train: 0.831454
test : 0.789162
2次
train: 0.900
test : 0.791
3次
train: 1.000
test : -0.543

[RMSE]
1次
train: 3176.949
test : 3611.003
2次
train: 2447.657
test : 3591.904
3次
train: 0.000
test : 9759.195


#### Ridge回帰

In [39]:
def ridge(x,y):
    global model_ridge,model_ridge_2,model_ridge_3
    # Ridge回帰のクラスを読み込み
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures

    ##1次回帰
    #ホールドアウト法で分割
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
    # Ridge回帰のインスタンスを生成
    model_ridge = Ridge(alpha=1000)
    # 重回帰線形モデルの学習
    model_ridge.fit(x_train, y_train)

    ##２次回帰
    # 2次の多項式規定を生成
    quad = PolynomialFeatures(degree=2)
    # 生成した基底関数で変数変換を実行
    x_quad = quad.fit_transform(x)
    #ホールドアウト法で分割
    x_quad_train, x_quad_test,y_quad_train, y_quad_test = train_test_split(x_quad, y, test_size = 0.3, random_state = 0)
    # Ridge回帰のインスタンスを生成
    model_ridge_2 = Ridge(alpha=1000)
    # 線形回帰による学習
    model_ridge_2.fit(x_quad_train, y_quad_train)

#     ##3次回帰
#     # 3次の多項式規定を生成
#     cubic = PolynomialFeatures(degree=3)
#     # 生成した基底関数で変数変換を実行
#     x_cubic = cubic.fit_transform(x)
#     #ホールドアウト法で分割
#     x_cubic_train, x_cubic_test,y_cubic_train, y_cubic_test = train_test_split(x_cubic, y, test_size = 0.3, random_state = 0)
#     # Ridge回帰のインスタンスを生成
#     model_ridge_3 = Ridge(alpha=1000)
#     # 線形回帰による学習
#     model_ridge_3.fit(x_cubic_train, y_cubic_train)
    
    # 自由度調整済み決定係数
    # (決定係数, trainまたはtestのサンプル数, 利用した特徴量の数)
    def adjusted(score, n_sample, n_features):
        adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
        return adjusted_score
    print('[自由度調整済み決定係数]')
    #1次関数
    print('1次')
    print('train: %3f' % adjusted(model_ridge.score(x_train, y_train), len(y_train), 2))
    print('test : %3f' % adjusted(model_ridge.score(x_test, y_test), len(y_test), 2))
    # 2次関数
    print('2次')
    print('train: %.3f' % adjusted(model_ridge_2.score(x_quad_train, y_quad_train), len(y_quad_train), 3))
    print('test : %.3f' % adjusted(model_ridge_2.score(x_quad_test, y_quad_test), len(y_quad_test), 3))
#     # 3次関数
#     print('3次')
#     print('train: %.3f' % adjusted(model_ridge_3.score(x_cubic_train, y_cubic_train), len(y_cubic_train), 3))
#     print('test : %.3f' % adjusted(model_ridge_3.score(x_cubic_test, y_cubic_test), len(y_cubic_test), 3))
#     print('')
    
    #RMSE
    # MSEを出力する関数を読み込む
    from sklearn.metrics import mean_squared_error as mse
    print("[RMSE]")
    #1次関数
    print('1次')
    print('train: %.3f' % (mse(y_train, model_ridge.predict(x_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_test, model_ridge.predict(x_test)) ** (1/2)))
    #2次関数
    print('2次')
    print('train: %.3f' % (mse(y_quad_train, model_ridge_2.predict(x_quad_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_quad_test, model_ridge_2.predict(x_quad_test)) ** (1/2)))
#     #3次関数
#     print('3次')
#     print('train: %.3f' % (mse(y_cubic_train, model_ridge_3.predict(x_cubic_train)) ** (1/2)))
#     print('test : %.3f' % (mse(y_cubic_test, model_ridge_3.predict(x_cubic_test)) ** (1/2)))

In [28]:
ridge(x_std,y)

[自由度調整済み決定係数]
1次
train: 0.764304
test : 0.729167
2次
train: 0.830
test : 0.748
3次
train: 0.938
test : 0.758

[RMSE]
1次
train: 3756.879
test : 4092.647
2次
train: 3185.547
test : 3945.339
3次
train: 1923.419
test : 3866.884


#### Lasso

In [16]:
def lasso(x,y):
    global model_lasso_1,model_lasso_2,model_lasso_3
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures
    # データを学習させる（１次）
    model_lasso_1= Lasso(alpha=1000)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
    model_lasso_1.fit(x_train, y_train)
    # データを学習させる（２次）
    model_lasso_2= Lasso(alpha=1000)
    quad = PolynomialFeatures(degree=2)
    x_quad = quad.fit_transform(x)
    x_quad_train, x_quad_test,y_quad_train, y_quad_test = train_test_split(x_quad, y, test_size = 0.3, random_state = 0)
    model_lasso_2.fit(x_quad_train,y_quad_train)
    # データを学習させる（３次）
    model_lasso_3= Lasso(alpha=1000)
    cubic = PolynomialFeatures(degree=3)
    x_cubic = cubic.fit_transform(x)
    x_cubic_train, x_cubic_test,y_cubic_train, y_cubic_test = train_test_split(x_cubic, y, test_size = 0.3, random_state = 0)
    model_lasso_3.fit(x_cubic_train,y_cubic_train)
    
    # 自由度調整済み決定係数
    # (決定係数, trainまたはtestのサンプル数, 利用した特徴量の数)
    def adjusted(score, n_sample, n_features):
        adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
        return adjusted_score
    print('[自由度調整済み決定係数]')
    #1次関数
    print('1次')
    print('train: %3f' % adjusted(model_lasso_1.score(x_train, y_train), len(y_train), 2))
    print('test : %3f' % adjusted(model_lasso_1.score(x_test, y_test), len(y_test), 2))
    # 2次関数
    print('2次')
    print('train: %.3f' % adjusted(model_lasso_2.score(x_quad_train, y_quad_train), len(y_quad_train), 3))
    print('test : %.3f' % adjusted(model_lasso_2.score(x_quad_test, y_quad_test), len(y_quad_test), 3))
    # 3次関数
    print('3次')
    print('train: %.3f' % adjusted(model_lasso_3.score(x_cubic_train, y_cubic_train), len(y_cubic_train), 3))
    print('test : %.3f' % adjusted(model_lasso_3.score(x_cubic_test, y_cubic_test), len(y_cubic_test), 3))
    print('')

    # MSEを出力する関数を読み込む
    from sklearn.metrics import mean_squared_error as mse
    print('[RMSE]')
    #1次関数
    print('1次')
    print('train: %.3f' % (mse(y_train, model_lasso_1.predict(x_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_test, model_lasso_1.predict(x_test)) ** (1/2)))
    #2次関数
    print('2次')
    print('train: %.3f' % (mse(y_quad_train, model_lasso_2.predict(x_quad_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_quad_test, model_lasso_2.predict(x_quad_test)) ** (1/2)))
    #3次関数
    print('3次')
    print('train: %.3f' % (mse(y_cubic_train, model_lasso_3.predict(x_cubic_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_cubic_test, model_lasso_3.predict(x_cubic_test)) ** (1/2)))

In [17]:
lasso(x_std,y)

[自由度調整済み決定係数]
1次
train: 0.778531
test : 0.728203
2次
train: 0.784
test : 0.728
3次
train: 0.783
test : 0.717

[RMSE]
1次
train: 3641.724
test : 4099.927
2次
train: 3598.425
test : 4100.613
3次
train: 3601.927
test : 4180.107


#### LGBM

In [18]:
def lgbm(x,y):
    global model
    import lightgbm as lgb
    from sklearn.model_selection import train_test_split
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
    lgb_train = lgb.Dataset(x_train,y_train)
    lgb_eval = lgb.Dataset(x_test,y_test,reference=lgb_train)
    lgbm_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves':10,
        'num_boost_round':100,
        'verbose_eval':5,
        'max_depth':0
        }
    model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval)
    # RMSE を計算する
    from sklearn.metrics import mean_squared_error as mse
    print('train : %.3f' % (mse(y_train, model.predict(x_train)) ** (1/2)))
    print('test : %.3f' % (mse(y_test, model.predict(x_test)) ** (1/2)))

In [19]:
lgbm(x_std,y)



[1]	valid_0's rmse: 7319.06
[2]	valid_0's rmse: 6834.94
[3]	valid_0's rmse: 6385.48
[4]	valid_0's rmse: 5996.85
[5]	valid_0's rmse: 5671.29
[6]	valid_0's rmse: 5380.57
[7]	valid_0's rmse: 5116.12
[8]	valid_0's rmse: 4905.42
[9]	valid_0's rmse: 4719.66
[10]	valid_0's rmse: 4570.36
[11]	valid_0's rmse: 4431.43
[12]	valid_0's rmse: 4324.14
[13]	valid_0's rmse: 4220.08
[14]	valid_0's rmse: 4114.55
[15]	valid_0's rmse: 4038.53
[16]	valid_0's rmse: 3967.53
[17]	valid_0's rmse: 3904.33
[18]	valid_0's rmse: 3856.77
[19]	valid_0's rmse: 3803.12
[20]	valid_0's rmse: 3757.29
[21]	valid_0's rmse: 3728.9
[22]	valid_0's rmse: 3696.19
[23]	valid_0's rmse: 3661.29
[24]	valid_0's rmse: 3622.07
[25]	valid_0's rmse: 3615.74
[26]	valid_0's rmse: 3592.09
[27]	valid_0's rmse: 3578.26
[28]	valid_0's rmse: 3558.55
[29]	valid_0's rmse: 3536.79
[30]	valid_0's rmse: 3515.41
[31]	valid_0's rmse: 3515.9
[32]	valid_0's rmse: 3501.97
[33]	valid_0's rmse: 3485.35
[34]	valid_0's rmse: 3467.75
[35]	valid_0's rmse: 3458

In [29]:
test_df= pd.read_csv('test_new.csv')
test_df_2 = preprocessing_2(test_df)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_test_std = ss.fit_transform(test_df_2)

In [None]:
##1次
submit(lr,test_df,x_test_std)

1次→RMSE：3,630.72572、順位426位/930人中

In [32]:
##Ridge提出(2次)
x_test_quad = quad.fit_transform(x_test_std)
submit(model_ridge_2,test_df,x_test_quad)

RMSE:3,922.85376、順位:621位/931人中

In [None]:
##Lasso提出(1次)
submit(model_lasso_1,test_df,x_test_std)

RMSE:4,125.57155 ,順位674位/931人中

In [None]:
##LGBM提出
submit(model,test_df,x_test_std)

RMSE:3,599.20752 ,順位404位/931人中

# ３．改善２

In [None]:
all_df_2.columns

In [None]:
lr.coef_

In [None]:
# ヒートマップの表示
plt.figure(figsize=(24, 18))
sns.heatmap(all_df_2.corr(), annot=True, square=True, fmt='.2f')
plt.show()

#### 多重共線性の可能性？似ている変数を減らす or 統合
①monthとmatch→どっちも削除
②tv_countとtv→tvのみ残す
#### できるだけ変数を減らす
①humidity,temperature,weather→削除
②home_score,away_score→削除
③year→削除
④timeとday→day:平日(0),土日祝日(1)
#### one-hotエンコーディングを利用してダミー変数化

In [33]:
#homeとawayそれぞれチームと順位を合わせたDataFrameを作る。
team_df = pd.read_csv('team_2.csv')
home_df = team_df.loc[:,["順位(J1)","順位(J2)","チーム","年","ステージ"]]
home_df.columns =["home_rank_j1","home_rank_j2","home","year","stage"]
away_df = team_df.loc[:,["順位(J1)","順位(J2)","チーム","年","ステージ"]]
away_df.columns =["away_rank_j1","away_rank_j2","away","year","stage"]
home_df = home_df.fillna(0)
away_df = away_df.fillna(0)
home_df.head()

Unnamed: 0,home_rank_j1,home_rank_j2,home,year,stage
0,1.0,0.0,ガンバ大阪,2014,Ｊ１
1,2.0,0.0,浦和レッズ,2014,Ｊ１
2,3.0,0.0,鹿島アントラーズ,2014,Ｊ１
3,4.0,0.0,柏レイソル,2014,Ｊ１
4,5.0,0.0,サガン鳥栖,2014,Ｊ１


In [65]:
def preprocessing_3(df):
    df_2 = df
    #名字と名前の間の空白をなくす
    for i in range(1,12):
        df_2["home_{:02}".format(i)] = df_2["home_{:02}".format(i)].apply(lambda x: x.replace("　",""))
        df_2["away_{:02}".format(i)] = df_2["away_{:02}".format(i)].apply(lambda x: x.replace("　",""))
    #home_01~11,away01~11→True,False
    for i in range(len(df_2)):
        for j in range(1,12):
            df_2.loc[i,"home_{:02}".format(j)] = df_2.loc[i,"home_{:02}".format(j)] in daihyo_list
            df_2.loc[i,"away_{:02}".format(j)] = df_2.loc[i,"away_{:02}".format(j)] in daihyo_list
    #daihyo
    df_2["daihyo"] = (df_2==True).sum(axis=1)
    #home_rank,away_rank,difference
    df_2 = pd.merge(df_2,home_df,on=["home","year","stage"],how="left")
    df_2 = pd.merge(df_2,away_df,on=["away","year","stage"],how="left")
    #attendance
    df_2 = pd.merge(df_2,attendance_df,on=["home","year"],how="left")
    #gameday → 月(month)と曜日(day)を抽出
    #       → 曜日(day)を月(0),火(1),水(2),木(3),金(4),土(5),日(6),祝休(7)でマッピング
    df_2['day'] = df_2['gameday'].apply(lambda x: x[:-1].rsplit('(', 1)[-1])
    day_mapping = {'月':0, '火':0, '水':0, '木':0,'金':0,'土':1,'日':1}
    df_2['day'] = df_2['day'].map(day_mapping)
    df_2['day'] = df_2["day"].fillna(1)
    df_2['month'] = df_2['gameday'].apply(lambda x: int(x[:2]))
    #time → 〇時(hour)の部分だけ抽出
    df_2['time'] = df_2['time'].apply(lambda x: int(x.split(':', 1)[0]))
    """
    #time&day→time_day
    for i in range(len(df_2)):
        if df_2.loc[i,'day'] == 0:
            if df_2.loc[i.'time'] < 17:
                df_2.loc[i,'time_day'] = 0
            else:
                df_2.loc[i,'time_day'] = 1
        else:
            if df_2.loc[i.'time'] < 17:
                df_2.loc[i,'time_day'] = 2
            else:
                df_2.loc[i,'time_day'] = 3
    """
    #tv → 'NHK総合'を含む(0),'NHK'を含む(1),'BS'を含む(2),その他(3)とする
    for i in range(len(df_2)):
        if 'ＮＨＫ総合' in df_2.loc[i,'tv']:
            df_2.loc[i,'tv']=0
        elif 'ＮＨＫ' in df_2.loc[i,'tv']:
            df_2.loc[i,'tv']=1
        elif  'ＢＳ' in df_2.loc[i,'tv']:
            df_2.loc[i,'tv']=1
        else:
            df_2.loc[i,'tv']=2
    #address → 都道府県でマッピング
    df_2['address'] = df_2['address'].apply(lambda x: x[:2])
    #stadium,stage → マッピング
    df_2 = pd.get_dummies(df_2, columns=["stage"])
    for column in ['address','stadium']:
        le = preprocessing.LabelEncoder()
        le.fit(df_2[column])
        df_2[column] = le.transform(df_2[column])
    #もう使わない変数、数値じゃない変数を削除
    df_2 = df_2.drop(['id','gameday', 'home', 'away','referee','home_team','home_01', 'home_02', 'home_03', 'home_04',
       'home_05', 'home_06', 'home_07', 'home_08', 'home_09', 'home_10',
       'home_11','away_team', 'away_01', 'away_02', 'away_03', 'away_04', 'away_05',
       'away_06', 'away_07', 'away_08', 'away_09', 'away_10', 'away_11','weather', 'humidity','match','year', 'time','home_score','away_score','temperature'],axis=1)#
       #,'address'],axis=1)
    return(df_2)

In [66]:
all_df= pd.read_csv('train_new.csv')
all_df_3 = preprocessing_3(all_df)
all_df_3.head()

Unnamed: 0,y,stadium,tv,address,capa,daihyo,home_rank_j1,home_rank_j2,away_rank_j1,away_rank_j2,attendance,day,month,stage_Ｊ１,stage_Ｊ２
0,18250,15,0,8,19694,2,2.0,0.0,11.0,0.0,16600.0,1.0,3,1,0
1,24316,48,1,17,40000,2,7.0,0.0,9.0,0.0,17155.0,1.0,3,1,0
2,17066,17,1,7,21000,4,17.0,0.0,16.0,0.0,14778.0,1.0,3,1,0
3,29603,3,1,14,50000,9,1.0,0.0,3.0,0.0,17721.0,1.0,3,1,0
4,25353,36,1,3,39232,3,18.0,0.0,12.0,0.0,12008.0,1.0,3,1,0


In [67]:
#外れ値処理
for i in range(len(all_df_3)):
    if all_df_3.loc[i,'y']==0 or all_df_3.loc[i,'y']>=50000:
        all_df_3=all_df_3.drop(i,axis=0)
        print(i)
all_df_3.index = np.arange(len(all_df_3))

298
784
1059
1066
1567
1654


In [68]:
y=pd.DataFrame(all_df_3, columns=['y'])
x=all_df_3.drop('y',axis=1)
#標準化
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_std = ss.fit_transform(x)

In [69]:
ridge(x_std,y)

[自由度調整済み決定係数]
1次
train: 0.754714
test : 0.722058
2次
train: 0.807
test : 0.762
[RMSE]
1次
train: 3832.543
test : 4146.017
2次
train: 3402.427
test : 3837.265


In [70]:
lgbm(x_std,y)



[1]	valid_0's rmse: 7321
[2]	valid_0's rmse: 6836.38
[3]	valid_0's rmse: 6378.61
[4]	valid_0's rmse: 5982.59
[5]	valid_0's rmse: 5669.7
[6]	valid_0's rmse: 5367.7
[7]	valid_0's rmse: 5119.5
[8]	valid_0's rmse: 4883.6
[9]	valid_0's rmse: 4687.84
[10]	valid_0's rmse: 4537.5
[11]	valid_0's rmse: 4405.85
[12]	valid_0's rmse: 4284.84
[13]	valid_0's rmse: 4183.27
[14]	valid_0's rmse: 4079.3
[15]	valid_0's rmse: 4017.91
[16]	valid_0's rmse: 3943.55
[17]	valid_0's rmse: 3880.76
[18]	valid_0's rmse: 3831.2
[19]	valid_0's rmse: 3780.76
[20]	valid_0's rmse: 3748.59
[21]	valid_0's rmse: 3707.36
[22]	valid_0's rmse: 3674.43
[23]	valid_0's rmse: 3653.78
[24]	valid_0's rmse: 3631.58
[25]	valid_0's rmse: 3602.9
[26]	valid_0's rmse: 3589.84
[27]	valid_0's rmse: 3571.24
[28]	valid_0's rmse: 3558.57
[29]	valid_0's rmse: 3550.11
[30]	valid_0's rmse: 3540.46
[31]	valid_0's rmse: 3529.32
[32]	valid_0's rmse: 3518.41
[33]	valid_0's rmse: 3505.85
[34]	valid_0's rmse: 3490.98
[35]	valid_0's rmse: 3479.46
[36]	

In [71]:
test_df= pd.read_csv('test_new.csv')
test_df_3 = preprocessing_3(test_df)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_test_std = ss.fit_transform(test_df_3)

In [72]:
##LGBM提出
submit(model,test_df,x_test_std)

RMSE:3,720.34445 ,順位486位/931人中

In [74]:
##Ridge提出(2次)
x_test_quad = quad.fit_transform(x_test_std)
submit(model_ridge,test_df,x_test_std)

RMSE:4,268.74044 , 順位684位/931人