In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [78]:
def load_data():
    data = pd.read_pickle('data.pkl')
    nomination_onehot = pd.read_pickle('nomination_onehot.pkl')
    selected_performers_onehot = pd.read_pickle('selected_performers_onehot.pkl')
    selected_directors_onehot = pd.read_pickle('selected_directors_onehot.pkl')
    selected_studio_onehot = pd.read_pickle('selected_studio_onehot.pkl')
    selected_scriptwriter_onehot = pd.read_pickle('selected_scriptwriter_onehot.pkl')
    review_dataframe = pd.read_pickle('review_dataframe.pkl')

    # selected_directors_onehotとselected_scriptwriter_onehotの重複した人
    duplicate_scriptwriter = set(selected_directors_onehot.columns) & set(selected_scriptwriter_onehot.columns)
    selected_scriptwriter_onehot = selected_scriptwriter_onehot.drop(duplicate_scriptwriter, axis=1)
          
    df = pd.concat(
        [
            nomination_onehot, 
            selected_performers_onehot,
            selected_directors_onehot,
            selected_studio_onehot,
            selected_scriptwriter_onehot,
            data["screen_time"],
        ],
        axis=1
    )
    # 共線性の高いカラムを除く
    drop_clm = ['吉田一夫']
    df = df.drop(drop_clm, axis=1)
    
    # 取得できなかった上映時間(screen_time == -1)を平均で埋める
        # df[df["screen_time"] == -1] = df.mean().screen_time
    df["screen_time"] = df["screen_time"].replace(-1, df["screen_time"].mean())
    
    # データセットとして扱うのに必要なyear, prizeのフラグを付与する
    df = pd.concat(
        [df, data["year"], data["prize"]], axis=1
    )
    
    return df

In [81]:
df = load_data()
for year in range(1978, 2020):
    
    scaler = StandardScaler()
    
    x_columns = df.drop(["year", "prize"], axis=1).columns
    
    train_x = df[df["year"] != year].drop(["year", "prize"], axis=1).values
    test_x = df[df["year"] == year].drop(["year", "prize"], axis=1).values
    train_y_df = df[df["year"] != year]["prize"]
    test_y_df = df[df["year"] == year]["prize"]
    
    scaler.fit(train_x)
    std_train_x = scaler.transform(train_x)
    std_test_x = scaler.transform(test_x)
    
    std_train_x_df = pd.DataFrame(std_train_x, columns=x_columns)
    std_test_x_df = pd.DataFrame(std_test_x, columns=x_columns)
    
    # インデックスの調整
    std_train_x_df.index.name = 'id'
    std_test_x_df.index.name = 'id'
    std_train_x_df.index += 1
    std_test_x_df.index += 1
    
    # pickleで保存
    base_path = "../std_data/"
    std_train_x_df.to_pickle(base_path  + "train/{}_x.pkl".format(str(year)))
    std_test_x_df.to_pickle(base_path + "test/{}_x.pkl".format(str(year)))
    train_y_df.to_pickle(base_path + "train/{}_y.pkl".format(str(year)))
    test_y_df.to_pickle(base_path + "test/{}_y.pkl".format(str(year)))

In [80]:
# save pandas.DataFrame as pickle
std_train_x_df

Unnamed: 0_level_0,blue_ribbon_award,golden_gross,hochi_eigashou,kinejun_best_ten,mainichi_film_award,nikkan_sports,中井貴一,丹波哲郎,井川比佐志,仲代達矢,...,李相日,松田寛夫,澤井信一郎,田中陽造,神波史男,筒井ともみ,荒井晴彦,那須真知子,鄭義信,screen_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.820731,-0.201008,2.574807,-0.371647,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,-1.143355
2,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,2.229671
3,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,-0.061441
4,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,4.020779,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,2.738807
5,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,-0.570577
6,-0.354518,-0.201008,-0.388379,-0.371647,2.631174,-0.363137,-0.259533,4.020779,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,0.765905
7,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,-1.143355
8,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,-1.016071
9,2.820731,-0.201008,2.574807,2.690725,-0.380058,-0.363137,-0.259533,-0.248708,-0.290191,-0.237508,...,-0.121566,-0.157720,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,-1.461565
10,-0.354518,-0.201008,-0.388379,-0.371647,-0.380058,-0.363137,-0.259533,4.020779,-0.290191,-0.237508,...,-0.121566,6.340347,-0.121566,-0.121566,-0.121566,-0.14072,-0.121566,-0.099015,-0.121566,0.256769


id
1      1978
2      1978
3      1978
4      1978
5      1978
       ... 
207    2019
208    2019
209    2019
210    2019
211    2019
Name: year, Length: 211, dtype: int64

In [69]:
unko = pd.read_pickle("../../data/dataframes/std_data/train/1978_y.pkl")
unko

id
6      1
7      0
8      0
9      0
10     0
      ..
207    1
208    0
209    0
210    0
211    0
Name: prize, Length: 206, dtype: int64