# ライブラリのインポート

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
import random
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# 変数の設定

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    # Run identification.
    COMPETITION = "atmacup17"
    AUTHOR = "takaito"
    VER = 1

    # Directory the CSV files are read from (no "atmacup17" subfolder).
    DATA_PATH = Path("/kaggle/input")

    # Seed for the random generators and fold count for cross-validation.
    SEED = 42
    N_SPLIT = 3

    # Target column; the class count is presumably the number of distinct
    # target labels -- TODO confirm against the data.
    TARGET_COL = "Recommended IND"
    TARGET_COL_CLASS_NUM = 2

# 乱数の設定

In [None]:
def seed_everything(seed):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed`` and,
            converted to a string, stored in ``PYTHONHASHSEED``.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so setting it
        here is visible to child processes rather than changing string
        hashing in the already-running interpreter.
    """
    # Same order as before: stdlib generator first, then NumPy's legacy one.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = f"{seed}"
# Seed the generators once, up front, with the notebook-wide seed.
seed_everything(CFG.SEED)

# データの読み込み

In [None]:
# Read the clothing master table and the train/test tables from CFG.DATA_PATH
# (same read order as the names on the left-hand side).
clothing_master_df, train_df, test_df = (
    pd.read_csv(CFG.DATA_PATH / csv_name)
    for csv_name in ("clothing_master.csv", "train.csv", "test.csv")
)

# 読み込んだデータの確認

In [None]:
# Preview the first five rows of the training table.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test table.
test_df.head(n=5)

In [None]:
# Preview the first five rows of the clothing master table.
clothing_master_df.head(n=5)

# データの加工

In [None]:
# Attach the clothing master columns to both tables by "Clothing ID".
# A left join keeps every row of the original train/test tables.
train_df = pd.merge(train_df, clothing_master_df, how="left", on="Clothing ID")
test_df = pd.merge(test_df, clothing_master_df, how="left", on="Clothing ID")

In [None]:
# Preview the first five rows of the test table after the merge.
test_df.head(n=5)

# 簡単な分析

In [None]:
# Report, for every training column, whether the test table has it too.
train_cols = train_df.columns
test_cols = test_df.columns
for col in train_cols:
    origin = "train&test:" if col in test_cols else "train only:"
    print(origin, col)

In [None]:
# Columns whose train/test distributions are compared with KDE curves.
numerical_features = [
    "Age",
    "Positive Feedback Count",
]
# Columns whose train/test value sets are compared with Venn diagrams.
# "Title" and "Review Text" look like free text rather than categories;
# they are listed here for the overlap check -- TODO confirm intent.
categorical_features = [
    "Clothing ID",
    "Division Name",
    "Department Name",
    "Class Name",
    "Title",
    "Review Text",
]

In [None]:
# Overlay the train and test kernel density estimates of each numerical
# feature, one figure per feature.
for feature in numerical_features:
    fig, ax = plt.subplots()
    ax.set_title(feature)
    for split_name, frame in (("train", train_df), ("test", test_df)):
        frame[feature].plot.kde(ax=ax, label=split_name)
    ax.legend()
    plt.show()
    # Release the figure so the next feature starts from a clean state.
    plt.close("all")

In [None]:
# Draw, for each feature, a Venn diagram of the distinct values that occur
# in the training table versus the test table.
for feature in categorical_features:
    plt.title(feature)
    # Name the circles explicitly; without set_labels venn2 falls back to
    # the generic "A" / "B", which hides which side is train and which is test.
    venn2(
        [set(train_df[feature]), set(test_df[feature])],
        set_labels=("train", "test"),
    )
    # No plt.legend() here: the diagram has no labelled artists, so a legend
    # call would only add an empty legend.
    plt.show()
    plt.close("all")

# 交差検証

In [None]:
# Build shuffled, stratified folds on the target column and preview each split.
kfold = StratifiedKFold(n_splits=CFG.N_SPLIT, shuffle=True, random_state=CFG.SEED)
target = train_df[CFG.TARGET_COL]
# `fold` is the zero-based fold number; it is not used inside the loop body.
for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, target)):
    # First ten positional row indices on each side of the split.
    print("train: ", train_index[:10])
    print("valid: ", valid_index[:10])
    # Class counts of the validation part, to eyeball the stratification.
    valid_counts = target.iloc[valid_index].value_counts()
    print(dict(valid_counts))