In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean, variance
from scipy import stats
import copy

外れ値処理

In [5]:
# ホテリング理論の計算を関数化
def calc_anomaly_scores(data, alpha=0.99):
#     print("=== calc_anomaly_scores ===")
#     print(data)
    # 標本平均
    data_mean = mean(data)
    # 標本分散
    data_variance = variance(data)
    
    # 異常度
    anomaly_scores = []
    for x in data:
        anomaly_score = (x - data_mean)**2 / data_variance
        anomaly_scores.append(anomaly_score)

    # カイ二乗分布による閾値
    threshold = stats.chi2.interval(alpha, 1)[1]
#     print("threshold={}, anomaly_scores={}".format(threshold_, anomaly_scores))
    return threshold, anomaly_scores

In [6]:
def specify_anomaly_value(data, threshold, anomaly_scores):
#     print("=== specify_anomaly_value ===")
    nomaly_data = []
    anomaly_data = []
    for i in range(len(data)):
        # 閾値以下のデータを抽出
        if anomaly_scores[i] <= threshold:
            nomaly_data.append(data[i])
            
        # 異常値を保存
        else:
            anomaly_data.append(data[i])
            
    # 閾値以下のデータの平均値
    # mean_ = mean(nomaly_data)
    # 単純な平均
    mean_ = mean(data)
#     print("threshold={}, mean_={}, anomaly_data={}".format(threshold, mean_, anomaly_data))
    return mean_, anomaly_data

In [7]:
# df_:対象データフレーム y:品種Y　column:置換対象の要素　anomaly_data:異常値リスト nomaly_data_mean:置換する平均値
def replace_df_anomaly_data(df, y, column, anomaly_data, nomaly_data_mean):
    # 引数の品種ならばTrue データの数分True,Falseが生成
    mask1 = df['Y'] == y
    # 異常値リストに含まれていればTrue データの数分True,Falseが生成
    mask2 = df[column].isin(anomaly_data)
    # 特定品種かつ異常値リストに含まれている場合のみTrue
    mask = np.logical_and(mask1, mask2)
    # where(条件, 条件がfalseの場合置換する, inplace:直接編集する)
    # 条件がfalseの場合置換するので、反転する
    df[column].where(np.logical_not(mask), nomaly_data_mean, inplace=True)

In [8]:
def main_anomaly_values(_df_train):
    # 特徴変数
    X = _df_train.iloc[:,2:]
    # 目的変数
    Y = _df_train['Y']
    # 品種の種類を抽出
    np_unique_y = np.unique(Y.values)
    # 品種単位にデータを分割　K:Y V:DataFrame
    df_y_dict = {}
    for y in np_unique_y:
        df_y_dict[y] = _df_train.query('Y == ' + str(y))
    
    for y in np_unique_y:
        df_tmp = df_y_dict[y]
        for column in X.columns.values:
            # 特定品種の特定要素を抽出
            np_y_item = df_tmp[column].values
            # 閾値と異常度を計算 alpha=0.99では0.97753 
            threshold, anomaly_scores = calc_anomaly_scores(np_y_item, alpha=0.95)
            # 平均値と置換対象を計算
            nomaly_data_mean, anomaly_data = specify_anomaly_value(np_y_item, threshold, anomaly_scores)
        
#             print("Y={}, column={}, threshold={}, anomaly_data={}, mean={}".format(y, column,threshold, anomaly_data, nomaly_data_mean))
            # 品種、要素が一致する中から異常値を平均値に置換する
            replace_df_anomaly_data(_df_train, y, column, anomaly_data, nomaly_data_mean)
    
    return _df_train

ランダムフォレスト

In [15]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 一連の流れを関数化
def random_forest_classifier_fit_score(x, y, n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # 学習用とテスト用に分割
    X_tr, X_ts, y_tr, y_ts = train_test_split(x, y, test_size = 0.5)
    # ランダムフォレスト作成
    clf_rf_func = RandomForestClassifier(n_estimators=n_estimators, 
                                         random_state=1, 
                                         max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         n_jobs=-1)
    # 学習
    clf_rf_func.fit(X=X_tr, y=y_tr)
    # 評価
    score = clf_rf_func.score(X=X_ts, y=y_ts)
#     print("RandomForestClassifier.score=" + str(score))
    
    # 重要度を確認
    imp = pd.DataFrame(x.columns, columns=['label'])
    imp['importrances'] = clf_rf_func.feature_importances_
    imp = imp.sort_values('importrances', ascending=False)
#     print(imp)
    
    return clf_rf_func, score, imp

In [10]:
import itertools
def calc_random_forest_classifier(df_):
    # 特徴変数
    X_ = df_.iloc[:,2:]
    # 目的変数
    Y_ = df_['Y']

    # 重要度が低い項目を削除
#     for l in ['Nonflavanoid phenols','Proanthocyanins','Magnesium','Ash','Total phenols','Malic acid','Alcalinity of ash']:
#         del X_[l]
        
    n_estimators_list = [100, 500, 1000, 1500, 2000]
    max_depth_list = [4, 5, 6, 7, 8, 9, 10]
    min_samples_split_list = [2, 4, 8]
    min_samples_leaf_list = [1, 2, 3, 4, 5]
    clf_list = []
    score_list = []
    imp_list = []
    count = 0
    
    for n_estimators, max_depth, min_samples_split, min_samples_leaf in itertools.product(n_estimators_list, max_depth_list, min_samples_split_list, min_samples_leaf_list):
        # 学習と評価
        clf, score, imp = random_forest_classifier_fit_score(X_, Y_, n_estimators, max_depth, min_samples_split, min_samples_leaf)
        # 結果を保存
        clf_list.append(clf)
        score_list.append(score)
        imp_list.append(imp)
        print("[{}]n_estimators={}, max_depth={}, min_samples_split={}, min_samples_leaf={}, score={}".format(count,n_estimators, max_depth, min_samples_split, min_samples_leaf, score))
        count = count + 1
    
    return clf_list, score_list, imp_list

In [11]:
df_train = pd.read_csv('train.tsv',delimiter='\t' )
df_train = main_anomaly_values(df_train)
clf_list, score_list, imp_list = calc_random_forest_classifier(df_train)



[0]n_estimators=100, max_depth=4, min_samples_split=2, min_samples_leaf=1, score=0.9555555555555556
[1]n_estimators=100, max_depth=4, min_samples_split=2, min_samples_leaf=2, score=0.9555555555555556
[2]n_estimators=100, max_depth=4, min_samples_split=2, min_samples_leaf=3, score=0.9777777777777777
[3]n_estimators=100, max_depth=4, min_samples_split=2, min_samples_leaf=4, score=1.0
[4]n_estimators=100, max_depth=4, min_samples_split=2, min_samples_leaf=5, score=0.9333333333333333
[5]n_estimators=100, max_depth=4, min_samples_split=4, min_samples_leaf=1, score=0.9555555555555556
[6]n_estimators=100, max_depth=4, min_samples_split=4, min_samples_leaf=2, score=0.9555555555555556
[7]n_estimators=100, max_depth=4, min_samples_split=4, min_samples_leaf=3, score=0.9111111111111111
[8]n_estimators=100, max_depth=4, min_samples_split=4, min_samples_leaf=4, score=0.9777777777777777
[9]n_estimators=100, max_depth=4, min_samples_split=4, min_samples_leaf=5, score=0.9333333333333333
[10]n_estimator

In [25]:
# result6の結果 0.98876
ans_old=np.array([1, 3, 2, 2, 1, 1, 2, 2, 1, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 2, 3, 2, 
                  2, 2, 2, 2, 3, 3, 2, 3, 1, 3, 3, 1, 1, 2, 3, 2, 3, 2, 3, 3, 2, 1, 3,
                  1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 3, 3, 3, 1, 2, 1, 1, 1, 2, 1, 1, 1,
                  1, 3, 2, 3, 1, 1, 2, 3, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 2, 3, 2, 2])

In [26]:
# テスト
df_test = pd.read_csv('test.tsv',delimiter='\t' )
# 特徴変数
test_X = df_test.iloc[:,1:]

In [27]:
for i in range(len(clf_list)):
    clf_tmp = clf_list[i]
    ans_tmp = clf_tmp.predict(X=test_X)
    
    diff_list = []
    for j in range(len(ans_tmp)):
        if (ans_tmp[j] != ans_old[j]):
            diff_list.append(j)
    
    
    print("clf_{} score={}, diff:{}".format(i, score_list[i], diff_list))

clf_0 score=0.9555555555555556, diff:[70, 85]
clf_1 score=0.9555555555555556, diff:[34, 85]
clf_2 score=0.9777777777777777, diff:[85]
clf_3 score=1.0, diff:[17]
clf_4 score=0.9333333333333333, diff:[34, 47, 85]
clf_5 score=0.9555555555555556, diff:[]
clf_6 score=0.9555555555555556, diff:[17, 33, 85]
clf_7 score=0.9111111111111111, diff:[33, 34, 72, 80]
clf_8 score=0.9777777777777777, diff:[]
clf_9 score=0.9333333333333333, diff:[17, 33, 34, 72, 80]
clf_10 score=0.9777777777777777, diff:[17, 33, 85]
clf_11 score=0.9555555555555556, diff:[17, 33, 34]
clf_12 score=0.9777777777777777, diff:[71, 85]
clf_13 score=0.9555555555555556, diff:[85]
clf_14 score=1.0, diff:[17, 34, 72]
clf_15 score=1.0, diff:[]
clf_16 score=1.0, diff:[]
clf_17 score=0.9555555555555556, diff:[17, 34, 72]
clf_18 score=0.9777777777777777, diff:[17, 34, 47, 85]
clf_19 score=0.9777777777777777, diff:[]
clf_20 score=0.9111111111111111, diff:[71, 85]
clf_21 score=0.9777777777777777, diff:[]
clf_22 score=1.0, diff:[34, 70]


提出用モデルを選ぶ  
一応評価1.00000にはなったけど…

In [28]:
result_clf = clf_list[75]
ans = result_clf.predict(X=test_X)
ans

array([1, 3, 2, 2, 1, 1, 2, 2, 1, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 2, 3, 2,
       2, 2, 2, 2, 3, 3, 2, 3, 1, 3, 3, 1, 1, 2, 3, 2, 3, 2, 3, 3, 2, 1,
       3, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 3, 3, 3, 1, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 3, 2, 3, 2, 1, 2, 3, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 2, 3, 2,
       2], dtype=int64)

In [29]:
result_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [30]:
# 提出用に加工
np_id = df_test['id'].values
dd=pd.DataFrame({"id":np_id, "ans":ans})
dd.to_csv("result09.csv",header=False,index=False)
