# Clustering

In [75]:
# Ignore warnings: installs a blanket 'ignore' filter for every warning category
import warnings
warnings.filterwarnings('ignore')

In [55]:
import pickle
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE


# User defined functions
from self_lib import tips
from self_lib import doggie_tail as d_

%matplotlib inline

In [56]:
# Display limits: show up to 100 columns and up to 100 rows of a frame
pd.set_option(
    'display.max_columns', 100,
    'display.max_rows', 100,
)

In [57]:
# Feature generation: fit the KMeans models on the training data
def get_train_features(data,dic_features,filepath):
    """Fit one KMeans model per entry of ``dic_features`` and pickle the results.

    Parameters
    ----------
    data :
        Object indexable by a list of column names (used as ``data[columns]``),
        e.g. a pandas DataFrame.
    dic_features : dict
        Maps a feature-set name to a list whose LAST element is the number of
        clusters and whose preceding elements are the column names::

            {"key": ["feature1", "feature2", n_clusters]}

    filepath : str
        Output location. It is concatenated as-is with the file names, so it
        must end with a path separator, and ``<filepath>/kmeans/`` must
        already exist.

    Side effects
    ------------
    * Prints a timestamp and the feature set being clustered.
    * Writes the fitted model to ``<filepath>/kmeans/kmeans_<key>.pickle``.
    * Writes the training cluster labels to ``<filepath>train_features.pickle``.

    Notes
    -----
    The labels file has the same name for every key, so when ``dic_features``
    holds several entries only the labels of the last one remain on disk.
    NOTE(review): the columns are clustered without scaling although the
    notebook imports StandardScaler -- confirm this is intended.
    """
    seed=1  # fixed random_state keeps the clustering reproducible

    for featureName, spec in dic_features.items():
        # Everything but the last element is a column name; the last is k.
        *l_features, n_clusters = spec

        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print("  > " + featureName)
        print("  >> " + str(l_features) + " clustering to " + str(n_clusters))

        # Select the columns to cluster on
        features_X = data[l_features]

        # Fit once and take the labels from that same fit. The previous code
        # called fit() and then fit_predict(), training the model twice for
        # an identical (seeded) result.
        kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
        clusters = kmeans.fit_predict(features_X)

        # Save the fitted model so the other splits can reuse it
        with open(filepath + "/kmeans/kmeans_" + featureName + '.pickle', mode='wb') as f:
            pickle.dump(kmeans, f)

        # Save the labels (overwritten on every iteration, see Notes)
        with open(filepath + 'train_features.pickle', mode='wb') as f:
            pickle.dump(clusters, f)

    print("--- END ---")

In [58]:
# Generate features with the already-fitted models (validation split)
def get_valid_features(data,dic_features,filepath):
    """Assign validation rows to the clusters of the saved KMeans models.

    Parameters
    ----------
    data :
        Object indexable by a list of column names (used as ``data[columns]``),
        e.g. a pandas DataFrame.
    dic_features : dict
        Maps a feature-set name to ``[column, ..., n_clusters]``. Only the
        column names are used here; the trailing cluster count is ignored
        because the model is loaded, not created.
    filepath : str
        Location prefix, concatenated as-is (must end with a path separator).
        Models are read from ``<filepath>/kmeans/kmeans_<key>.pickle``.

    Side effects
    ------------
    Writes the predicted labels to ``<filepath>valid_features.pickle``. The
    file name is the same for every key, so with several entries only the
    last one remains on disk.

    Notes
    -----
    The loaded model is only queried with ``predict``. The previous code
    called ``fit_predict``, which re-fitted the model on ``data`` and so
    produced cluster ids unrelated to the ones learned on the training data.
    Because nothing is re-fitted, not every cluster id has to occur in the
    result.
    """
    for featureName, spec in dic_features.items():
        *l_features, _n_clusters = spec

        # Unpickling executes code: only load model files produced by this
        # project.
        with open(filepath + "/kmeans/kmeans_" + featureName + ".pickle", mode='rb') as f:
            kmeans = pickle.load(f)

        features_X = data[l_features]
        clusters = kmeans.predict(features_X)

        # Save the labels
        with open(filepath + 'valid_features.pickle', mode='wb') as f:
            pickle.dump(clusters, f)

In [42]:
# Generate features with the already-fitted models (test split)
def get_test_features(data,dic_features,filepath):
    """Assign test rows to the clusters of the saved KMeans models.

    Parameters
    ----------
    data :
        Object indexable by a list of column names (used as ``data[columns]``),
        e.g. a pandas DataFrame.
    dic_features : dict
        Maps a feature-set name to ``[column, ..., n_clusters]``. Only the
        column names are used here; the trailing cluster count is ignored
        because the model is loaded, not created.
    filepath : str
        Location prefix, concatenated as-is (must end with a path separator).
        Models are read from ``<filepath>/kmeans/kmeans_<key>.pickle``.

    Side effects
    ------------
    Writes the predicted labels to ``<filepath>test_features.pickle``. The
    file name is the same for every key, so with several entries only the
    last one remains on disk.

    Notes
    -----
    The loaded model is only queried with ``predict``. The previous code
    called ``fit_predict``, which re-fitted the model on ``data`` and so
    produced cluster ids unrelated to the ones learned on the training data.
    Because nothing is re-fitted, not every cluster id has to occur in the
    result.
    """
    for featureName, spec in dic_features.items():
        *l_features, _n_clusters = spec

        # Unpickling executes code: only load model files produced by this
        # project.
        with open(filepath + "/kmeans/kmeans_" + featureName + ".pickle", mode='rb') as f:
            kmeans = pickle.load(f)

        features_X = data[l_features]
        clusters = kmeans.predict(features_X)

        # Save the labels
        with open(filepath + 'test_features.pickle', mode='wb') as f:
            pickle.dump(clusters, f)

In [59]:
# Generate features with the already-fitted models (submission data)
def get_submit_features(data,dic_features,filepath):
    """Assign submission rows to the clusters of the saved KMeans models.

    Parameters
    ----------
    data :
        Object indexable by a list of column names (used as ``data[columns]``),
        e.g. a pandas DataFrame.
    dic_features : dict
        Maps a feature-set name to ``[column, ..., n_clusters]``. Only the
        column names are used here; the trailing cluster count is ignored
        because the model is loaded, not created.
    filepath : str
        Location prefix, concatenated as-is (must end with a path separator).
        Models are read from ``<filepath>/kmeans/kmeans_<key>.pickle``.

    Side effects
    ------------
    Writes the predicted labels to ``<filepath>submit_features.pickle``. The
    file name is the same for every key, so with several entries only the
    last one remains on disk.

    Notes
    -----
    The loaded model is only queried with ``predict``. The previous code
    called ``fit_predict``, which re-fitted the model on ``data`` and so
    produced cluster ids unrelated to the ones learned on the training data.
    Because nothing is re-fitted, not every cluster id has to occur in the
    result.
    """
    for featureName, spec in dic_features.items():
        *l_features, _n_clusters = spec

        # Unpickling executes code: only load model files produced by this
        # project.
        with open(filepath + "/kmeans/kmeans_" + featureName + ".pickle", mode='rb') as f:
            kmeans = pickle.load(f)

        features_X = data[l_features]
        clusters = kmeans.predict(features_X)

        # Save the labels
        with open(filepath + 'submit_features.pickle', mode='wb') as f:
            pickle.dump(clusters, f)

In [44]:
# # 特徴量を作りたい組み合わせ
# dic_features={
#     "Zip50":["Zip",50],
#     "Employees3":["Employees",3],
#     "FranchiseCode20":["FranchiseCode",20],
#     "LoanAmount_GuaranteedLoan3":["LoanAmount","GuaranteedLoan",10],
#     "SystemCode50":["SystemCode",50],
#     "Term_Approval_Y50":["Term","Approval_Y",50],
#     "Term_LoanAmount50":["Term","LoanAmount",50],
#     "Term100":["Term",100],
#     "Zip_UrbanRural10":["Zip","UrbanRural",10],
#     "Term_SystemCode":["SystemCode","Term",50],
#     "SystemCode_Emp_ExN_CrJ_ReJ":["Employees","ExistNew","CreateJob","RetainedJob",50],
#     "SystemCode_FranchiseCode":["SystemCode","FranchiseCode",50]
# }

In [60]:
# Feature combinations to cluster: {name: [column, ..., n_clusters]}
dic_features = {
    "main_feature": [
        # loan terms, amounts and dates
        'Term', 'Employees', 'DisbursementGross', 'LoanAmount', 'GuaranteedLoan',
        'Approval_Y', 'Approval_M', 'Approval_D',
        'DisbursementDate_Y', 'DisbursementDate_M', 'DisbursementDate_D',
        'Term_12', 'Term_6', 'Term_3', 'Dis_App_Month',
        'ReturnCnt', 'ReturnAmountPerMonth', 'ReturnAmountPerCnt',
        'GuaranteedAmountRatio', 'ProperAmountRatio', 'dif_Bank-Grt', 'a_rate',
        # zip_<A..E>_<1..9|nan> indicator columns, grouped by letter
        *(
            f"zip_{letter}_{suffix}"
            for letter in "ABCDE"
            for suffix in (*"123456789", "nan")
        ),
        # number of clusters
        50,
    ],
}

In [61]:
#### Generate the train cluster features
# Load the input data (training split)
data =pd.read_pickle("../data/feature/train_X.pickle")

# Output location; ends with '/' because the functions concatenate file names onto it
filepath='../data/feature/cluster_features/'
featureName="train_feature"  # NOTE(review): not read by any visible cell -- confirm it can go

# Fits the models and writes <filepath>/kmeans/kmeans_<key>.pickle and <filepath>train_features.pickle
get_train_features(data,dic_features,filepath)

2021-01-27 00:45:12
  > main_feature
  >> ['Term', 'Employees', 'DisbursementGross', 'LoanAmount', 'GuaranteedLoan', 'Approval_Y', 'Approval_M', 'Approval_D', 'DisbursementDate_Y', 'DisbursementDate_M', 'DisbursementDate_D', 'Term_12', 'Term_6', 'Term_3', 'Dis_App_Month', 'ReturnCnt', 'ReturnAmountPerMonth', 'ReturnAmountPerCnt', 'GuaranteedAmountRatio', 'ProperAmountRatio', 'dif_Bank-Grt', 'a_rate', 'zip_A_1', 'zip_A_2', 'zip_A_3', 'zip_A_4', 'zip_A_5', 'zip_A_6', 'zip_A_7', 'zip_A_8', 'zip_A_9', 'zip_A_nan', 'zip_B_1', 'zip_B_2', 'zip_B_3', 'zip_B_4', 'zip_B_5', 'zip_B_6', 'zip_B_7', 'zip_B_8', 'zip_B_9', 'zip_B_nan', 'zip_C_1', 'zip_C_2', 'zip_C_3', 'zip_C_4', 'zip_C_5', 'zip_C_6', 'zip_C_7', 'zip_C_8', 'zip_C_9', 'zip_C_nan', 'zip_D_1', 'zip_D_2', 'zip_D_3', 'zip_D_4', 'zip_D_5', 'zip_D_6', 'zip_D_7', 'zip_D_8', 'zip_D_9', 'zip_D_nan', 'zip_E_1', 'zip_E_2', 'zip_E_3', 'zip_E_4', 'zip_E_5', 'zip_E_6', 'zip_E_7', 'zip_E_8', 'zip_E_9', 'zip_E_nan'] clustering to 50
--- END ---


In [62]:
# Read the saved train cluster labels, turn them into a categorical series and
# one-hot encode them (one column per observed label plus a NaN column).
feature_path ="../data/feature/cluster_features/"
feature_data=pd.get_dummies(
    pd.Series(
        pd.read_pickle(feature_path + "train_features.pickle"),
        name="main_features",
    ).astype('category'),
    dummy_na=True,
    prefix="main_feature",
    prefix_sep='_',
    drop_first=False,
)

# The encoded frame replaces the raw labels under the same file name.
feature_data.to_pickle(feature_path + "train_features.pickle")

In [63]:
# valid: load the validation split and label it with the saved models
# (writes <filepath>valid_features.pickle; `filepath` comes from the train cell)
data=pd.read_pickle("../data/feature/valid_X.pickle")
get_valid_features(data,dic_features,filepath)

In [64]:
# Read the cluster labels saved for the validation split
feature_path ="../data/feature/cluster_features/"
cluster_ids=pd.read_pickle(feature_path + "valid_features.pickle")

# Make the labels categorical over the FULL id range 0..n_clusters-1.
# Inferring the categories from the labels present in this split (what
# astype('category') did) drops the dummy column of any cluster that has no
# validation row, so the frame could end up with fewer columns than train.
n_clusters=dic_features["main_feature"][-1]
feature_data=pd.Series(pd.Categorical(cluster_ids, categories=range(n_clusters)),name="main_features")

# One-hot encode: one column per cluster id plus a NaN column
feature_data=pd.get_dummies(feature_data, dummy_na=True,prefix="main_feature",prefix_sep='_',drop_first=False)

# NOTE: the encoded frame overwrites the raw labels it was built from, so this
# cell should not be re-run without first regenerating valid_features.pickle.
feature_data.to_pickle(feature_path + "valid_features.pickle")

In [65]:
# # test
# data=pd.read_pickle("../data/feature/test_X.pickle")
# get_test_features(data,dic_features,filepath)

In [66]:
# # 生成データ読み取り
# feature_path ="../data/feature/cluster_features/"
# feature_data=pd.Series(pd.read_pickle(feature_path + "test_features.pickle"),name="main_features")

# # カテゴリー変数化
# feature_data=feature_data.astype('category')

# # ダミー変数化
# feature_data=pd.get_dummies(feature_data, dummy_na=True,prefix="main_feature",prefix_sep='_',drop_first=False)
# feature_data.to_pickle(feature_path + "test_features.pickle")

In [67]:
# submit: load the submission data and label it with the saved models
# (writes <filepath>submit_features.pickle; `filepath` comes from the train cell)
data=pd.read_pickle('../data/feature/test_feature_data_X.pickle')
get_submit_features(data,dic_features,filepath)

In [68]:
# Read the cluster labels saved for the submission data
feature_path ="../data/feature/cluster_features/"
cluster_ids=pd.read_pickle(feature_path + "submit_features.pickle")

# Make the labels categorical over the FULL id range 0..n_clusters-1.
# Inferring the categories from the labels present in this data (what
# astype('category') did) drops the dummy column of any cluster that has no
# submission row, so the frame could end up with fewer columns than train.
n_clusters=dic_features["main_feature"][-1]
feature_data=pd.Series(pd.Categorical(cluster_ids, categories=range(n_clusters)),name="main_features")

# One-hot encode: one column per cluster id plus a NaN column
feature_data=pd.get_dummies(feature_data, dummy_na=True,prefix="main_feature",prefix_sep='_',drop_first=False)

# NOTE: the encoded frame overwrites the raw labels it was built from, so this
# cell should not be re-run without first regenerating submit_features.pickle.
feature_data.to_pickle(feature_path + "submit_features.pickle")

In [96]:
# # 生成データ読み取り
# Employees3 =pd.Series(pd.read_pickle(filepath + "Employees3.pickle"),name="Employees3")
# FranchiseCode10 =pd.Series(pd.read_pickle(filepath + "FranchiseCode10.pickle"),name="FranchiseCode10" )
# # LoanAmount_GuaranteedLoan3 =pd.Series(pd.read_pickle(filepath + "LoanAmount_GuaranteedLoan3.pickle"),name="LoanAmount_GuaranteedLoan3" )
# SystemCode10 =pd.Series(pd.read_pickle(filepath + "SystemCode10.pickle"),name="SystemCode10")
# # Term_Approval_Y20 =pd.Series(pd.read_pickle(filepath + "Term_Approval_Y20.pickle"),name="Term_Approval_Y20")
# Term_LoanAmount20 =pd.Series(pd.read_pickle(filepath + "Term_LoanAmount20.pickle"),name="Term_LoanAmount20")
# Term10 = pd.Series(pd.read_pickle(filepath + "Term10.pickle"),name="Term10")
# Zip_UrbanRural10 = pd.Series(pd.read_pickle(filepath + "Zip_UrbanRural10.pickle"),name="Zip_UrbanRural10")
# Zip50 =pd.Series(pd.read_pickle(filepath + "Zip50.pickle"),name="Zip50")
# SystemCode_Emp_ExN_CrJ_ReJ =pd.Series(pd.read_pickle(filepath + "SystemCode_Emp_ExN_CrJ_ReJ.pickle"),name="SystemCode_Emp_ExN_CrJ_ReJ")
# SystemCode_FranchiseCode =pd.Series(pd.read_pickle(filepath + "SystemCode_FranchiseCode.pickle"),name="SystemCode_FranchiseCode")

# # データ統合
# feature_list=[Employees3, FranchiseCode10,SystemCode10,Term_LoanAmount20,Zip_UrbanRural10,Zip50,SystemCode_Emp_ExN_CrJ_ReJ]
# feature_data=pd.concat(feature_list, axis=1)

# # ダミー変数化
# lName=["Emp3", "Fra10","Sys10","T_LAm20","Z_UbRr10","Zip50","SEECR"]
# # カテゴリー変数化
# feature_data=feature_data.astype('category')
# feature_data=pd.get_dummies(feature_data, dummy_na=True,prefix=lName,prefix_sep='_',drop_first=False)

# feature_data.to_pickle(feature_path + "test_feature_data.pickle")

In [20]:
# clusters.plot.scatter(x='Zip', y='LoanStatus',c='Clusters',cmap="rainbow")

In [21]:
# # WCSSの決定状況を確認する
# wcss=[]
# number_clusters = range(1,n_clusters)
# for i in number_clusters:
#     print(i*100)
#     kmeans=KMeans(i*100)
#     kmeans.fit(features_X)
#     wcss_iter = kmeans.inertia_
#     wcss.append(wcss_iter)

# plt.plot(number_clusters,wcss)
# plt.title('The Elbow Method')
# plt.xlabel('number of clusters')
# plt.ylabel('Within_cluster sum of squares')
# plt.show()

In [22]:
# # 次元削減
# tsne = TSNE(n_components=2,random_state=0)
# x =tsne.fit_transform(data)
# tsne_df = pd.DataFrame(x)
# tsne_df['cluster'] =data_clustering['cluster']
# tsne_df.columns = ['axis_0','axis_1','cluster']

# tsne_graph = sns.scatterplot(x='axis_0',y='axis_1',hue='cluster',data=tsne_df)

In [23]:
# sns.clustermap(features_X,figsize=(5,5),cmap="coolwarm")