In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from kmodes.kmodes import KModes

In [None]:
# Reading CSV Files
# Column Names are changed manually
# Three tables are read for the AGE3060 folder (ACT, TR and CUS files).
# NOTE(review): presumably account / trade / customer tables, judging by the
# file names only — confirm against the data dictionary.
df_act_3060 = pd.read_csv('../data/DATA0113/DATA/AGE3060/DLAB_HT_L1_ACT_IFO_TMP_3060.csv')
df_trd_3060 = pd.read_csv('../data/DATA0113/DATA/AGE3060/DLAB_INTERN_TR_IFO_TMP3_3060.csv')
df_cus_3060 = pd.read_csv('../data/DATA0113/DATA/AGE3060/DLAB_HT_L1_CUS_IFO_TMP_3060.csv')

# The AGE1030 folder has the same three tables plus a second TR file (TMP2),
# which is read into its own frame here.
df_act_1030 = pd.read_csv('../data/DATA0113/DATA/AGE1030/DLAB_HT_L1_ACT_IFO_TMP_1030.csv')
df_trd_1030 = pd.read_csv('../data/DATA0113/DATA/AGE1030/DLAB_INTERN_TR_IFO_TMP_1030.csv')
df_cus_1030 = pd.read_csv('../data/DATA0113/DATA/AGE1030/DLAB_HT_L1_CUS_IFO_TMP_1030.csv')
df_trd_2_1030 = pd.read_csv('../data/DATA0113/DATA/AGE1030/DLAB_INTERN_TR_IFO_TMP2_1030.csv')

In [None]:
# Size of every raw table, collected into one frame.
# A cell only renders its last bare expression, so listing the seven `.shape`
# lookups one per line displayed just the final one.
# The trailing comments keep the shapes recorded when the notebook was written.
pd.DataFrame.from_dict(
    {
        'df_act_3060': df_act_3060.shape,      # (2253, 4)
        'df_trd_3060': df_trd_3060.shape,      # (2089791, 10)
        'df_cus_3060': df_cus_3060.shape,      # (1000, 7)
        'df_act_1030': df_act_1030.shape,      # (844, 4)
        'df_trd_1030': df_trd_1030.shape,      # (286681, 10)
        'df_cus_1030': df_cus_1030.shape,      # (500, 7)
        'df_trd_2_1030': df_trd_2_1030.shape,  # (404369, 10)
    },
    orient='index',
    columns=['rows', 'columns'],
)


In [3]:
# Remove the leftover 'Unnamed: 0' column from every table.
# errors='ignore' makes the cell safe to re-run: without it, a second run
# raises KeyError because the column has already been dropped.
df_trd_3060 = df_trd_3060.drop(columns=['Unnamed: 0'], errors='ignore')
df_act_3060 = df_act_3060.drop(columns=['Unnamed: 0'], errors='ignore')
df_cus_3060 = df_cus_3060.drop(columns=['Unnamed: 0'], errors='ignore')

df_trd_1030 = df_trd_1030.drop(columns=['Unnamed: 0'], errors='ignore')
df_act_1030 = df_act_1030.drop(columns=['Unnamed: 0'], errors='ignore')
df_cus_1030 = df_cus_1030.drop(columns=['Unnamed: 0'], errors='ignore')
df_trd_2_1030 = df_trd_2_1030.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Keep only the rows of the second 1030 trade file dated strictly after
# 2021-07-01, then append them below the first 1030 trade file.
# Both names are rebound here, so running this cell twice appends the rows twice.
after_cutoff = df_trd_2_1030['iqr_dt'].gt(20210701)
df_trd_2_1030 = df_trd_2_1030.loc[after_cutoff]
df_trd_1030 = pd.concat((df_trd_1030, df_trd_2_1030))

In [5]:
# Build the clustering input for the 3060 cohort: one row per customer and
# iqr_dt carrying the summed tot_aet_amt overall, for the '국내' (domestic)
# market and for the '해외' (overseas) market.
# Column Names are changed

amt_cols = ['iqr_dt', 'cus_no', 'tot_aet_amt']
day_customer = ['iqr_dt', 'cus_no']

# Sum over all markets per (iqr_dt, cus_no).
df_bal = df_trd_3060[amt_cols].groupby(day_customer).sum().reset_index()

# Same sum restricted to domestic rows.
rows_kor = df_trd_3060.loc[df_trd_3060['mkt_gb'] == '국내', amt_cols]
df_bal_kor = rows_kor.groupby(day_customer).sum().reset_index()

# Same sum restricted to overseas rows.
rows_ovs = df_trd_3060.loc[df_trd_3060['mkt_gb'] == '해외', amt_cols]
df_bal_ovs = rows_ovs.groupby(day_customer).sum().reset_index()

# Left-join the two market-specific sums onto the overall sum; the right-hand
# amount columns get the '_kr' / '_ov' suffixes.
df_bal_tot = df_bal.merge(df_bal_kor[amt_cols],
                          how='left',
                          on=['cus_no', 'iqr_dt'],
                          suffixes=('', '_kr'))
df_bal_tot = df_bal_tot.merge(df_bal_ovs[amt_cols],
                              how='left',
                              on=['cus_no', 'iqr_dt'],
                              suffixes=('', '_ov'))

# A (date, customer) pair with no rows in a market has no match in the left
# join; store 0 there instead of NaN.
market_cols = ['tot_aet_amt_kr', 'tot_aet_amt_ov']
df_bal_tot[market_cols] = df_bal_tot[market_cols].fillna(0)

# Attach the customer attributes (inner join on cus_no) and compute the
# overseas share of the total amount.
bank_cus_3060 = df_cus_3060.merge(df_bal_tot, on='cus_no', suffixes=('', ''))
bank_cus_3060['ovs_ratio'] = bank_cus_3060['tot_aet_amt_ov'].div(bank_cus_3060['tot_aet_amt'])

In [6]:
# Build the clustering input for the 1030 cohort: one row per customer and
# iqr_dt carrying the summed tot_aet_amt overall, for the '국내' (domestic)
# market and for the '해외' (overseas) market.
# Column Names are changed

amt_cols = ['iqr_dt', 'cus_no', 'tot_aet_amt']
day_customer = ['iqr_dt', 'cus_no']

# Sum over all markets per (iqr_dt, cus_no).
df_bals = df_trd_1030[amt_cols].groupby(day_customer).sum().reset_index()

# Same sum restricted to domestic rows.
rows_kors = df_trd_1030.loc[df_trd_1030['mkt_gb'] == '국내', amt_cols]
df_bal_kors = rows_kors.groupby(day_customer).sum().reset_index()

# Same sum restricted to overseas rows.
rows_ov = df_trd_1030.loc[df_trd_1030['mkt_gb'] == '해외', amt_cols]
df_bal_ov = rows_ov.groupby(day_customer).sum().reset_index()

# Left-join the two market-specific sums onto the overall sum; the right-hand
# amount columns get the '_kr' / '_ov' suffixes.
df_bal_tots = df_bals.merge(df_bal_kors[amt_cols],
                            how='left',
                            on=['cus_no', 'iqr_dt'],
                            suffixes=('', '_kr'))
df_bal_tots = df_bal_tots.merge(df_bal_ov[amt_cols],
                                how='left',
                                on=['cus_no', 'iqr_dt'],
                                suffixes=('', '_ov'))

# A (date, customer) pair with no rows in a market has no match in the left
# join; store 0 there instead of NaN.
market_cols = ['tot_aet_amt_kr', 'tot_aet_amt_ov']
df_bal_tots[market_cols] = df_bal_tots[market_cols].fillna(0)

# Attach the customer attributes (inner join on cus_no) and compute the
# overseas share of the total amount.
bank_cus_1030 = df_cus_1030.merge(df_bal_tots, on='cus_no', suffixes=('', ''))
bank_cus_1030['ovs_ratio'] = bank_cus_1030['tot_aet_amt_ov'].div(bank_cus_1030['tot_aet_amt'])

In [7]:
# Keep only the rows whose iqr_dt equals 20211231 in each cohort, then stack
# the two cohorts.
# NOTE(review): the earlier comments in this cell said 20210701 while the code
# filters on 20211231 — confirm which date is intended.
SNAPSHOT_DT = 20211231

bank_cus_3060 = bank_cus_3060[bank_cus_3060['iqr_dt'] == SNAPSHOT_DT]
bank_cus_1030 = bank_cus_1030[bank_cus_1030['iqr_dt'] == SNAPSHOT_DT]

# 1030 rows first, then 3060 rows; the original row labels are kept, so the
# combined index can contain repeated labels.
bank_cus_combined = pd.concat([bank_cus_1030, bank_cus_3060])

In [None]:
# Column Names are changed manually
# Company Names are changed
# Daily mean of tot_aet_amt for one security in the 1030 trades, plotted for
# the first 80 dates after 2021-03-01.
df = df_trd_1030[df_trd_1030['iem_krl_anm'] == '기업이름']

# Aggregate only the column that is used. Summing the whole frame also "sums"
# every text column, which is slow and can fail on mixed types.
amt_by_day = df.groupby('iqr_dt')['tot_aet_amt']
기업이름 = (amt_by_day.sum() / amt_by_day.count()).reset_index()

# groupby sorts by iqr_dt, so this is the first 80 dates after the cut-off.
bla = 기업이름[기업이름['iqr_dt'] > 20210301][:80]

# iqr_dt is a YYYYMMDD number; plotted raw, every month boundary becomes a
# gap (20210331 -> 20210401 is a jump of 70). Convert to real dates for the axis.
plot_dates = pd.to_datetime(bla['iqr_dt'].astype(int).astype(str), format='%Y%m%d')
plt.plot(plot_dates, bla['tot_aet_amt'])
plt.xlabel('iqr_dt')
plt.ylabel('mean tot_aet_amt')
plt.show()


In [None]:
# Mean of every numeric column per age band.
# pd.cut uses right-closed bins: (0, 29], (29, 40], (40, 54], (54, 70].
r = np.array([0,29, 40, 54, 70])
age_band = pd.cut(bank_cus_combined['cus_age'], r)

# numeric_only=True: text columns cannot be averaged, and recent pandas raises
# instead of silently dropping them.
# observed=False keeps a row for an age band even when it has no customers.
bank_cus_ovs = bank_cus_combined.groupby(age_band, observed=False).mean(numeric_only=True)
bank_cus_ovs


In [None]:
# Shrink the combined data to the columns used for the initial-screen
# clustering: the date, the three amount columns, the overseas ratio and
# tco_cus_grd_cd are removed.
excluded_cols = [
    'iqr_dt',
    'tot_aet_amt',
    'tot_aet_amt_kr',
    'tot_aet_amt_ov',
    'ovs_ratio',
    'tco_cus_grd_cd',
]
bank_cus_cluster = bank_cus_combined.drop(excluded_cols, axis='columns')
bank_cus_cluster.head()

In [None]:
# Choosing K using elbow method: fit K-modes for K = 1..9 and record the
# final cost of each fit.
# random_state pins the random initialisation so the curve, and the K read off
# it, can be reproduced after a kernel restart.
cost = []
for clusters in range(1, 10):
    modes = KModes(n_clusters=clusters, init="Random", n_init=5, verbose=1, random_state=42)
    # Only cost_ is needed here; the labels from fit_predict were discarded.
    modes.fit(bank_cus_cluster)
    cost.append(modes.cost_)

In [None]:
# Plot the recorded cost against K = 1..9 to look for the elbow.
graph = np.arange(1, 10)
plt.plot(graph, cost)

# 6 seems to be the optimal K value

In [None]:
# KModes clustering with K=6; n_init=10 runs ten random initialisations and
# keeps the best one.
# random_state makes the resulting labels reproducible across kernel restarts.
km = KModes(n_clusters=6, init="Random", n_init=10, verbose=1, random_state=42)
fit_clusters = km.fit_predict(bank_cus_cluster)


In [None]:
# Input the predicted group into original data
bank_cus_cluster['cluster'] = fit_clusters

# Clean the two code columns: missing values and the '-' placeholder both
# become 0 so the column can be cast to int.
for code_col in ('zip_ctp_cd', 'ivs_icn_cd'):
    bank_cus_cluster[code_col] = (
        bank_cus_cluster[code_col].fillna(0).replace('-', 0).astype(int)
    )

# Per-cluster mean of every numeric column.
# numeric_only=True: recent pandas raises on text columns instead of dropping them.
# display(): a bare `df_mean` in the middle of a cell is never rendered.
df_mean = bank_cus_cluster.groupby('cluster').mean(numeric_only=True)
display(df_mean)

# Number of labelled customers per zip_ctp_cd value.
bank_cus_cluster[['cluster','zip_ctp_cd']].groupby('zip_ctp_cd').count()


In [28]:
# Write the clustering table to a CSV file in the current directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column — confirm that readers of this file expect it.
bank_cus_cluster.to_csv('./bank_cus_cluster.csv')

In [None]:
# Replace the region names '경기' and '서울' in zip_ctp_cd with blank
# placeholders of one and three spaces respectively, then show the table.
region_placeholders = {'경기': ' ', '서울': '   '}
bank_cus_cluster['zip_ctp_cd'] = bank_cus_cluster['zip_ctp_cd'].replace(region_placeholders)

bank_cus_cluster

In [None]:
# Count plots coloured by cluster, one figure for zip_ctp_cd and one for
# ivs_icn_cd; bars are ordered from the most to the least frequent value.
for code_col in ('zip_ctp_cd', 'ivs_icn_cd'):
    codes = bank_cus_cluster[code_col]
    plt.subplots(figsize= (40,10))
    sns.countplot(x=codes, order=codes.value_counts().index, hue=bank_cus_cluster['cluster'])
    plt.show()

In [16]:
# Stack the trade rows of both cohorts into one frame: 1030 first, then 3060.
df_trd_combined = pd.concat((df_trd_1030, df_trd_3060), axis=0)

In [None]:
# Function to print the top 5 items of interest for one cluster
def top_5(n, iqr_dt=20211231):
    """Display the five most common `iem_krl_anm` values among one cluster's customers.

    Parameters
    ----------
    n : int
        Cluster label in `bank_cus_cluster['cluster']`.
    iqr_dt : int, default 20211231
        Only rows of `df_trd_combined` with this `iqr_dt` are counted.

    Returns
    -------
    None
        The table is shown with `display`. It has one column, named `n`, whose
        values are the number of matching rows per item (non-null `bnc_qty`)
        divided by the number of customers in the cluster.
    """
    members = bank_cus_cluster.loc[bank_cus_cluster['cluster'] == n, 'cus_no']
    # Size of cluster n. The original expression indexed the per-cluster counts
    # with the global loop variable `i`, so the function only worked when the
    # caller's loop variable happened to be named `i`.
    cluster_size = members.count()

    on_date = df_trd_combined['iqr_dt'] == iqr_dt
    in_cluster = df_trd_combined['cus_no'].isin(members)
    counts = df_trd_combined[on_date & in_cluster].groupby('iem_krl_anm').count()

    # Rank items by their row count and keep the five largest.
    top = counts.sort_values(by='iqr_dt', ascending=False).head()
    table_0 = (top[['bnc_qty']] / cluster_size).rename(columns={'bnc_qty': n})
    return display(table_0)
    
# Printing top 5 for each group
# One table per cluster label 0..5, matching the K=6 fit used for this clustering.
for i in range(6):
    top_5(i)

In [None]:
# Customer-number-to-cluster lookup table, shown as the cell output.
lookup_cols = ['cus_no', 'cluster']
bank_cluster_DB = bank_cus_cluster.loc[:, lookup_cols]
bank_cluster_DB

In [19]:
# Trying to cluster with all the given data: only tco_cus_grd_cd and iqr_dt
# are dropped, so the amount columns and ovs_ratio stay in.
bank_cus_cluster_all = bank_cus_combined.drop(columns=['tco_cus_grd_cd', 'iqr_dt'])

# Clean this frame's own code columns: missing values and the '-' placeholder
# both become 0 so the column can be cast to int.
# The earlier version copied the columns from bank_cus_cluster, which tied this
# cell to another table's state and threw away the fillna result on the next line.
for code_col in ('zip_ctp_cd', 'ivs_icn_cd'):
    bank_cus_cluster_all[code_col] = (
        bank_cus_cluster_all[code_col].fillna(0).replace('-', 0).astype(int)
    )

bank_cus_cluster_all.head()

In [None]:
# Choosing K using elbow method on the all-columns table: fit K-modes for
# K = 1..9 and record the final cost of each fit.
# random_state pins the random initialisation so the curve, and the K read off
# it, can be reproduced after a kernel restart.
costs = []
for clusters in range(1, 10):
    modes = KModes(n_clusters=clusters, init="Random", n_init=5, verbose=1, random_state=42)
    # Only cost_ is needed here; the labels from fit_predict were discarded.
    modes.fit(bank_cus_cluster_all)
    costs.append(modes.cost_)

In [None]:
# Plot the recorded cost against K = 1..9 to look for the elbow.
graphs = np.arange(1, 10)
plt.plot(graphs, costs)

# 5 seems to be the optimal K value

In [None]:
# KModes clustering with K=5; n_init=10 runs ten random initialisations and
# keeps the best one.
# Fit on bank_cus_cluster_all, the table the elbow curve for this section was
# computed on. The earlier version fitted bank_cus_cluster, which by this point
# also carries the K=6 'cluster' label as a column.
# random_state makes the resulting labels reproducible across kernel restarts.
km = KModes(n_clusters=5, init="Random", n_init=10, verbose=1, random_state=42)
fit_cluster = km.fit_predict(bank_cus_cluster_all)

In [None]:
# Non-null count of every column per cluster label in the all-columns table.
# NOTE(review): in top-to-bottom order the 'cluster' column is only assigned in
# the following cell, so on a fresh kernel this presumably raises KeyError —
# confirm the intended cell order.
bank_cus_cluster_all.groupby('cluster').count()

In [None]:
# Input the predicted group into original data.
# Use fit_cluster, the labels of the K=5 fit from this section. The earlier
# version assigned fit_clusters, which holds the K=6 labels of the first
# clustering.
bank_cus_cluster_all['cluster'] = fit_cluster

# Per-cluster mean of every numeric column.
# numeric_only=True: recent pandas raises on text columns instead of dropping them.
df_means = bank_cus_cluster_all.groupby('cluster').mean(numeric_only=True)
df_means

In [None]:
# Function to print the top 5 items of interest for one cluster (all-columns clustering)
def top_5_0(n, iqr_dt=20210701):
    """Display the five most common `iem_krl_anm` values among one cluster's customers.

    Works on the all-columns clustering stored in `bank_cus_cluster_all`.

    Parameters
    ----------
    n : int
        Cluster label in `bank_cus_cluster_all['cluster']`.
    iqr_dt : int, default 20210701
        Only rows of `df_trd_combined` with this `iqr_dt` are counted.
        NOTE(review): the customer table was filtered to iqr_dt == 20211231
        before clustering — confirm that 20210701 is the intended default.

    Returns
    -------
    None
        The table is shown with `display`. It has one column, named `n`, whose
        values are the number of matching rows per item (non-null `bnc_qty`)
        divided by the number of customers in the cluster.
    """
    members = bank_cus_cluster_all.loc[bank_cus_cluster_all['cluster'] == n, 'cus_no']
    # Size of cluster n in this clustering. The original expression took the
    # size from bank_cus_cluster (the other clustering) and indexed it with the
    # global loop variable `i` instead of `n`.
    cluster_size = members.count()

    on_date = df_trd_combined['iqr_dt'] == iqr_dt
    in_cluster = df_trd_combined['cus_no'].isin(members)
    counts = df_trd_combined[on_date & in_cluster].groupby('iem_krl_anm').count()

    # Rank items by their row count and keep the five largest.
    top = counts.sort_values(by='iqr_dt', ascending=False).head()
    table_0 = (top[['bnc_qty']] / cluster_size).rename(columns={'bnc_qty': n})
    return display(table_0)
    
# Printing top 5 for each group.
# Iterate over the labels actually present in the table instead of a
# hard-coded range(6): this section clusters with K=5, so label 5 need not exist.
for i in sorted(int(label) for label in bank_cus_cluster_all['cluster'].unique()):
    top_5_0(i)