In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import pyplot
from sklearn.decomposition import PCA
from collections import Counter

### Function Definitions


In [2]:
def feature_selection(data_df,variation_threshold):
    """Fit a full PCA and choose how many leading components to keep.

    Fits ``PCA()`` with default settings on ``data_df``, displays a
    per-component table (eigenvalue, explained variance in %, cumulative
    explained variance in %, all rounded to 2 decimals), prints the selected
    count and returns it.

    Parameters
    ----------
    data_df : array-like
        Data handed unchanged to ``PCA.fit``.
    variation_threshold : float
        Cumulative explained-variance ratio (as a fraction, e.g. 0.95) that
        the kept components must strictly exceed.

    Returns
    -------
    int
        The smallest number of leading components whose cumulative explained
        variance ratio is greater than ``variation_threshold``. If no prefix
        exceeds the threshold, every component is kept.

    Notes
    -----
    Uses ``display``, which is provided by the IPython/Jupyter runtime.
    """
    pca=PCA()
    pca.fit(data_df)
    variance_ratio=pca.explained_variance_ratio_
    cum_variation=np.cumsum(variance_ratio)
    component_count=len(cum_variation)

    exceeds_threshold=cum_variation>variation_threshold
    if exceeds_threshold.any():
        # argmax on a boolean array gives the position of the first True.
        feature_number=int(exceeds_threshold.argmax())+1
    else:
        # argmax on an all-False array is 0, which would wrongly select a
        # single component; keep all components instead.
        feature_number=component_count

    # Index by the number of fitted components rather than data_df.shape[1]:
    # PCA yields min(n_samples, n_features) components, so the two can differ.
    result_df=pd.DataFrame(data=np.vstack([pca.explained_variance_,variance_ratio*100,cum_variation*100]).T,
                          columns=['Eigenvalue', 'Variation explained(%)', 'Cumulative(%)'],
                          index=np.arange(1,component_count+1))
    result_df=result_df.round(2)
    display(result_df)
    print("Selected Feature number: ", feature_number)

    return feature_number
    

### Data Input



In [3]:
# Load the SME dataset; the first CSV column becomes the row index.
rawdata_df = pd.read_csv(
    '../data/SME_dataset.csv',
    index_col=0,
)
rawdata_df.head(5)

Unnamed: 0_level_0,status,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,3.3,0.62,1.3,0.96,0.7,0.94,-21.75,-0.12,-0.53,...,-0.01,1,1,1,1,7,25,8,36635,4635
3,1,-7.93,-0.19,0.87,1.35,1.34,225.95,-42.44,-0.59,-8.12,...,-0.1,1,1,1,1,1,42,65,29970,4791
7,1,1.24,0.41,1.81,1.14,0.65,1.1,-10.84,-0.08,-0.21,...,0.0,1,1,1,1,30,43,21,7052,4752
10,1,-1.16,-0.68,0.14,1.6,0.87,2.23,-16.29,-0.27,-0.05,...,-0.07,1,1,1,1,48,31,12,4276,4771
12,1,-1.9,-0.86,0.47,0.42,0.42,-1.78,-13.83,-0.24,-0.23,...,-0.17,1,1,1,1,0,53,156,3799,3212


In [4]:
# Summary statistics for every column. X1 was made the row index by
# index_col=0 when the CSV was read, so there is no X1 column to drop here.
rawdata_df.describe()

Unnamed: 0,status,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
count,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,...,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0
mean,0.108475,8.894744,1.258207,1.462277,1.541156,1.189098,7.841478,23.228248,0.028451,-0.065396,...,0.069327,0.179462,0.205317,0.935593,0.241476,104.540047,76.215487,95.752742,3397.433034,4572.634031
std,0.310989,19.38171,3.341354,0.798856,1.212106,1.007325,23.882682,72.012435,0.146681,0.792478,...,0.219726,0.383751,0.403947,0.245484,0.427992,351.177911,114.619017,132.27674,7532.012991,1945.144547
min,0.0,-67.15,-9.59,0.1,0.0,-0.02,-33.14,-285.86,-1.34,-8.93,...,-2.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,100.0
25%,0.0,1.25,0.0,1.07,0.97,0.62,0.94,1.25,0.01,0.0,...,0.02,0.0,0.0,1.0,0.0,0.0,0.0,0.0,602.0,3250.0
50%,0.0,3.73,0.06,1.2,1.22,0.98,1.75,3.61,0.03,0.07,...,0.06,0.0,0.0,1.0,0.0,20.0,51.0,67.0,1146.0,4642.0
75%,0.0,9.85,1.17,1.54,1.72,1.4,4.79,16.83,0.07,0.21,...,0.12,0.0,0.0,1.0,0.0,80.0,100.0,135.0,2759.0,5510.0
max,1.0,207.09,33.38,8.27,15.89,10.91,300.77,571.22,0.51,1.28,...,1.41,1.0,1.0,1.0,1.0,5569.0,1493.0,1531.0,79454.0,9609.0


In [5]:
# Per-column means for each value of `status`, transposed so that every
# variable is a row and every status value is a column.
rawdata_df.groupby('status').mean().T

status,0,1
V1,8.872443,9.078033
V2,1.25115,1.316207
V3,1.510188,1.068505
V4,1.600219,1.055729
V5,1.237871,0.788241
V6,8.066776,5.989804
V7,26.389256,-2.751287
V8,0.048307,-0.134737
V9,0.015571,-0.730846
V10,1.380684,1.268382


In [6]:
# Keep the status column aside before it is removed from the feature frame.
data_status = rawdata_df.loc[:, 'status']

In [7]:
# Feature frame: every column of the raw data except `status`.
data_df = rawdata_df.drop(columns=['status'])
data_df.head(5)

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,3.3,0.62,1.3,0.96,0.7,0.94,-21.75,-0.12,-0.53,8.5,...,-0.01,1,1,1,1,7,25,8,36635,4635
3,-7.93,-0.19,0.87,1.35,1.34,225.95,-42.44,-0.59,-8.12,5.13,...,-0.1,1,1,1,1,1,42,65,29970,4791
7,1.24,0.41,1.81,1.14,0.65,1.1,-10.84,-0.08,-0.21,2.33,...,0.0,1,1,1,1,30,43,21,7052,4752
10,-1.16,-0.68,0.14,1.6,0.87,2.23,-16.29,-0.27,-0.05,2.66,...,-0.07,1,1,1,1,48,31,12,4276,4771
12,-1.9,-0.86,0.47,0.42,0.42,-1.78,-13.83,-0.24,-0.23,1.32,...,-0.17,1,1,1,1,0,53,156,3799,3212


In [8]:
# data_df.to_excel('../result/data.xlsx',header=True)

### Factor Network-Based Segmentation


In [9]:
# Z-score every column: subtract the column mean and divide by the column
# population standard deviation (ddof=0, which is what np.std used).
# The reductions are spelled out with axis=0 because np.mean(DataFrame) is
# forwarded to DataFrame.mean(axis=None), which newer pandas versions reduce
# over the whole frame to a single scalar instead of one value per column.
data_df=(data_df-data_df.mean(axis=0))/data_df.std(axis=0,ddof=0)

In [10]:
# Keep as many principal components as are needed for the cumulative
# explained variance to exceed this fraction.
variation_threshold = 0.95
feature_number = feature_selection(data_df, variation_threshold=variation_threshold)

Unnamed: 0,Eigenvalue,Variation explained(%),Cumulative(%)
1,5.18,21.6,21.6
2,2.58,10.73,32.33
3,2.5,10.41,42.74
4,1.6,6.69,49.42
5,1.42,5.92,55.34
6,1.3,5.4,60.74
7,1.16,4.82,65.55
8,1.09,4.56,70.11
9,0.99,4.11,74.22
10,0.93,3.88,78.1


Selected Feature number:  17


In [11]:
# Eigen-decomposition of X^T X for the standardised data X.
ata=data_df.T.dot(data_df)
eigenval,eigenvec=np.linalg.eig(ata)

# np.linalg.eig returns eigenvalues in no guaranteed order, so sort the
# eigenpairs by descending eigenvalue before slicing off the leading ones.
# When eig already returned them in descending order this is a no-op.
eig_order=np.argsort(eigenval)[::-1]
eigenval=eigenval[eig_order]
eigenvec=eigenvec[:,eig_order]

# Rows of Vt are the eigenvectors, leading component first.
Vt=eigenvec.T

# Project the data through the pseudo-inverse of the first `feature_number`
# eigenvector rows. np.linalg.pinv replaces the former np.matrix(...).I,
# which also computed a pseudo-inverse for a non-square slice.
F_df=data_df.dot(np.linalg.pinv(Vt[:feature_number,:]))

In [12]:
# Cut-off values compared against the entries of factor_matrix when the
# thresholded network is built further down; one result set per value.

network_threshold=[0.05,0.1]


In [13]:
# Sanity check: expect (number of rows in data_df, feature_number).
F_df.shape

(15045, 17)

In [14]:
# Offset: the standard-normal quantile at probability 2 / (rows - 1).
theta = norm.ppf(2 / (len(F_df) - 1))

# np.cov treats each row of F_df as one variable, so this is a square
# row-by-row covariance matrix; it is shifted by theta and passed through
# the standard-normal CDF, giving values in [0, 1].
factor_matrix = norm.cdf(theta + np.cov(F_df))

In [15]:
# For every network threshold:
#   1. binarise factor_matrix (entry > threshold) and export it,
#   2. flag each column as connected (1) if it has any entry above the
#      threshold, otherwise non-connected (0), and export the flags,
#   3. count the status values inside the connected / non-connected groups.
# The per-threshold tables are collected in a list and concatenated once.
temp_df_list = []
for threshold in network_threshold:

    temp_df=pd.DataFrame(factor_matrix>threshold)
    temp_df.to_csv("../result/fator_matrix_threshold_%s.csv" % (str(threshold)), index=True)

    # Column sums count the entries above the threshold; collapse to a 0/1 flag.
    temp_df=temp_df.sum()
    temp_df[temp_df!=0]=1
    pd.DataFrame(index=temp_df.index, data=temp_df.values, columns=["connect_flag"]).to_csv("../result/connect_flag_df_threshold_%s.csv" % (str(threshold)), index=True)

    # Two-column array: [connect_flag, status], aligned by position.
    temp_df=(np.vstack((temp_df, data_status))).T

    connect_df=temp_df[temp_df[:,0]==1]
    nonconnect_df=temp_df[temp_df[:,0]==0]

    # Status counts and within-group shares for the connected rows.
    temp1=pd.DataFrame.from_dict(Counter(connect_df[:,1]),orient='index').reset_index()
    temp1.columns=['Status','Connect']
    temp1['Connect_Per']=temp1['Connect']/connect_df.shape[0]

    # Same for the non-connected rows.
    temp2=pd.DataFrame.from_dict(Counter(nonconnect_df[:,1]),orient='index').reset_index()
    temp2.columns=['Status','Non-Connect']
    temp2['NonConnect_Per']=temp2['Non-Connect']/nonconnect_df.shape[0]

    temp_df=pd.merge(temp1,temp2,on='Status')

    # 'Total' row: group sizes and their share of all rows in data_df.
    add_row=pd.DataFrame([['Total',connect_df.shape[0],connect_df.shape[0]/data_df.shape[0],nonconnect_df.shape[0],nonconnect_df.shape[0]/data_df.shape[0]]],columns=temp_df.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    temp_df=pd.concat([temp_df,add_row],axis=0)
    temp_df['Threshold']=threshold
    temp_df_list.append(temp_df)

result_df=pd.concat(temp_df_list,axis=0).set_index(['Threshold','Status'])

In [16]:
# pd.DataFrame(factor_matrix).to_csv("../result/factor_matrix.csv" , index=True)

In [17]:
# Preview the first five rows of factor_matrix.
pd.DataFrame(factor_matrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15035,15036,15037,15038,15039,15040,15041,15042,15043,15044
0,0.899298,0.854226,0.004155,0.009006,0.000571,4e-05,0.000155,4e-06,4.5e-05,0.000101997,...,0.0002,1.149038e-06,4.3e-05,0.000155,2.1e-05,0.000877,9.521924e-09,5e-06,1e-06,1.6e-05
1,0.854226,1.0,0.010027,0.030263,0.007534,0.000597,0.006756,0.000299,8e-06,2.141234e-07,...,6.8e-05,7.092662e-07,4e-06,0.001063,0.001166,2.8e-05,1.456718e-08,2e-06,0.000527,0.001687
2,0.004155,0.010027,0.002169,0.00242,0.001937,0.000431,0.001886,0.001285,5e-05,3.826249e-05,...,0.001947,1.943154e-05,6.2e-05,0.002463,0.000775,0.000132,6.882578e-06,6.8e-05,0.000399,0.001311
3,0.009006,0.030263,0.00242,0.008093,0.004897,0.00046,0.002905,0.002458,5.6e-05,8.500806e-06,...,0.000466,5.186709e-06,5.6e-05,0.003289,0.002877,0.00013,2.751673e-06,5.5e-05,1.8e-05,0.002405
4,0.000571,0.007534,0.001937,0.004897,0.009255,0.000599,0.005443,0.004504,6.1e-05,4.05527e-06,...,0.000772,5.640569e-05,0.000128,0.005488,0.001128,6e-05,1.138067e-06,7.4e-05,0.00016,0.004308


In [18]:
# Show the status breakdown of connected vs non-connected rows per threshold.
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Connect,Connect_Per,Non-Connect,NonConnect_Per
Threshold,Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.05,1,1188,0.185162,444,0.051454
0.05,0,5228,0.814838,8185,0.948546
0.05,Total,6416,0.426454,8629,0.573546
0.1,1,1097,0.203828,535,0.055366
0.1,0,4285,0.796172,9128,0.944634
0.1,Total,5382,0.357727,9663,0.642273


### Feature Engineering