In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import pyplot
from sklearn.decomposition import PCA
from collections import Counter

### Function Definitions


In [3]:
def feature_selection(data_df, variation_threshold):
    """Choose how many principal components are needed to pass a variance threshold.

    Fits a full PCA on ``data_df``, displays a per-component table (eigenvalue,
    percentage of variance explained, cumulative percentage) rounded to two
    decimals, and prints the selected count.

    Parameters
    ----------
    data_df : array-like of shape (n_samples, n_features)
        Data passed to ``PCA.fit`` unchanged.
    variation_threshold : float
        Cumulative explained-variance ratio (a fraction such as 0.95) that the
        retained components must strictly exceed.

    Returns
    -------
    int
        1-based count of leading components whose cumulative explained-variance
        ratio first exceeds ``variation_threshold``. If no prefix exceeds it
        (for example a threshold of 1.0 or more), all components are kept.
    """
    pca = PCA()
    pca.fit(data_df)

    variance_ratio = pca.explained_variance_ratio_
    cum_variation = np.cumsum(variance_ratio)
    n_components = len(cum_variation)

    exceeds = cum_variation > variation_threshold
    if exceeds.any():
        # argmax on a boolean array gives the position of the first True.
        feature_number = int(np.argmax(exceeds)) + 1
    else:
        # argmax of an all-False mask is 0, which would wrongly report a single
        # component; fall back to keeping everything instead.
        feature_number = n_components

    # Index by the number of fitted components rather than data_df.shape[1]:
    # the two differ when there are fewer samples than features.
    result_df = pd.DataFrame(
        data=np.vstack([pca.explained_variance_,
                        variance_ratio * 100,
                        cum_variation * 100]).T,
        columns=['Eigenvalue', 'Variation explained(%)', 'Cumulative(%)'],
        index=np.arange(1, n_components + 1),
    )
    result_df = result_df.round(2)
    # `display` is the rich-output helper available in the notebook namespace.
    display(result_df)
    print("Selected Feature number: ", feature_number)

    return feature_number
    

### Data Input



In [2]:
# Read the raw dataset; the first CSV column is used as the row index.
rawdata_df = pd.read_csv(
    '../data/SME_dataset.csv',
    index_col=0,
)
# Preview the first rows.
rawdata_df.head()

Unnamed: 0_level_0,status,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,3.3,0.62,1.3,0.96,0.7,0.94,-21.75,-0.12,-0.53,...,-0.01,1,1,1,1,7,25,8,36635,4635
3,1,-7.93,-0.19,0.87,1.35,1.34,225.95,-42.44,-0.59,-8.12,...,-0.1,1,1,1,1,1,42,65,29970,4791
7,1,1.24,0.41,1.81,1.14,0.65,1.1,-10.84,-0.08,-0.21,...,0.0,1,1,1,1,30,43,21,7052,4752
10,1,-1.16,-0.68,0.14,1.6,0.87,2.23,-16.29,-0.27,-0.05,...,-0.07,1,1,1,1,48,31,12,4276,4771
12,1,-1.9,-0.86,0.47,0.42,0.42,-1.78,-13.83,-0.24,-0.23,...,-0.17,1,1,1,1,0,53,156,3799,3212


In [19]:
# Summary statistics for every column of the raw data.
# The drop below is left disabled: the CSV's first column is already consumed
# as the index by read_csv(index_col=0), so there is no 'X1' column to remove.
#rawdata_df=rawdata_df.drop(['X1'],axis=1)
rawdata_df.describe()

Unnamed: 0,status,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
count,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,...,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0,15045.0
mean,0.108475,8.894744,1.258207,1.462277,1.541156,1.189098,7.841478,23.228248,0.028451,-0.065396,...,0.069327,0.179462,0.205317,0.935593,0.241476,104.540047,76.215487,95.752742,3397.433034,4572.634031
std,0.310989,19.38171,3.341354,0.798856,1.212106,1.007325,23.882682,72.012435,0.146681,0.792478,...,0.219726,0.383751,0.403947,0.245484,0.427992,351.177911,114.619017,132.27674,7532.012991,1945.144547
min,0.0,-67.15,-9.59,0.1,0.0,-0.02,-33.14,-285.86,-1.34,-8.93,...,-2.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,100.0
25%,0.0,1.25,0.0,1.07,0.97,0.62,0.94,1.25,0.01,0.0,...,0.02,0.0,0.0,1.0,0.0,0.0,0.0,0.0,602.0,3250.0
50%,0.0,3.73,0.06,1.2,1.22,0.98,1.75,3.61,0.03,0.07,...,0.06,0.0,0.0,1.0,0.0,20.0,51.0,67.0,1146.0,4642.0
75%,0.0,9.85,1.17,1.54,1.72,1.4,4.79,16.83,0.07,0.21,...,0.12,0.0,0.0,1.0,0.0,80.0,100.0,135.0,2759.0,5510.0
max,1.0,207.09,33.38,8.27,15.89,10.91,300.77,571.22,0.51,1.28,...,1.41,1.0,1.0,1.0,1.0,5569.0,1493.0,1531.0,79454.0,9609.0


In [20]:
# Per-status mean of every column, transposed so variables run down the rows.
rawdata_df.groupby('status').mean().transpose()

status,0,1
V1,8.872443,9.078033
V2,1.25115,1.316207
V3,1.510188,1.068505
V4,1.600219,1.055729
V5,1.237871,0.788241
V6,8.066776,5.989804
V7,26.389256,-2.751287
V8,0.048307,-0.134737
V9,0.015571,-0.730846
V10,1.380684,1.268382


In [21]:
# Keep the status column aside; it is paired with the connection flags later.
data_status = rawdata_df.loc[:, 'status']

In [22]:
# Feature matrix: everything except the status column.
data_df = rawdata_df.drop(columns=['status'])
data_df.head()

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,3.3,0.62,1.3,0.96,0.7,0.94,-21.75,-0.12,-0.53,8.5,...,-0.01,1,1,1,1,7,25,8,36635,4635
3,-7.93,-0.19,0.87,1.35,1.34,225.95,-42.44,-0.59,-8.12,5.13,...,-0.1,1,1,1,1,1,42,65,29970,4791
7,1.24,0.41,1.81,1.14,0.65,1.1,-10.84,-0.08,-0.21,2.33,...,0.0,1,1,1,1,30,43,21,7052,4752
10,-1.16,-0.68,0.14,1.6,0.87,2.23,-16.29,-0.27,-0.05,2.66,...,-0.07,1,1,1,1,48,31,12,4276,4771
12,-1.9,-0.86,0.47,0.42,0.42,-1.78,-13.83,-0.24,-0.23,1.32,...,-0.17,1,1,1,1,0,53,156,3799,3212


In [23]:
# data_df.to_excel('../result/data.xlsx',header=True)

### Factor Network-Based Segmentation


In [24]:
# Z-score each column (population std, ddof=0, matching np.std's default).
# The axis is spelled out because np.mean/np.std forward axis=None to pandas,
# and from pandas 2.0 DataFrame.mean(axis=None) reduces over BOTH axes and
# returns a single scalar (DataFrame.std is deprecated towards the same
# behaviour), which would centre every column on the global mean instead.
data_df = (data_df - data_df.mean(axis=0)) / data_df.std(axis=0, ddof=0)

In [25]:
# Fraction of cumulative explained variance the retained components must exceed.
variation_threshold=0.95
# Displays the PCA variance table and returns the number of components to keep.
feature_number=feature_selection(data_df,variation_threshold)

Unnamed: 0,Eigenvalue,Variation explained(%),Cumulative(%)
1,5.18,21.6,21.6
2,2.58,10.73,32.33
3,2.5,10.41,42.74
4,1.6,6.69,49.42
5,1.42,5.92,55.34
6,1.3,5.4,60.74
7,1.16,4.82,65.55
8,1.09,4.56,70.11
9,0.99,4.11,74.22
10,0.93,3.88,78.1


Selected Feature number:  17


In [26]:
# Reduced SVD of data_df. numpy returns the right singular vectors already
# transposed as its third output, which is what `V` holds here.
U, sigma, V = np.linalg.svd(data_df, full_matrices=False)

# Scores on the leading `feature_number` components: left singular vectors
# scaled column-wise by their singular values.
F_cov = pd.DataFrame(U[:, :feature_number] @ np.diag(sigma[:feature_number]))

# Show all singular values.
pd.DataFrame(sigma)

Unnamed: 0,0
0,279.269862
1,196.832583
2,193.856475
3,155.374781
4,146.144265
5,139.618548
6,131.863115
7,128.279373
8,121.853832
9,118.331202


In [27]:
# Gram matrix X^T X of the feature matrix (features x features, symmetric).
ata = data_df.T.dot(data_df)

# The matrix is symmetric, so use eigh: it returns real eigenpairs sorted by
# ascending eigenvalue. np.linalg.eig promises no ordering (and may return
# complex values), so slicing its first rows need not select the leading
# components.
eigenval, eigenvec = np.linalg.eigh(np.asarray(ata))
# Flip to descending order so row i of Vt is the i-th largest component.
eigenval, eigenvec = eigenval[::-1], eigenvec[:, ::-1]
Vt = eigenvec.T

# Project the data onto the leading `feature_number` eigenvectors. The
# pseudo-inverse of the (k x p) block replaces the former np.matrix(...).I.
F_df = data_df.dot(np.linalg.pinv(Vt[:feature_number, :]))

In [28]:
# Inspect the SVD-based component scores (one column per retained component).
F_cov

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2.507867,3.621795,-7.713987,-1.062480,-0.990730,0.313423,-1.923228,-0.202912,5.413617,-0.309895,-0.006222,-3.053448,0.512548,0.090491,-0.621058,0.432224,-0.369031
1,6.632335,-0.806251,-5.723596,-1.689255,-3.555602,2.119939,0.998947,-3.153332,4.862232,-1.337057,8.089013,-0.641673,0.501364,0.667878,7.670498,1.122178,0.254017
2,2.741003,-0.287041,-1.786709,-0.762860,1.024188,0.562227,-0.467617,-0.094634,1.008527,0.238662,0.091198,-0.504868,0.240693,-0.232179,-1.171006,0.470421,-0.304695
3,3.623801,0.384687,-2.240346,-0.756340,0.600953,0.196723,0.146463,-0.227148,0.537650,0.039155,0.019987,-0.421298,0.396203,0.197484,-1.062692,-0.917253,0.918262
4,4.265881,-0.077164,-0.715740,0.259265,0.962685,-0.600492,-0.151316,-0.102152,0.004764,0.477761,0.423912,-0.012194,0.363911,0.206438,-1.031043,-0.813416,-0.071404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15040,-1.077973,0.789399,0.688196,-0.537244,1.910738,0.332749,1.417775,1.757758,0.721017,-1.539043,-0.169725,-0.333825,-0.337791,0.168588,0.922324,-0.159788,0.745435
15041,-2.460943,-1.031102,6.146822,-1.620791,2.368834,2.939689,4.201732,-8.252069,4.276531,-1.494215,-5.714576,-3.717424,0.941253,0.363424,1.740711,-2.426630,-1.111977
15042,-0.991454,-0.096722,1.319763,-0.871741,1.354595,-0.259233,-0.069936,-0.189306,-0.731056,0.257195,0.269244,0.645537,-0.210001,0.235987,0.334546,-0.208532,0.481712
15043,0.874122,-10.407769,0.491132,-0.835163,-4.977077,-0.752059,0.229386,6.473110,1.993125,0.520133,-0.143335,-1.748537,-0.567426,0.099619,-1.277754,-0.217462,-5.398045


In [29]:
# select_col_num = 17

# lomo=np.linalg.svd(data_df)

# my_U = lomo[0][:,:select_col_num]
# my_D=lomo[1]**2
# my_D = np.sqrt(np.diag(my_D[:select_col_num])/(data_df.shape[0]-1))
# F_df=pd.DataFrame(my_U.dot(np.diag(lomo[1][:select_col_num])))
# F_df1=pd.DataFrame(my_U.dot(my_D))
# display(F_df,F_df1)

In [30]:
# F_df=pd.DataFrame(np.matrix(data_df.T).I.dot(eig_vecs).dot(np.diag(eig_vals)))

# Cut-offs compared against factor_matrix entries when building each network.
network_threshold=[0.05,0.1]


In [31]:
# Offset: standard-normal quantile at probability 2 / (number of rows - 1).
theta = norm.ppf(2 / (len(F_cov) - 1))

# np.cov treats each ROW of F_cov as a variable, so this is a square
# row-by-row covariance matrix; shift it by theta and map through the
# standard-normal CDF to obtain values in [0, 1].
factor_matrix = norm.cdf(theta + np.cov(F_cov))

In [34]:
# For every network threshold: build the boolean adjacency matrix, flag each
# column with more than 30 links as "connected", and tabulate that flag against
# status. Per-threshold tables are collected and concatenated once at the end.
temp_df_list = []
for threshold in network_threshold:

    # Adjacency matrix: True where the factor-matrix entry exceeds the threshold.
    # NOTE: the 'fator' spelling in the file name is kept so existing output
    # paths do not change.
    temp_df=pd.DataFrame(factor_matrix>threshold)
    temp_df.to_csv("../result/fator_matrix_threshold_%s.csv" % (str(threshold)), index=True)

    # Column sums give the number of links per column; binarise at 30 links.
    temp_df=temp_df.sum()
    temp_df[temp_df<=30]=0
    temp_df[temp_df!=0]=1
    pd.DataFrame(index=temp_df.index, data=temp_df.values, columns=["connect_flag"]).to_csv("../result/connect_flag_df_threshold_%s.csv" % (str(threshold)), index=True)

    # Two-column array [connect_flag, status], paired by position.
    temp_df=(np.vstack((temp_df, data_status))).T

    connect_df=temp_df[temp_df[:,0]==1]
    nonconnect_df=temp_df[temp_df[:,0]==0]

    # Status counts and shares within the connected group.
    temp1=pd.DataFrame.from_dict(Counter(connect_df[:,1]),orient='index').reset_index()
    temp1.columns=['Status','Connect']
    temp1_per=np.array(list(Counter(connect_df[:,1]).values()))
    temp1['Connect_Per']=temp1_per/connect_df.shape[0]

    # Status counts and shares within the non-connected group.
    temp2=pd.DataFrame.from_dict(Counter(nonconnect_df[:,1]),orient='index').reset_index()
    temp2.columns=['Status','Non-Connect']
    temp2_per=np.array(list(Counter(nonconnect_df[:,1]).values()))
    temp2['NonConnect_Per']=temp2_per/nonconnect_df.shape[0]

    # Join on the shared 'Status' column, then add a totals row whose shares
    # are relative to the full number of observations.
    temp_df=pd.merge(temp1,temp2)
    add_row=pd.DataFrame([['Total',connect_df.shape[0],connect_df.shape[0]/data_df.shape[0],nonconnect_df.shape[0],nonconnect_df.shape[0]/data_df.shape[0]]],columns=temp_df.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    temp_df=pd.concat([temp_df,add_row])
    temp_df['Threshold']=threshold
    temp_df_list.append(temp_df)

# Single concat instead of growing a frame inside the loop.
result_df=pd.concat(temp_df_list,axis=0).set_index(['Threshold','Status'])

In [35]:
# Connected / non-connected counts and shares by status, for each threshold.
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Connect,Connect_Per,Non-Connect,NonConnect_Per
Threshold,Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.05,1,998,0.228899,634,0.059336
0.05,0,3362,0.771101,10051,0.940664
0.05,Total,4360,0.289797,10685,0.710203
0.1,1,859,0.253019,773,0.066352
0.1,0,2536,0.746981,10877,0.933648
0.1,Total,3395,0.225656,11650,0.774344


### Feature Engineering