In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import pyplot
from sklearn.decomposition import PCA
from collections import Counter

### Function Definitions


In [2]:
def feature_selection(data_df, variation_threshold):
    """Fit a PCA and report how many components are needed to pass a variance threshold.

    A table with one row per principal component (eigenvalue, percentage of
    variance explained, cumulative percentage; rounded to 2 decimals) is
    displayed, and the selected number of components is printed.

    Parameters
    ----------
    data_df : array-like of shape (n_samples, n_features)
        Data passed unchanged to ``PCA().fit``.
    variation_threshold : float
        Fraction of total variance (e.g. ``0.95``) that the cumulative
        explained variance must strictly exceed.

    Returns
    -------
    int
        Smallest number of leading components whose cumulative explained
        variance ratio is greater than ``variation_threshold``. If no prefix
        exceeds the threshold (for instance a threshold of 1.0 or more), all
        components are selected.
    """
    pca = PCA()
    pca.fit(data_df)
    variance_ratio = pca.explained_variance_ratio_
    cum_variation = np.cumsum(variance_ratio)
    n_components = len(cum_variation)

    exceeds = cum_variation > variation_threshold
    if exceeds.any():
        # argmax on a boolean array gives the position of the first True.
        feature_number = int(exceeds.argmax()) + 1
    else:
        # argmax would return 0 here and silently report a single component;
        # when the threshold is never exceeded, keep every component instead.
        feature_number = n_components

    # Index by the number of fitted components rather than the number of input
    # columns: PCA keeps min(n_samples, n_features) components, so the two can differ.
    result_df = pd.DataFrame(
        {
            'Eigenvalue': pca.explained_variance_,
            'Variation explained(%)': variance_ratio * 100,
            'Cumulative(%)': cum_variation * 100,
        },
        index=np.arange(1, n_components + 1),
    ).round(2)
    display(result_df)
    print("Selected Feature number: ", feature_number)

    return feature_number
    

### Data Input



In [3]:
# Read the SME dataset; the first CSV column becomes the row index.
sme_dataset_path = '../data/SME_dataset.csv'
rawdata_df = pd.read_csv(sme_dataset_path, index_col=0)
rawdata_df.head()

Unnamed: 0,X1,ratio001,ratio002,ratio003,ratio004,ratio005,ratio006,ratio008,ratio011,ratio012,...,DIO,DPO,DSO,turnover,status,nace,ratio036,ratio037,ratio039,ratio040
1,14,180.39,0.0,1.01,1.06,1.06,2.25,-48.31,0.0,-0.79,...,1,283,0,3437,1,4941,1,1,1,1
2,18,3.73,0.21,1.27,1.41,1.35,1.89,13.57,0.11,0.27,...,7,61,75,1991,1,1330,0,0,1,0
3,21,0.19,0.0,6.18,3.28,3.28,1.79,-17.72,-0.28,-0.36,...,0,5,27,1739,1,4646,1,1,1,1
4,25,-1.49,0.0,0.33,0.1,0.1,-2.73,-53.41,-0.79,-0.39,...,0,0,0,1499,1,9313,1,1,1,1
5,31,-1.21,0.0,0.17,0.14,0.14,-28.95,-58.08,-1.02,-0.23,...,0,444,0,1244,1,4759,1,1,1,1


In [4]:
# Remove the 'X1' column before analysis.
# errors='ignore' keeps this cell re-runnable: rawdata_df is overwritten in place,
# so on a second run 'X1' is already gone and a plain drop would raise KeyError.
rawdata_df = rawdata_df.drop(columns=['X1'], errors='ignore')
rawdata_df.describe()

Unnamed: 0,ratio001,ratio002,ratio003,ratio004,ratio005,ratio006,ratio008,ratio011,ratio012,ratio017,...,DIO,DPO,DSO,turnover,status,nace,ratio036,ratio037,ratio039,ratio040
count,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,...,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0,4514.0
mean,8.885091,1.264107,1.443505,1.535791,1.189956,7.726227,23.067767,0.027758,-0.068855,1.371943,...,105.227736,75.934205,95.731945,3344.478511,0.110323,4615.625831,0.179663,0.207576,0.938857,0.240142
std,19.154547,3.333365,0.760904,1.201232,1.023862,23.277207,70.271262,0.146976,0.79018,1.068066,...,355.807161,111.650905,128.370308,7580.558788,0.313327,1941.904899,0.383949,0.405617,0.239619,0.427217
min,-64.43,-9.58,0.17,0.01,0.0,-33.14,-285.86,-1.28,-8.54,0.01,...,0.0,0.0,0.0,6.0,0.0,110.0,0.0,0.0,0.0,0.0
25%,1.3025,0.0,1.07,0.97,0.61,0.94,1.24,0.01,0.0,0.68,...,1.0,0.0,0.0,594.0,0.0,3320.0,0.0,0.0,1.0,0.0
50%,3.77,0.09,1.2,1.22,0.99,1.72,3.59,0.03,0.07,1.17,...,19.0,51.0,67.0,1123.5,0.0,4642.0,0.0,0.0,1.0,0.0
75%,9.68,1.17,1.52,1.72,1.4075,4.89,16.3175,0.07,0.21,1.74,...,80.0,99.75,136.0,2761.75,0.0,5510.0,0.0,0.0,1.0,0.0
max,206.55,33.38,8.27,13.71,10.88,297.02,566.96,0.49,1.08,8.42,...,5569.0,1467.0,1465.0,76403.0,1.0,9609.0,1.0,1.0,1.0,1.0


In [5]:
# Per-status mean of every column, transposed so that each column of the
# original frame becomes a row and each status value becomes a column.
status_means = rawdata_df.groupby('status').mean()
status_means.transpose()

status,0,1
ratio001,8.852383,9.148855
ratio002,1.248618,1.389016
ratio003,1.487742,1.086767
ratio004,1.597079,1.041546
ratio005,1.243633,0.757088
ratio006,7.929163,6.089699
ratio008,26.217702,-2.334116
ratio011,0.047816,-0.133996
ratio012,0.008752,-0.694699
ratio017,1.38064,1.301807


In [6]:
# Keep the 'status' column aside; it is removed from the feature frame below
# and paired with the connection flags later on.
data_status = rawdata_df.loc[:, 'status']

In [7]:
# Feature frame: every column of rawdata_df except 'status'.
data_df = rawdata_df.drop(columns=['status'])
data_df.head()

Unnamed: 0,ratio001,ratio002,ratio003,ratio004,ratio005,ratio006,ratio008,ratio011,ratio012,ratio017,...,ratio030,DIO,DPO,DSO,turnover,nace,ratio036,ratio037,ratio039,ratio040
1,180.39,0.0,1.01,1.06,1.06,2.25,-48.31,0.0,-0.79,1.08,...,0.0,1,283,0,3437,4941,1,1,1,1
2,3.73,0.21,1.27,1.41,1.35,1.89,13.57,0.11,0.27,1.72,...,0.1,7,61,75,1991,1330,0,0,1,0
3,0.19,0.0,6.18,3.28,3.28,1.79,-17.72,-0.28,-0.36,5.02,...,-0.05,0,5,27,1739,4646,1,1,1,1
4,-1.49,0.0,0.33,0.1,0.1,-2.73,-53.41,-0.79,-0.39,0.95,...,-0.74,0,0,0,1499,9313,1,1,1,1
5,-1.21,0.0,0.17,0.14,0.14,-28.95,-58.08,-1.02,-0.23,4.4,...,-0.23,0,444,0,1244,4759,1,1,1,1


In [8]:
# data_df.to_excel('../result/data.xlsx',header=True)

### Factor Network-Based Segmentation


In [9]:
# Standardize every column to zero mean and unit variance.
# The axis is spelled out because np.mean / np.std on a DataFrame forward
# axis=None to pandas, and what axis=None means for DataFrame reductions depends
# on the installed pandas version (per column in older releases, a single scalar
# over the whole frame in newer ones). ddof=0 matches the population standard
# deviation that np.std uses.
# NOTE(review): a constant column has zero std and would turn into NaN here.
data_df = (data_df - data_df.mean(axis=0)) / data_df.std(axis=0, ddof=0)

In [10]:
# Fraction of total variance that the retained components must exceed.
variation_threshold = 0.95
feature_number = feature_selection(
    data_df=data_df,
    variation_threshold=variation_threshold,
)

Unnamed: 0,Eigenvalue,Variation explained(%),Cumulative(%)
1,5.18,21.56,21.56
2,2.56,10.66,32.22
3,2.53,10.55,42.77
4,1.59,6.61,49.39
5,1.47,6.14,55.53
6,1.26,5.25,60.78
7,1.16,4.82,65.59
8,1.08,4.51,70.1
9,0.98,4.07,74.17
10,0.97,4.04,78.21


Selected Feature number:  17


In [26]:
# Thin SVD of the standardized data. numpy returns the right singular vectors
# already transposed, so V holds V^T here.
U, sigma, V = np.linalg.svd(data_df, full_matrices=False)

# Factor scores: the first `feature_number` left singular vectors, each scaled
# by its singular value. The result carries a fresh 0-based index.
leading_singular_values = np.diag(sigma[:feature_number])
F_cov = pd.DataFrame(np.dot(U[:, :feature_number], leading_singular_values))

# Show the singular values.
pd.DataFrame(sigma)

Unnamed: 0,0
0,152.842837
1,107.460757
2,106.912581
3,84.640324
4,81.588169
5,75.387838
6,72.232503
7,69.917062
8,66.372866
9,66.151297


In [19]:
# Eigendecomposition of X^T X for the standardized data.
ata = data_df.T.dot(data_df)

# X^T X is symmetric, so eigh is the appropriate solver: it returns real
# eigenvalues and orthonormal eigenvectors. np.linalg.eig gives no ordering
# guarantee, which made "the first feature_number rows of Vt" arbitrary.
eigenval, eigenvec = np.linalg.eigh(ata)

# eigh returns eigenvalues in ascending order; put the largest first so the
# leading columns of eigenvec are the dominant directions.
descending = np.argsort(eigenval)[::-1]
eigenval = eigenval[descending]
eigenvec = eigenvec[:, descending]
Vt = eigenvec.T

# Project the data onto the leading eigenvectors. With orthonormal eigenvectors
# the pseudo-inverse of Vt[:feature_number, :] is simply its transpose, i.e.
# eigenvec[:, :feature_number], so no np.matrix / .I is needed.
F_df = data_df.dot(eigenvec[:, :feature_number])

In [27]:
# Show the factor scores built from the leading singular vectors and values.
F_cov

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-2.866988,-0.253559,-1.056567,-2.721500,0.046768,5.939636,-0.611063,0.852784,-0.337221,0.443357,-0.302975,1.454475,-1.148088,6.041975,-0.527775,2.341109,-1.001349
1,0.888241,0.289697,-0.553781,0.229018,-0.154897,-0.917619,-0.381142,-0.405621,-0.911168,1.148485,-0.044874,0.366946,0.470153,0.174747,-0.019284,-0.157964,-0.267506
2,-1.477014,4.476643,3.845264,3.431475,-1.189015,1.250736,-0.835289,0.896097,-1.166930,0.148863,-2.862710,-1.331353,0.974401,-0.246343,2.149915,3.824314,2.028338
3,-7.406331,1.694331,0.131274,0.270989,-0.555572,-0.792933,0.571109,-0.472290,3.630917,-0.539338,0.700566,0.883240,-1.905262,-0.754489,-0.970476,0.437488,0.913167
4,-6.867986,3.310717,-1.502827,-0.395658,-0.683166,-1.242828,0.164506,0.722570,0.541940,-0.593736,-2.490426,-0.167890,1.021771,-0.199932,0.385136,1.333511,-1.614353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4509,-6.710259,-0.393610,3.619809,-2.089698,-3.997284,-2.252902,-0.833807,2.018658,1.306538,1.693566,-0.314733,-0.269490,-0.797170,-0.126729,-0.818193,0.581495,3.515812
4510,-0.820495,-2.086203,3.066455,0.687146,3.758985,0.805420,0.238818,3.011552,-2.927308,-0.943169,-0.007121,0.509609,0.072294,0.045735,-0.466276,0.527087,0.657153
4511,-2.185301,0.244975,6.948995,2.624052,-3.323208,1.375099,-1.236508,0.417845,1.165009,2.089667,-0.838387,0.495662,-1.318586,0.327632,-2.989926,-1.167088,-2.487602
4512,1.079560,-0.524212,-0.947476,0.544064,1.834533,-0.552976,1.277888,1.821024,0.506629,-1.680504,-0.344657,0.068750,-0.230578,0.130451,0.491798,-0.995991,-1.078821


In [58]:
# select_col_num = 17

# lomo=np.linalg.svd(data_df)

# my_U = lomo[0][:,:select_col_num]
# my_D=lomo[1]**2
# my_D = np.sqrt(np.diag(my_D[:select_col_num])/(data_df.shape[0]-1))
# F_df=pd.DataFrame(my_U.dot(np.diag(lomo[1][:select_col_num])))
# F_df1=pd.DataFrame(my_U.dot(my_D))
# display(F_df,F_df1)

In [12]:
# F_df=pd.DataFrame(np.matrix(data_df.T).I.dot(eig_vecs).dot(np.diag(eig_vals)))

# Cut-off values compared against the entries of factor_matrix when flagging
# connections; the summary loop below runs once per value.
network_threshold=[0.05,0.1]


In [16]:
# theta is the standard-normal quantile at probability 2 / (n - 1), where n is
# the number of rows in F_cov.
n_rows = F_cov.shape[0]
theta = norm.ppf(2 / (n_rows - 1))

# np.cov treats each ROW of its input as a variable by default, so this is an
# n-by-n covariance between rows of F_cov; it is shifted by theta and mapped
# through the standard-normal CDF.
row_covariance = np.cov(F_cov)
factor_matrix = norm.cdf(row_covariance + theta)

In [17]:
def _status_breakdown(status_values, count_col, share_col):
    """Count each status in one group of rows and its share of that group.

    Parameters
    ----------
    status_values : numpy.ndarray
        1-D array with the status of every row in the group.
    count_col, share_col : str
        Column names for the counts and for the counts divided by the group size.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Status'``, ``count_col`` and ``share_col``; one row per
        distinct status, in order of first appearance. Empty for an empty group.
    """
    counts = Counter(status_values)
    breakdown = pd.DataFrame({
        # Keep the input dtype so an empty group still merges cleanly on 'Status'.
        'Status': np.array(list(counts.keys()), dtype=status_values.dtype),
        count_col: np.array(list(counts.values()), dtype='int64'),
    })
    group_size = len(status_values)
    # An empty group has no shares to report; avoid dividing by zero.
    breakdown[share_col] = breakdown[count_col] / group_size if group_size else np.nan
    return breakdown


# For every threshold: flag each column of factor_matrix that has at least one
# entry above the threshold, save the flags to CSV, and tabulate how the status
# values split between flagged ("connect") and unflagged ("non-connect") rows.
temp_df_list = []
for threshold in network_threshold:
    # 1 when the column has any entry above the threshold, else 0.
    connect_flag = (factor_matrix > threshold).any(axis=0).astype('int64')
    pd.DataFrame({"connect_flag": connect_flag}).to_csv("../result/connect_flag_df_threshold_%s.csv" % (str(threshold)), index=True)

    # Two columns, paired by position: [flag, status].
    flag_status = np.vstack((connect_flag, data_status)).T
    connect_df = flag_status[flag_status[:, 0] == 1]
    nonconnect_df = flag_status[flag_status[:, 0] == 0]

    temp1 = _status_breakdown(connect_df[:, 1], 'Connect', 'Connect_Per')
    temp2 = _status_breakdown(nonconnect_df[:, 1], 'Non-Connect', 'NonConnect_Per')

    # Inner join on 'Status': a status missing from either group is dropped.
    temp_df = pd.merge(temp1, temp2, on='Status')
    add_row = pd.DataFrame(
        [[
            'Total',
            connect_df.shape[0],
            connect_df.shape[0] / data_df.shape[0],
            nonconnect_df.shape[0],
            nonconnect_df.shape[0] / data_df.shape[0],
        ]],
        columns=temp_df.columns,
    )
    # DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
    temp_df = pd.concat([temp_df, add_row])
    temp_df['Threshold'] = threshold
    temp_df_list.append(temp_df)

# Concatenate once after the loop instead of growing the frame per iteration.
result_df = pd.concat(temp_df_list, axis=0).set_index(['Threshold', 'Status'])

In [37]:
# Show, per threshold and status, the counts and shares of connected vs. non-connected rows.
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Connect,Connect_Per,Non-Connect,NonConnect_Per
Threshold,Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.05,1,381,0.177457,117,0.04943
0.05,0,1766,0.822543,2250,0.95057
0.05,Total,2147,0.475631,2367,0.524369
0.1,1,346,0.197714,152,0.054993
0.1,0,1404,0.802286,2612,0.945007
0.1,Total,1750,0.387683,2764,0.612317


### Feature Engineering