In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
import hashlib
import os

import json
import zipfile
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing



# EDA

### Data loading

In [2]:
with open(os.path.join("data", "train.csv")) as f:
    # read the training dataset
    X = pd.read_csv(f)

with open(os.path.join("data", "test.csv")) as f:
    # read the test dataset
    X_test = pd.read_csv(f)

In [3]:
X.shape

(101763, 23)

In [4]:
X.columns

Index(['id', 'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e',
       'b', 't', 'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
       'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount',
       'defects'],
      dtype='object')

In [5]:
X.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False


In [6]:
X.describe()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,...,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,50881.0,37.34716,5.492684,2.845022,3.498826,96.655995,538.280956,0.111634,13.681881,27.573007,...,1141.357982,22.802453,1.773945,3.979865,0.196604,11.896131,15.596671,57.628116,39.249698,9.839549
std,29376.592059,54.600401,7.900855,4.631262,5.534541,171.147191,1270.791601,0.100096,14.121306,22.856742,...,9862.795472,38.54101,5.902412,6.382358,0.998906,6.749549,18.064261,104.53766,71.692309,14.412769
min,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,25440.5,13.0,2.0,1.0,1.0,25.0,97.67,0.05,5.6,15.56,...,31.38,7.0,0.0,1.0,0.0,8.0,7.0,15.0,10.0,3.0
50%,50881.0,22.0,3.0,1.0,2.0,51.0,232.79,0.09,9.82,23.36,...,125.4,14.0,0.0,2.0,0.0,11.0,12.0,30.0,20.0,5.0
75%,76321.5,42.0,6.0,3.0,4.0,111.0,560.25,0.15,18.0,34.34,...,565.92,26.0,1.0,5.0,0.0,16.0,20.0,66.0,45.0,11.0
max,101762.0,3442.0,404.0,165.0,402.0,8441.0,80843.08,1.0,418.2,569.78,...,935923.39,2824.0,344.0,219.0,43.0,410.0,1026.0,5420.0,3021.0,503.0


In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 101763 non-null  int64  
 1   loc                101763 non-null  float64
 2   v(g)               101763 non-null  float64
 3   ev(g)              101763 non-null  float64
 4   iv(g)              101763 non-null  float64
 5   n                  101763 non-null  float64
 6   v                  101763 non-null  float64
 7   l                  101763 non-null  float64
 8   d                  101763 non-null  float64
 9   i                  101763 non-null  float64
 10  e                  101763 non-null  float64
 11  b                  101763 non-null  float64
 12  t                  101763 non-null  float64
 13  lOCode             101763 non-null  int64  
 14  lOComment          101763 non-null  int64  
 15  lOBlank            101763 non-null  int64  
 16  lo

In [8]:
y = X['defects']

In [9]:
mutual_info_classif(X.drop('defects', axis = 1), y)

array([0.00060093, 0.10133288, 0.07164296, 0.04017473, 0.06787699,
       0.08612116, 0.0874376 , 0.07082114, 0.07752948, 0.07641732,
       0.08582812, 0.08686004, 0.08376068, 0.08828241, 0.02616162,
       0.05869043, 0.01060252, 0.07072522, 0.0824354 , 0.08614124,
       0.08456706, 0.07015641])

In [10]:
mi_scores = mutual_info_classif(X.drop('defects', axis = 1), y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.drop('defects', axis = 1).columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [11]:
mi_scores

loc                  0.102985
lOCode               0.090016
n                    0.088310
total_Op             0.087388
v                    0.086428
b                    0.085569
t                    0.085548
e                    0.085155
total_Opnd           0.085094
uniq_Opnd            0.082804
d                    0.077479
i                    0.075564
uniq_Op              0.072604
branchCount          0.070420
v(g)                 0.069687
l                    0.068913
iv(g)                0.068112
lOBlank              0.060990
ev(g)                0.039889
lOComment            0.027568
locCodeAndComment    0.008263
id                   0.000420
Name: MI Scores, dtype: float64

In [12]:
X.dtypes

id                     int64
loc                  float64
v(g)                 float64
ev(g)                float64
iv(g)                float64
n                    float64
v                    float64
l                    float64
d                    float64
i                    float64
e                    float64
b                    float64
t                    float64
lOCode                 int64
lOComment              int64
lOBlank                int64
locCodeAndComment      int64
uniq_Op              float64
uniq_Opnd            float64
total_Op             float64
total_Opnd           float64
branchCount          float64
defects                 bool
dtype: object

In [17]:

X_corr = X.drop(columns='id', axis=1)
corr = X_corr.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
df_mask = corr.mask(mask)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdBu,
                                  hoverinfo="none", #Shows hoverinfo for null values
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='Heatmap', 
    title_x=0.5, 
    width=1000, 
    height=1000,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

# NaN values are not handled automatically and are displayed in the figure
# So we need to get rid of the text manually
for i in range(len(fig.layout.annotations)):
    if fig.layout.annotations[i].text == 'nan':
        fig.layout.annotations[i].text = ""

fig.show()

In [23]:
nice_corrs = []
cols = X_corr.columns
for i in cols:
    corr_i_j = corr[abs(corr)>0.3].loc['defects',i]
    if not np.isnan(corr_i_j):
        nice_corrs.append(i)

print(len(set(nice_corrs)))

4


In [24]:
nice_corrs

['loc', 'v(g)', 'branchCount', 'defects']

In [None]:
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(df_corr)
scaled_df_corr = pd.DataFrame(d, columns=df_corr.columns)

In [None]:
nice_corrs = []
cols = scaled_df_corr.columns
corr_sc = scaled_df_corr.corr()
for i in cols:
    corr_i_j = corr_sc[abs(corr_sc)>0.2].loc['Class',i]
    if not np.isnan(corr_i_j):
        nice_corrs.append(i)

corrstosee = list(set(nice_corrs))


df_corr = scaled_df_corr[corrstosee]
corr = df_corr.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
df_mask = corr.mask(mask)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdBu,
                                  hoverinfo="none", #Shows hoverinfo for null values
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='Heatmap', 
    title_x=0.5, 
    width=1000, 
    height=1000,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

# NaN values are not handled automatically and are displayed in the figure
# So we need to get rid of the text manually
for i in range(len(fig.layout.annotations)):
    if fig.layout.annotations[i].text == 'nan':
        fig.layout.annotations[i].text = ""

fig.show()

In [None]:
df.Class.value_counts()

Class
0    509
1    108
Name: count, dtype: int64

In [None]:
df_corr_box = df_corr.drop(columns=['Class'])
fig = px.box(df_corr_box.melt(), y="value", facet_col="variable", boxmode="overlay", color="variable")
fig.update_yaxes(matches=None)

for i in range(len(fig["data"])):
    yaxis_name = 'yaxis' if i == 0 else f'yaxis{i + 1}'
    fig.layout[yaxis_name].showticklabels = True

fig.update_layout(legend = dict(bgcolor = 'white'))
fig.update_layout(plot_bgcolor='white')

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

fig.show()

In [None]:
df_corr_true = df_corr[df_corr['Class']==1].drop(columns=['Class'])
fig = px.box(df_corr_true.melt(), y="value", facet_col="variable", boxmode="overlay", color="variable")
fig.update_yaxes(matches=None)

for i in range(len(fig["data"])):
    yaxis_name = 'yaxis' if i == 0 else f'yaxis{i + 1}'
    fig.layout[yaxis_name].showticklabels = True

fig.update_layout(legend = dict(bgcolor = 'white'))
fig.update_layout(plot_bgcolor='white')

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black',range=[0,1])#, mirror=True)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

fig.show()

In [None]:
df_corr_false = df_corr[df_corr['Class']==0].drop(columns=['Class'])
fig = px.box(df_corr_false.melt(), y="value", facet_col="variable", boxmode="overlay", color="variable")
fig.update_yaxes(matches=None)

for i in range(len(fig["data"])):
    yaxis_name = 'yaxis' if i == 0 else f'yaxis{i + 1}'
    fig.layout[yaxis_name].showticklabels = True

fig.update_layout(legend = dict(bgcolor = 'white'))
fig.update_layout(plot_bgcolor='white')

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black',range=[0,1])#, mirror=True)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

fig.show()

In [None]:
corr[abs(corr)>0.1]

Unnamed: 0,AF,DH,DU,Class,AM,DA,FE,BQ,FL,AB,DI,CR
AF,1.0,,0.110287,0.302638,0.183961,-0.127525,,0.3093,0.148289,0.350231,0.330857,
DH,,1.0,,-0.206839,-0.202972,0.190268,,,,-0.204446,0.143224,
DU,0.110287,,1.0,0.261,,,,0.124323,0.607507,,,
Class,0.302638,-0.206839,0.261,1.0,0.23917,-0.204612,0.216359,0.281257,0.244185,0.280612,0.26076,-0.227547
AM,0.183961,-0.202972,,0.23917,1.0,-0.255359,,0.260038,0.187974,0.530687,0.34755,
DA,-0.127525,0.190268,,-0.204612,-0.255359,1.0,,,,-0.183127,-0.182527,
FE,,,,0.216359,,,1.0,,,,0.130497,
BQ,0.3093,,0.124323,0.281257,0.260038,,,1.0,0.248188,0.23727,0.187289,
FL,0.148289,,0.607507,0.244185,0.187974,,,0.248188,1.0,0.169934,0.133871,
AB,0.350231,-0.204446,,0.280612,0.530687,-0.183127,,0.23727,0.169934,1.0,0.343071,


In [None]:
abs(corr)

Unnamed: 0,AF,DH,DU,Class,AM,DA,FE,BQ,FL,AB,DI,CR
AF,1.0,0.027899,0.110287,0.302638,0.183961,0.127525,0.080458,0.3093,0.148289,0.350231,0.330857,0.03585
DH,0.027899,1.0,0.089048,0.206839,0.202972,0.190268,0.022109,0.001715,0.08241,0.204446,0.143224,0.016067
DU,0.110287,0.089048,1.0,0.261,0.038487,0.018249,0.038461,0.124323,0.607507,0.059821,0.011047,0.030287
Class,0.302638,0.206839,0.261,1.0,0.23917,0.204612,0.216359,0.281257,0.244185,0.280612,0.26076,0.227547
AM,0.183961,0.202972,0.038487,0.23917,1.0,0.255359,0.07717,0.260038,0.187974,0.530687,0.34755,0.026143
DA,0.127525,0.190268,0.018249,0.204612,0.255359,1.0,0.06635,0.008726,0.068561,0.183127,0.182527,0.04046
FE,0.080458,0.022109,0.038461,0.216359,0.07717,0.06635,1.0,0.007478,0.001135,0.041684,0.130497,0.096554
BQ,0.3093,0.001715,0.124323,0.281257,0.260038,0.008726,0.007478,1.0,0.248188,0.23727,0.187289,0.027746
FL,0.148289,0.08241,0.607507,0.244185,0.187974,0.068561,0.001135,0.248188,1.0,0.169934,0.133871,0.011756
AB,0.350231,0.204446,0.059821,0.280612,0.530687,0.183127,0.041684,0.23727,0.169934,1.0,0.343071,0.005741
