In [28]:
import pandas as pd
import numpy as np
import re
import string
import math
import hashlib
import os

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier



# Baseline

### Data loading

In [2]:
with open(os.path.join("data", "train.csv")) as f:
    # read the training dataset
    df = pd.read_csv(f)

with open(os.path.join("data", "test.csv")) as f:
    # read the test dataset
    X_test = pd.read_csv(f)

df['defects'] = df['defects'].apply(lambda x: 1 if x == True else 0)

X = df.drop(['defects'], axis=1)
y = df['defects']

In [3]:
df.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,0
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,0
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,0
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,1
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,0


### Data Analysis

In [4]:
X.shape

(101763, 22)

In [5]:
X.columns

Index(['id', 'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e',
       'b', 't', 'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
       'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'],
      dtype='object')

In [6]:
X.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,302.71,17,1,1,0,16.0,9.0,38.0,22.0,5.0
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,52.04,11,0,1,0,11.0,11.0,18.0,14.0,3.0
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,97.45,8,0,1,0,12.0,11.0,28.0,17.0,3.0
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,26.31,4,0,2,0,8.0,6.0,16.0,7.0,1.0
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,20.31,7,0,2,0,7.0,6.0,10.0,10.0,3.0


In [7]:
X.describe()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,...,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,50881.0,37.34716,5.492684,2.845022,3.498826,96.655995,538.280956,0.111634,13.681881,27.573007,...,1141.357982,22.802453,1.773945,3.979865,0.196604,11.896131,15.596671,57.628116,39.249698,9.839549
std,29376.592059,54.600401,7.900855,4.631262,5.534541,171.147191,1270.791601,0.100096,14.121306,22.856742,...,9862.795472,38.54101,5.902412,6.382358,0.998906,6.749549,18.064261,104.53766,71.692309,14.412769
min,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,25440.5,13.0,2.0,1.0,1.0,25.0,97.67,0.05,5.6,15.56,...,31.38,7.0,0.0,1.0,0.0,8.0,7.0,15.0,10.0,3.0
50%,50881.0,22.0,3.0,1.0,2.0,51.0,232.79,0.09,9.82,23.36,...,125.4,14.0,0.0,2.0,0.0,11.0,12.0,30.0,20.0,5.0
75%,76321.5,42.0,6.0,3.0,4.0,111.0,560.25,0.15,18.0,34.34,...,565.92,26.0,1.0,5.0,0.0,16.0,20.0,66.0,45.0,11.0
max,101762.0,3442.0,404.0,165.0,402.0,8441.0,80843.08,1.0,418.2,569.78,...,935923.39,2824.0,344.0,219.0,43.0,410.0,1026.0,5420.0,3021.0,503.0


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 101763 non-null  int64  
 1   loc                101763 non-null  float64
 2   v(g)               101763 non-null  float64
 3   ev(g)              101763 non-null  float64
 4   iv(g)              101763 non-null  float64
 5   n                  101763 non-null  float64
 6   v                  101763 non-null  float64
 7   l                  101763 non-null  float64
 8   d                  101763 non-null  float64
 9   i                  101763 non-null  float64
 10  e                  101763 non-null  float64
 11  b                  101763 non-null  float64
 12  t                  101763 non-null  float64
 13  lOCode             101763 non-null  int64  
 14  lOComment          101763 non-null  int64  
 15  lOBlank            101763 non-null  int64  
 16  lo

In [9]:
X.dtypes

id                     int64
loc                  float64
v(g)                 float64
ev(g)                float64
iv(g)                float64
n                    float64
v                    float64
l                    float64
d                    float64
i                    float64
e                    float64
b                    float64
t                    float64
lOCode                 int64
lOComment              int64
lOBlank                int64
locCodeAndComment      int64
uniq_Op              float64
uniq_Opnd            float64
total_Op             float64
total_Opnd           float64
branchCount          float64
dtype: object

### Feature Engineering

In [10]:
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [11]:
mi_scores

loc                  0.102142
lOCode               0.089382
e                    0.088606
b                    0.087201
total_Op             0.086560
t                    0.085745
v                    0.085706
total_Opnd           0.085545
n                    0.084857
uniq_Opnd            0.082237
i                    0.078249
d                    0.077600
uniq_Op              0.072636
v(g)                 0.072114
branchCount          0.070589
l                    0.069838
iv(g)                0.068042
lOBlank              0.059884
ev(g)                0.040928
lOComment            0.027367
locCodeAndComment    0.008920
id                   0.000949
Name: MI Scores, dtype: float64

In [12]:
X_corr = df.drop(columns='id', axis=1)
corr = X_corr.corr()
lower_triangle_mask = np.triu(np.ones(corr.shape)).astype(bool)
cor_mat_lower = corr.mask(lower_triangle_mask)

In [13]:
# fig = ff.create_annotated_heatmap(z=cor_mat_lower.to_numpy(), 
#                                   x=cor_mat_lower.columns.tolist(),
#                                   y=cor_mat_lower.columns.tolist(),
#                                   colorscale=px.colors.diverging.RdBu,
                        
#                                   showscale=True, ygap=1, xgap=1
#                                  )

# fig.update_xaxes(side="bottom")

# fig.update_layout(
#     title_text='Heatmap', 
#     title_x=0.5, 
#     width=1000, 
#     height=1000,
#     xaxis_showgrid=False,
#     yaxis_showgrid=False,
#     xaxis_zeroline=False,
#     yaxis_zeroline=False,
#     font=dict(
#         family="Courier New, monospace",
#         size=10,  # Set the font size here
#         color="RebeccaPurple"
#     ),
#     yaxis_autorange='reversed',
#     template='plotly_white'
# )

# # NaN values are not handled automatically and are displayed in the figure
# # So we need to get rid of the text manually
# for i in range(len(fig.layout.annotations)):
#     if fig.layout.annotations[i].text == 'nan':
#         fig.layout.annotations[i].text = ""

# fig.show()

In [25]:
nice_corrs = []
cols = X_corr.columns
for i in cols:
    # using a 0.25 threshold to decrease the number of features
    corr_i_j = corr[abs(corr)>0.25].loc['defects',i]
    if not np.isnan(corr_i_j) and i != 'defects':
        nice_corrs.append(i)


print(nice_corrs)
print(len(nice_corrs))

['loc', 'v(g)', 'ev(g)', 'n', 'l', 'lOCode', 'lOBlank', 'total_Op', 'total_Opnd', 'branchCount']
10


In [26]:
df[nice_corrs].describe()

Unnamed: 0,loc,v(g),ev(g),n,l,lOCode,lOBlank,total_Op,total_Opnd,branchCount
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,37.34716,5.492684,2.845022,96.655995,0.111634,22.802453,3.979865,57.628116,39.249698,9.839549
std,54.600401,7.900855,4.631262,171.147191,0.100096,38.54101,6.382358,104.53766,71.692309,14.412769
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,13.0,2.0,1.0,25.0,0.05,7.0,1.0,15.0,10.0,3.0
50%,22.0,3.0,1.0,51.0,0.09,14.0,2.0,30.0,20.0,5.0
75%,42.0,6.0,3.0,111.0,0.15,26.0,5.0,66.0,45.0,11.0
max,3442.0,404.0,165.0,8441.0,1.0,2824.0,219.0,5420.0,3021.0,503.0


In [16]:
# for i in ['loc', 'v(g)', 'ev(g)', 'n', 'l', 'lOCode', 'lOBlank', 'total_Op', 'total_Opnd', 'branchCount']:
#     fig = px.box(y=np.log(X[i]), x=y)
#     fig.update_layout(
#         title_text='log({})'.format(i),
#         width=1000, height=600
#     )

#     fig.update_xaxes(title_text="Defects")
#     fig.update_yaxes(title_text='log({})'.format(i))

#     img_name = 'images/{}.png'.format(i)
#     fig.write_image(img_name,width=1000, height=500)

In [17]:
#Selecting relevant features and scaling them
# X_proc = X[nice_corrs]
# X_proc = X_proc.drop(['defects'], axis = 1)
# scaler = preprocessing.MinMaxScaler()
# d = scaler.fit_transform(X_proc)
# scaled_X_proc = pd.DataFrame(d, columns=X_proc.columns)
## Plots of the distribution of the scaled relevant variables
# scaled_X_corr_box = scaled_X_proc.drop(columns=['defects'])
# fig = px.box(scaled_X_corr_box.melt(), y="value", facet_col="variable", boxmode="overlay", color="variable")
# fig.update_yaxes(matches=None)

# for i in range(len(fig["data"])):
#     yaxis_name = 'yaxis' if i == 0 else f'yaxis{i + 1}'
#     fig.layout[yaxis_name].showticklabels = True

# fig.update_layout(legend = dict(bgcolor = 'white'))
# fig.update_layout(plot_bgcolor='white')

# fig.update_xaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)
# fig.update_yaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)

# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray')
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

# fig.show()

### Modelling

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)
print(X_train.shape)
print(X_val.shape)

(71234, 22)
(30529, 22)


In [27]:
# Simple processing using a MinMax scaler
process = preprocessing.MinMaxScaler()
rf = RandomForestClassifier(max_depth = 3,class_weight="balanced", min_samples_leaf = .03, random_state=42, n_jobs=-1)


pipeline_lr = Pipeline(steps=[('processing',process),
                           ('clf',rf)])

features = nice_corrs

X_train = X_train[features]

pipeline_lr.fit(X_train, y_train)

Pipeline(steps=[('processing', MinMaxScaler()),
                ('clf',
                 RandomForestClassifier(class_weight='balanced', max_depth=3,
                                        min_samples_leaf=0.03, n_jobs=-1,
                                        random_state=42))])

In [20]:
y_probas = pipeline_lr.predict_proba(X_val[features])
y_val_proba_1 = pd.DataFrame(y_probas,columns=['No defect','Defect'])
y_val_pred = y_val_proba_1.Defect.apply(lambda x: 1 if x>0.6 else 0)

In [21]:
roc_auc_score(y_val, y_val_pred)

0.7141511520258644

In [22]:
confusion_matrix(y_true=y_val,y_pred=y_val_pred)

array([[20105,  3473],
       [ 2950,  4001]])

In [41]:
precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba_1.Defect)
precision = precision[:-1]
recall = recall[:-1]


fig = make_subplots(rows=1, cols=1)

fig.add_scatter(x=thresholds, y=precision, name = "Precision",row=1, col=1)

fig.add_scatter(x=thresholds, y=recall, name = "Recall",row=1, col=1)

fig.update_layout(height=400, width=750)#, title_text=f'Precision-recall of {pipe_dict[i]}',title_font_family= "Arial",)
fig.update_xaxes(range=[0, 1], row=1, col=1)
fig.update_xaxes(title_text="Threshold", range=[0, 1],  row=1, col=1)
fig.show()

In [42]:
fpr, tpr, thresholds = roc_curve(y_val,y_val_pred)


fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.update_layout(
                uniformtext_minsize=8, uniformtext_mode='hide',
                title_font_family= "Arial",
                width=500,height=500)

fig.show()

### Preparing submission file

In [23]:
y_test_probas = pipeline_lr.predict_proba(X_test[features])
y_test_proba_1 = pd.DataFrame(y_test_probas,columns=['No defect','Defect'])
y_test_pred = y_test_proba_1.Defect.apply(lambda x: 1 if x>0.6 else 0)

In [24]:
# output = pd.DataFrame({'id': X_test.id,
#                        'defects': y_test_pred})
# output.to_csv('10132023_submission.csv', index=False)