In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
import hashlib
import os

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

### Data Loading

In [2]:
# Locations of the train/test CSV files, relative to the working directory.
train_path = os.path.join("data", "train.csv")
test_path = os.path.join("data", "test.csv")

# Training dataset: feature columns plus the `defects` target.
with open(train_path) as train_file:
    df = pd.read_csv(train_file)

# Test dataset.
with open(test_path) as test_file:
    X_test = pd.read_csv(test_file)

# Encode the target as integers: 1 where the value equals True, 0 otherwise.
df['defects'] = df['defects'].eq(True).astype('int64')

# Separate the target vector from the feature matrix.
y = df['defects']
X = df.drop(columns=['defects'])

# Feature Engineering

### Data Analysis

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column of the feature matrix.
X.describe()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,...,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,50881.0,37.34716,5.492684,2.845022,3.498826,96.655995,538.280956,0.111634,13.681881,27.573007,...,1141.357982,22.802453,1.773945,3.979865,0.196604,11.896131,15.596671,57.628116,39.249698,9.839549
std,29376.592059,54.600401,7.900855,4.631262,5.534541,171.147191,1270.791601,0.100096,14.121306,22.856742,...,9862.795472,38.54101,5.902412,6.382358,0.998906,6.749549,18.064261,104.53766,71.692309,14.412769
min,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,25440.5,13.0,2.0,1.0,1.0,25.0,97.67,0.05,5.6,15.56,...,31.38,7.0,0.0,1.0,0.0,8.0,7.0,15.0,10.0,3.0
50%,50881.0,22.0,3.0,1.0,2.0,51.0,232.79,0.09,9.82,23.36,...,125.4,14.0,0.0,2.0,0.0,11.0,12.0,30.0,20.0,5.0
75%,76321.5,42.0,6.0,3.0,4.0,111.0,560.25,0.15,18.0,34.34,...,565.92,26.0,1.0,5.0,0.0,16.0,20.0,66.0,45.0,11.0
max,101762.0,3442.0,404.0,165.0,402.0,8441.0,80843.08,1.0,418.2,569.78,...,935923.39,2824.0,344.0,219.0,43.0,410.0,1026.0,5420.0,3021.0,503.0


In [None]:
# Mutual information between each column of X and the binary target y.
# The estimator is randomised (it adds small noise to continuous features), so
# the seed is pinned to keep the scores and their ranking reproducible across
# "Restart Kernel -> Run All".
# NOTE(review): X still contains the `id` column, so it is scored as well —
# confirm whether a row identifier should be ranked as a feature.
mi_values = mutual_info_classif(X, y, random_state=42)

# Label the scores with the column names and order them from most to least
# informative.
mi_scores = pd.Series(mi_values, name="MI Scores", index=X.columns).sort_values(ascending=False)

In [None]:
# Display the mutual-information scores (sorted in descending order).
mi_scores

In [None]:
# Pairwise correlations between every column except the row identifier.
# The `defects` target is deliberately kept in the frame so that each feature
# can be compared against it below.
X_corr = df.drop(columns='id')
corr = X_corr.corr()

# np.triu flags the diagonal and everything above it; masking those cells
# leaves only the strictly-lower triangle, i.e. each column pair exactly once.
# (Kept for plotting a correlation heatmap.)
lower_triangle_mask = np.triu(np.ones(corr.shape)).astype(bool)
cor_mat_lower = corr.mask(lower_triangle_mask)

# Minimum absolute correlation with the target for a feature to be retained;
# used to decrease the number of features.
CORR_THRESHOLD = 0.25

cols = X_corr.columns

# Correlation of every feature with the target (the target's own entry is
# removed), filtered in a single vectorised step instead of re-masking the
# whole correlation matrix once per column. NaN correlations compare as False
# and are therefore excluded.
target_corr = corr.loc['defects'].drop('defects')
nice_corrs = target_corr[target_corr.abs() > CORR_THRESHOLD].index.tolist()

print(nice_corrs)
print(len(nice_corrs))



In [None]:
# NOTE(review): this whole cell is disabled — every line below is commented out,
# so running it has no effect.
# If it is re-enabled: X was built without the 'defects' column and nice_corrs
# never contains 'defects', so both `.drop(... 'defects' ...)` calls below would
# raise a KeyError and need to be removed first.
#Selecting relevant features and scaling them
# X_proc = X[nice_corrs]
# X_proc = X_proc.drop(['defects'], axis = 1)
# scaler = preprocessing.MinMaxScaler()
# d = scaler.fit_transform(X_proc)
# scaled_X_proc = pd.DataFrame(d, columns=X_proc.columns)
## Plots of the distribution of the scaled relevant variables
# scaled_X_corr_box = scaled_X_proc.drop(columns=['defects'])
# fig = px.box(scaled_X_corr_box.melt(), y="value", facet_col="variable", boxmode="overlay", color="variable")
# fig.update_yaxes(matches=None)

# for i in range(len(fig["data"])):
#     yaxis_name = 'yaxis' if i == 0 else f'yaxis{i + 1}'
#     fig.layout[yaxis_name].showticklabels = True

# fig.update_layout(legend = dict(bgcolor = 'white'))
# fig.update_layout(plot_bgcolor='white')

# fig.update_xaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)
# fig.update_yaxes(showline=True, linewidth=2, linecolor='black')#, mirror=True)

# fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray')
# fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

# fig.show()

### Baseline

In [None]:
### Modelling
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_val.shape)

# Simple processing using a MinMax scaler, followed by a shallow,
# class-balanced random forest.
process = preprocessing.MinMaxScaler()
rf = RandomForestClassifier(
    max_depth=3,
    class_weight="balanced",
    min_samples_leaf=.03,
    random_state=42,
    n_jobs=-1,
)

# NOTE: despite the `_lr` suffix, this pipeline wraps the random forest above.
pipeline_lr = Pipeline(steps=[('processing', process),
                              ('clf', rf)])

# Correlation-selected feature subset. It is recorded here but NOT applied:
# the baseline is trained on every column of X.
# NOTE(review): X still contains the `id` column, so the row identifier is fed
# to the model — confirm whether that is intended.
features = nice_corrs

pipeline_lr.fit(X_train, y_train)

# predict_proba returns one column per class, ordered as the fitted classes
# (0 = no defect, 1 = defect). The validation index is reused so the frame is
# label-aligned with y_val instead of carrying a fresh 0..n-1 index.
y_probas = pipeline_lr.predict_proba(X_val)
y_val_proba_1 = pd.DataFrame(y_probas, columns=['No defect', 'Defect'], index=X_val.index)

# Hard 0/1 predictions at a fixed decision threshold (kept for threshold-based
# metrics such as a confusion matrix).
DEFECT_THRESHOLD = 0.6
y_val_pred = (y_val_proba_1['Defect'] > DEFECT_THRESHOLD).astype('int64')

# ROC AUC is a ranking metric: it must be computed from the predicted
# probabilities. Scoring the thresholded 0/1 labels collapses the curve to a
# single operating point and misreports the model's discriminative power.
roc_auc_score(y_val, y_val_proba_1['Defect'])