In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
import hashlib
import os

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier



# Baseline

### Data loading

In [2]:
with open(os.path.join("data", "train.csv")) as f:
    # read the training dataset
    df = pd.read_csv(f)

with open(os.path.join("data", "test.csv")) as f:
    # read the test dataset
    X_test = pd.read_csv(f)

df['defects'] = df['defects'].apply(lambda x: 1 if x == True else 0)

X = df.drop(['defects'], axis=1)
y = df['defects']

In [3]:
df.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,0
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,0
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,0
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,1
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,0


### Data Analysis

In [4]:
X.shape

(101763, 22)

In [5]:
X.columns

Index(['id', 'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e',
       'b', 't', 'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
       'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'],
      dtype='object')

In [6]:
X.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,302.71,17,1,1,0,16.0,9.0,38.0,22.0,5.0
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,52.04,11,0,1,0,11.0,11.0,18.0,14.0,3.0
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,97.45,8,0,1,0,12.0,11.0,28.0,17.0,3.0
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,26.31,4,0,2,0,8.0,6.0,16.0,7.0,1.0
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,20.31,7,0,2,0,7.0,6.0,10.0,10.0,3.0


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 101763 non-null  int64  
 1   loc                101763 non-null  float64
 2   v(g)               101763 non-null  float64
 3   ev(g)              101763 non-null  float64
 4   iv(g)              101763 non-null  float64
 5   n                  101763 non-null  float64
 6   v                  101763 non-null  float64
 7   l                  101763 non-null  float64
 8   d                  101763 non-null  float64
 9   i                  101763 non-null  float64
 10  e                  101763 non-null  float64
 11  b                  101763 non-null  float64
 12  t                  101763 non-null  float64
 13  lOCode             101763 non-null  int64  
 14  lOComment          101763 non-null  int64  
 15  lOBlank            101763 non-null  int64  
 16  lo

In [9]:
X.dtypes

id                     int64
loc                  float64
v(g)                 float64
ev(g)                float64
iv(g)                float64
n                    float64
v                    float64
l                    float64
d                    float64
i                    float64
e                    float64
b                    float64
t                    float64
lOCode                 int64
lOComment              int64
lOBlank                int64
locCodeAndComment      int64
uniq_Op              float64
uniq_Opnd            float64
total_Op             float64
total_Opnd           float64
branchCount          float64
dtype: object

### Modelling

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state=42)
print(X_train.shape)
print(X_val.shape)

(71234, 22)
(30529, 22)


In [19]:
# Simple processing using a MinMax scaler
process = preprocessing.MinMaxScaler()
rf = RandomForestClassifier(max_depth = 3,class_weight="balanced", min_samples_leaf = .03, random_state=42, n_jobs=-1)


pipeline_lr = Pipeline(steps=[('processing',process),
                           ('clf',rf)])

features = nice_corrs

#X_train = X_train[features]

pipeline_lr.fit(X_train, y_train)

Pipeline(steps=[('processing', MinMaxScaler()),
                ('clf',
                 RandomForestClassifier(class_weight='balanced', max_depth=3,
                                        min_samples_leaf=0.03, n_jobs=-1,
                                        random_state=42))])

In [20]:
y_probas = pipeline_lr.predict_proba(X_val)
y_val_proba_1 = pd.DataFrame(y_probas,columns=['No defect','Defect'])
y_val_pred = y_val_proba_1.Defect.apply(lambda x: 1 if x>0.6 else 0)

In [21]:
roc_auc_score(y_val, y_val_pred)

0.7092841976354052

In [22]:
confusion_matrix(y_true=y_val,y_pred=y_val_pred)

array([[19945,  3650],
       [ 2959,  3975]])

In [23]:
precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba_1.Defect)
precision = precision[:-1]
recall = recall[:-1]


fig = make_subplots(rows=1, cols=1)

fig.add_scatter(x=thresholds, y=precision, name = "Precision",row=1, col=1)

fig.add_scatter(x=thresholds, y=recall, name = "Recall",row=1, col=1)

fig.update_layout(height=400, width=750)#, title_text=f'Precision-recall of {pipe_dict[i]}',title_font_family= "Arial",)
fig.update_xaxes(range=[0, 1], row=1, col=1)
fig.update_xaxes(title_text="Threshold", range=[0, 1],  row=1, col=1)
fig.show()

In [24]:
fpr, tpr, thresholds = roc_curve(y_val,y_val_pred)


fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.update_layout(
                uniformtext_minsize=8, uniformtext_mode='hide',
                title_font_family= "Arial",
                width=500,height=500)

fig.show()

### Preparing submission file

In [26]:
y_test_probas = pipeline_lr.predict_proba(X_test)
y_test_proba_1 = pd.DataFrame(y_test_probas,columns=['No defect','Defect'])
y_test_pred = y_test_proba_1.Defect.apply(lambda x: 1 if x>0.6 else 0)

In [27]:
output = pd.DataFrame({'id': X_test.id,
                       'defects': y_test_pred})
output.to_csv('10192023_submission.csv', index=False)