In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
import hashlib
import os

from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,auc,precision_score,recall_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb


### Data loading

In [2]:
with open(os.path.join("data", "train.csv")) as f:
    # read the training dataset
    df = pd.read_csv(f)

with open(os.path.join("data", "test.csv")) as f:
    # read the test dataset
    X_test = pd.read_csv(f)

df['defects'] = df['defects'].apply(lambda x: 1 if x == True else 0)

X = df.drop(['defects'], axis=1)
y = df['defects']

### Baseline Model

In [3]:
nice_corrs = ['loc', 'v(g)', 'ev(g)', 'n', 'l', 'lOCode', 'lOBlank', 'total_Op', 'total_Opnd', 'branchCount']
process = preprocessing.MinMaxScaler()

rf = RandomForestClassifier(max_depth = 3,class_weight="balanced", min_samples_leaf = .03, random_state=42, n_jobs=-1)

pipeline_base = Pipeline(steps=[('processing',process),
                           ('clf',rf)])

features = nice_corrs
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

X_train_baseline = X_train[features]

pipeline_base.fit(X_train_baseline, y_train)

Pipeline(steps=[('processing', MinMaxScaler()),
                ('clf',
                 RandomForestClassifier(class_weight='balanced', max_depth=3,
                                        min_samples_leaf=0.03, n_jobs=-1,
                                        random_state=42))])

In [4]:
y_probas = pipeline_base.predict_proba(X_val[features])
y_val_proba_1 = pd.DataFrame(y_probas,columns=['No defect','Defect'])
y_val_pred = y_val_proba_1.Defect.apply(lambda x: 1 if x>0.6 else 0)
print('ROC score of {}: {}'.format('baseline',roc_auc_score(y_val, y_val_pred)))

ROC score of baseline: 0.7157187446199188


# Model improvement

In [5]:
pipeline_logr = Pipeline(steps=[('processing',process),
                     ('classifier_LogisticRegression', LogisticRegression(solver='liblinear', random_state=42))])
pipeline_light = Pipeline(steps=[('processing',process),
                     ('classifier_light', lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42))])


pipelines = [pipeline_light, pipeline_logr]
pipe_dict = {0: 'LightGBM', 1:'Logistic Regression'}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

best_precision = 0.0
best_classifier = 0
best_ROC = 0
best_pipeline = ''
prediction_dict={}

for i, model in enumerate (pipelines) : 
    y_scores = model.predict_proba(X_val)#[:,1]
    y_val_proba_1 = pd.DataFrame(y_scores,columns=['No_defects','Defects'])
    y_val_pred = y_val_proba_1.Defects.apply(lambda x: 1 if x>0.6 else 0)
    # precision, recall, thresholds = precision_recall_curve(y_test_true, y_scores)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    roc = roc_auc_score(y_val, y_val_pred)

    print(f'{pipe_dict[i]}, recall: {recall}; precision: {precision}; roc: {roc} ')
    prediction_dict[pipe_dict[i]]=[y_val_pred, precision]

    if roc > best_ROC:
        best_ROC = roc
        best_pipeline = model
        best_classifier = i
print('Best precision with classifier {}'.format(pipe_dict[best_classifier]))

LightGBM, recall: 0.2638788752703677; precision: 0.7038461538461539; roc: 0.6156217297433471 
Logistic Regression, recall: 0.1299206921413122; precision: 0.7271993543179984; roc: 0.5577975080609925 
Best precision with classifier LightGBM
