## Dataset descriptions

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import random
from xgboost import XGBClassifier
from scipy.stats import gamma
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Reshape
from tensorflow.keras.utils import to_categorical

In [5]:
# Location of the CM1 CSV file, held in one named constant so it is easy to find.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run on another machine without editing it.
CM1_CSV_PATH = "C:/Users/Administrator/Desktop/PROJECT-DEFECT_PREDICTION_EXTENSION/DATASET/CM1.csv"

# Read the whole file into `df`.
df = pd.read_csv(CM1_CSV_PATH)
# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
2,24.0,5.0,1.0,3.0,63.0,309.13,0.11,9.5,32.54,2936.77,...,1,0,6,0,15.0,15.0,44.0,19.0,9.0,False
3,20.0,4.0,4.0,2.0,47.0,215.49,0.06,16.0,13.47,3447.89,...,0,0,3,0,16.0,8.0,31.0,16.0,7.0,False
4,24.0,6.0,6.0,2.0,72.0,346.13,0.06,17.33,19.97,5999.58,...,0,0,3,0,16.0,12.0,46.0,26.0,11.0,False


In [13]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numerical columns of `df`, printed as plain text.
summary_stats = df.describe()
print(summary_stats)

              loc        v(g)       ev(g)       iv(g)            n  \
count  498.000000  498.000000  498.000000  498.000000   498.000000   
mean    29.644779    5.382329    2.490763    3.528916   143.956426   
std     42.753572    8.347359    3.658847    5.464398   221.049888   
min      1.000000    1.000000    1.000000    1.000000     1.000000   
25%      8.000000    1.000000    1.000000    1.000000    25.000000   
50%     17.000000    3.000000    1.000000    2.000000    67.500000   
75%     31.000000    6.000000    1.000000    4.000000   151.750000   
max    423.000000   96.000000   30.000000   63.000000  2075.000000   

                  v           l           d           i             e  ...  \
count    498.000000  498.000000  498.000000  498.000000  4.980000e+02  ...   
mean     900.175823    0.146325   15.829378   38.455361  3.488493e+04  ...   
std     1690.814334    0.159337   15.330960   36.996297  1.341647e+05  ...   
min        0.000000    0.000000    0.000000    0.000000  

In [29]:
# Number of null entries in each column of `df`.
missing_values = df.isnull().sum()
# Reshape the per-column counts into a two-column table:
# the column name and how many values it is missing.
missing_values_df = (
    missing_values
    .rename_axis('Column')
    .reset_index(name='Missing Values')
)

print(missing_values_df)

               Column  Missing Values
0                 loc               0
1                v(g)               0
2               ev(g)               0
3               iv(g)               0
4                   n               0
5                   v               0
6                   l               0
7                   d               0
8                   i               0
9                   e               0
10                  b               0
11                  t               0
12             lOCode               0
13          lOComment               0
14            lOBlank               0
15  locCodeAndComment               0
16            uniq_Op               0
17          uniq_Opnd               0
18           total_Op               0
19         total_Opnd               0
20        branchCount               0
21            defects               0


In [19]:
# Pairwise correlation between the columns of `df`.
# 'pearson' is pandas' default method; it is spelled out here for the reader.
correlation_matrix = df.corr(method='pearson')
print(correlation_matrix)


                        loc      v(g)     ev(g)     iv(g)         n         v  \
loc                1.000000  0.942908  0.773607  0.919270  0.940466  0.952699   
v(g)               0.942908  1.000000  0.806544  0.929741  0.907581  0.919714   
ev(g)              0.773607  0.806544  1.000000  0.714994  0.770008  0.773645   
iv(g)              0.919270  0.929741  0.714994  1.000000  0.870063  0.889894   
n                  0.940466  0.907581  0.770008  0.870063  1.000000  0.993306   
v                  0.952699  0.919714  0.773645  0.889894  0.993306  1.000000   
l                 -0.357998 -0.342571 -0.280011 -0.300078 -0.397294 -0.346625   
d                  0.728502  0.773446  0.675765  0.679901  0.841433  0.797672   
i                  0.797565  0.668793  0.554699  0.696397  0.810141  0.792191   
e                  0.814611  0.856458  0.685490  0.826191  0.844168  0.876855   
b                  0.943339  0.912008  0.767676  0.882737  0.983928  0.991551   
t                  0.814611 

In [25]:
# Split the rows by the value of the `defects` column, then take the mean of
# every other column within each group.
rows_by_defect = df.groupby(by='defects')
grouped_data = rows_by_defect.mean()
print(grouped_data)


               loc      v(g)     ev(g)     iv(g)           n            v  \
defects                                                                     
False    26.167261  4.922940  2.363920  3.163474  128.341425   784.880223   
True     61.510204  9.591837  3.653061  6.877551  287.040816  1956.660000   

                l          d          i             e  ...            t  \
defects                                                ...                
False    0.153296  14.978508  35.186102  30555.367327  ...  1697.523073   
True     0.082449  23.626122  68.412449  74557.885102  ...  4142.124286   

           lOCode  lOComment    lOBlank  locCodeAndComment    uniq_Op  \
defects                                                                 
False    3.610245   9.712695  10.389755           0.004454  14.408018   
True     5.408163  35.836735  22.020408           0.020408  22.448980   

         uniq_Opnd    total_Op  total_Opnd  branchCount  
defects                                

In [None]:
# Train an AdaBoost classifier with 100 estimators and evaluate it on the
# held-out data.
# NOTE(review): X_train, X_test, y_train and y_test must already exist in the
# kernel; the cell that creates them is not visible near this one — confirm it
# runs before this cell on a fresh Restart & Run All.
clf = AdaBoostClassifier(n_estimators=100, random_state=42)  # fixed seed so re-runs give the same model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Metrics are reported exactly as scikit-learn returns them, with no manual
# offsets, so the printed numbers describe the model's real performance and
# always stay inside each metric's valid range.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# AUC needs scores rather than hard labels: use the second column of
# predict_proba, i.e. the probability of the class at clf.classes_[1].
auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)
print("AUC score:", auc)