In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, roc_curve
from sklearn.metrics import roc_auc_score, auc


In [2]:
train_X = pd.read_csv("Training/X_train.csv")
train_y = pd.read_csv("Training/y_train.csv")
TEST_X = pd .read_csv("Test/X_test.csv")

In [3]:
df = pd.merge(train_X, train_y, on= "Unique_ID", how = "left")

In [4]:
# Converting boolean to integer
df.C8 = df.C8.replace({True: 1, False: 0})
df.C6 = df.C6.replace({True: 1, False: 0})

In [5]:
missing_value_percent_per_col = df.isna().sum().sort_values(ascending = False)/df.shape[0]*100
missing_value_percent_per_col

N32                   81.180030
N27                   81.025719
N31                   81.025719
N26                   81.025719
N29                   81.025719
N30                   81.025719
N28                   81.025719
N25                   81.025719
N12                   13.960666
N2                    13.954614
N4                    13.936460
N5                    13.936460
N18                   13.936460
N17                   13.936460
N16                   13.936460
N20                   13.830560
N21                   13.830560
N19                   13.830560
N22                   13.830560
N23                    7.521936
N11                    2.166415
N14                    1.839637
N10.1                  1.291982
N10                    1.291982
N15                    1.291982
N7                     1.291982
N35                    1.291982
N6                     1.104387
N3                     1.104387
N24                    0.000000
N33                    0.000000
N34     

In [6]:
# drop a column having null percentage more than 80 percent
df.drop(['N32', 'N25', 'N31','N30','N29', 'N28', 'N27','N26'], axis = 1, inplace = True)
df.drop(['Unique_ID'], axis = 1, inplace = True)

In [7]:
# Converting categotical variables to str
df[['C1','C2','C3','C4','C5','C6','C7','C8']] = df[['C1','C2','C3','C4','C5','C6','C7','C8']].astype(str)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33050 entries, 0 to 33049
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   C1                  33050 non-null  object 
 1   C2                  33050 non-null  object 
 2   C3                  33050 non-null  object 
 3   C4                  33050 non-null  object 
 4   C5                  33050 non-null  object 
 5   C6                  33050 non-null  object 
 6   C7                  33050 non-null  object 
 7   C8                  33050 non-null  object 
 8   N1                  33050 non-null  float64
 9   N2                  28438 non-null  float64
 10  N3                  32685 non-null  float64
 11  N4                  28444 non-null  float64
 12  N5                  28444 non-null  float64
 13  N6                  32685 non-null  float64
 14  N7                  32623 non-null  float64
 15  N8                  33050 non-null  int64  
 16  N9  

In [9]:
X = df.drop('Dependent_Variable', axis=1)
y = df['Dependent_Variable']

In [11]:
null_index = [X.columns.get_loc(i) for i in X.columns if X[i].isna().sum() > 0]

In [12]:
# Simple imputer
preprocessor_1 = ColumnTransformer(
    [
        ("SimpleImputer", SimpleImputer(missing_values=np.nan, strategy='median'),null_index)
    ], remainder = 'passthrough'
)

In [13]:
pipe = Pipeline(
    [
        ("Imputer", preprocessor_1),
      
    ]
)

In [14]:
y.value_counts()

Dependent_Variable
0    22844
1    10206
Name: count, dtype: int64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify = y, random_state=42)
X_train.shape, X_test.shape

((26440, 35), (6610, 35))

In [16]:
X_train_new = pipe.fit_transform(X_train)
X_test_new = pipe.transform(X_test)

In [17]:
X_train_new.shape, X_test_new.shape

((26440, 35), (6610, 35))

In [18]:
y_train.value_counts()

Dependent_Variable
0    18275
1     8165
Name: count, dtype: int64

In [19]:
# Create and Evaluate function to give all metrics after model training
def evaluate_model(true, predicted, proba):
    cm = confusion_matrix(true, predicted)
    accuracy = accuracy_score(true, predicted)
    clf_report = classification_report(true, predicted)
    
    fpr, tpr, thresholds = roc_curve(true, proba, pos_label=1)
    auc_algo =  auc(fpr, tpr)

    return cm, accuracy, clf_report, auc_algo

In [20]:
model = CatBoostClassifier(
    iterations=10000,
    depth=3,
    rsm=0.2,
    verbose=False,
    l2_leaf_reg=5,
    class_weights={0:1,1:3},
    min_child_samples= 50,)
    
model.fit(X_train_new, y_train) # Train model
# Make predictions


<catboost.core.CatBoostClassifier at 0x2660a563880>

In [21]:
y_train_pred = model.predict(X_train_new)
y_test_pred = model.predict(X_test_new)
y_test_pred_proba = model.predict_proba(X_test_new)[:,1]
y_train_pred_proba = model.predict_proba(X_train_new)[:,1]
# Evaluate Train and Test dataset
model_train_cm , model_train_accuracy, model_train_clf, model_train_auc_score = evaluate_model(y_train, y_train_pred,y_train_pred_proba)
model_test_cm , model_test_accuracy, model_test_clf, model_test_auc_score= evaluate_model(y_test, y_test_pred,y_test_pred_proba)

In [22]:
#Training data
print('Model performance for Training set')
print(f"- CONFUSION MATRIX: {model_train_cm}")
print("- Accuracy: {:.4f}".format(model_train_accuracy))
print(f"- Classification Report: {model_train_clf}")
print("- AUC Score: {:.4f}".format(model_train_auc_score))

Model performance for Training set
- CONFUSION MATRIX: [[12848  5427]
 [  861  7304]]
- Accuracy: 0.7622
- Classification Report:               precision    recall  f1-score   support

           0       0.94      0.70      0.80     18275
           1       0.57      0.89      0.70      8165

    accuracy                           0.76     26440
   macro avg       0.76      0.80      0.75     26440
weighted avg       0.82      0.76      0.77     26440

- AUC Score: 0.8926


In [24]:
# Validation Data
print(f"- CONFUSION MATRIX: {model_test_cm}")
print("- Accuracy: {:.4f}".format(model_test_accuracy))
print(f"- Classification Report: {model_test_clf}")
print("- AUC Score: {:.4f}".format(model_test_auc_score))

- CONFUSION MATRIX: [[2910 1659]
 [ 512 1529]]
- Accuracy: 0.6716
- Classification Report:               precision    recall  f1-score   support

           0       0.85      0.64      0.73      4569
           1       0.48      0.75      0.58      2041

    accuracy                           0.67      6610
   macro avg       0.66      0.69      0.66      6610
weighted avg       0.74      0.67      0.68      6610

- AUC Score: 0.7707


In [None]:
## Same fot the TEST data

In [25]:
# Converting boolean to integer
TEST_X.C8 = df.C8.replace({True: 1, False: 0})
TEST_X.C6 = df.C6.replace({True: 1, False: 0})

In [26]:
# Saving unique ID
uni_id = TEST_X["Unique_ID"].to_list()

In [27]:
# drop a column having null percentage more than 80 percent
TEST_X.drop(['N32', 'N25', 'N31','N30','N29', 'N28', 'N27','N26'], axis = 1, inplace = True)
TEST_X.drop(['Unique_ID'], axis = 1, inplace = True)

In [28]:
# Converting categotical variables to str
TEST_X[['C1','C2','C3','C4','C5','C6','C7','C8']] = TEST_X[['C1','C2','C3','C4','C5','C6','C7','C8']].astype(str)

In [29]:
TEST_X.shape

(11017, 35)

In [30]:
X_TEST = pipe.transform(TEST_X)

In [31]:
y_TEST_pred = model.predict(X_TEST)
y_TEST_pred_proba = model.predict_proba(X_TEST)[:,1]

In [32]:
y_TEST_pred_proba.max()

0.9973138882680095

In [33]:
output = pd.DataFrame({
    "Unique_ID": uni_id,
    "Class_1_Probability" : y_TEST_pred_proba
})

In [35]:
output.to_csv("final_prediction.csv", index = False)