In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [125]:
df = pd.read_csv("train_PDjVQMB.csv")
df.head()

Unnamed: 0,ID,Age,Gender,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Credit_Category,Is_Churn
0,84e2fcc9,36,Female,5L - 10L,563266.44,4,0,1,0,Average,1
1,57fea15e,53,Female,Less than 5L,875572.11,2,1,1,1,Poor,0
2,8df34ef3,35,Female,More than 15L,701607.06,2,1,2,0,Poor,0
3,c5c0788b,43,Female,More than 15L,1393922.16,0,1,2,1,Poor,1
4,951d69c4,39,Female,More than 15L,893146.23,1,1,1,1,Good,1


In [126]:
df.drop('ID',axis=1,inplace=True)
df.head()

Unnamed: 0,Age,Gender,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Credit_Category,Is_Churn
0,36,Female,5L - 10L,563266.44,4,0,1,0,Average,1
1,53,Female,Less than 5L,875572.11,2,1,1,1,Poor,0
2,35,Female,More than 15L,701607.06,2,1,2,0,Poor,0
3,43,Female,More than 15L,1393922.16,0,1,2,1,Poor,1
4,39,Female,More than 15L,893146.23,1,1,1,1,Good,1


In [127]:
col = ["Gender","Credit_Category"]
dummy_df = pd.get_dummies(df[col],drop_first=True)

In [128]:
df[list(dummy_df.columns)] = dummy_df
df.drop(col,axis=1,inplace=True)
df.head()

Unnamed: 0,Age,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Is_Churn,Gender_Male,Credit_Category_Good,Credit_Category_Poor
0,36,5L - 10L,563266.44,4,0,1,0,1,0,0,0
1,53,Less than 5L,875572.11,2,1,1,1,0,0,0,1
2,35,More than 15L,701607.06,2,1,2,0,0,0,0,1
3,43,More than 15L,1393922.16,0,1,2,1,1,0,0,1
4,39,More than 15L,893146.23,1,1,1,1,1,0,1,0


In [129]:
df["Credit_Card"] = df["Credit_Card"].apply(lambda x: -1 if x==0 else 1)
df["Transaction_Status"] = df["Transaction_Status"].apply(lambda x: -1 if x==0 else 1)

In [130]:
def change_age(x):
    if(x in range(20,30)):
        return 1
    elif (x in range(30,40)):
        return 2
    elif (x in range(40,60)):
        return 3
    else:
        return 4
df["Age"] = df["Age"].apply(change_age)
df["Age"].value_counts()

3    3091
2    2555
1     679
4     325
Name: Age, dtype: int64

In [131]:
minVec = df["Balance"].min()
maxVec = df["Balance"].max()
df["Balance"] = (df["Balance"]-minVec)/(maxVec-minVec)

In [132]:
check_income= {"Less than 5L":1,
"5L - 10L":2,   
"10L - 15L":3,
"More than 15L":4
}
df["Income"] = df["Income"].apply(lambda x: check_income[x.strip()])
df["Income"].value_counts()

3    1885
2    1847
1    1573
4    1345
Name: Income, dtype: int64

In [135]:
holdings = {
    "1":1,
    "2":2,
    "3+":3
}
df["Product_Holdings"] = df["Product_Holdings"].apply(lambda x: holdings[x])
df["Product_Holdings"].dtypes

dtype('int64')

In [138]:
df["IncomeRatio"] = df["Balance"]/df["Income"]

In [140]:
y = df.pop("Is_Churn")
X = df

In [141]:
from imblearn.combine import SMOTETomek
from collections import Counter
counter = Counter(y)
print(f"Before Train: {counter}")
smtom = SMOTETomek(random_state=139)
X, y = smtom.fit_resample(X,y)
counter = Counter(y)
print(f"Before Train: {counter}")

Before Train: Counter({0: 5113, 1: 1537})
Before Train: Counter({1: 4585, 0: 4585})


In [142]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X,y,test_size=0.20,random_state=200)
print(f"train_set:{x_train.shape}{y_train.shape}")
print(f"Validation_set:{x_val.shape}{y_val.shape}")

train_set:(7336, 11)(7336,)
Validation_set:(1834, 11)(1834,)


In [154]:
# Support functions
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

# Fit models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [155]:
def best_model(model):
    print(model.best_score_)    
    print(model.best_params_)
    print(model.best_estimator_)
def get_auc_scores(y_actual, method,method2):
    auc_score = roc_auc_score(y_actual, method); 
    fpr_df, tpr_df, _ = roc_curve(y_actual, method2); 
    return (auc_score, fpr_df, tpr_df)

In [156]:
# Fit primal logistic regression
param_grid = {'C': [0.1,0.5,1,10,50,100], 'max_iter': [250], 'fit_intercept':[True],'intercept_scaling':[1],
              'penalty':['l2'], 'tol':[0.00001,0.0001,0.000001],'solver':["lbfgs","liblinear"]}
log_primal_Grid = GridSearchCV(LogisticRegression(),param_grid, cv=10, refit=True, verbose=0)
log_primal_Grid.fit(x_train,y_train)
print(f"Base_Model: {best_model(log_primal_Grid)}")

0.6329092490641646
{'C': 1, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 250, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 1e-05}
LogisticRegression(C=1, max_iter=250, tol=1e-05)
Base_Model: None


In [157]:
df.drop(["Income","Balance"],axis=1,inplace=True)

KeyError: "['Income' 'Balance'] not found in axis"

In [158]:
# Fit primal logistic regression
param_grid = {'C': [0.1,0.5,1,10,50,100], 'max_iter': [250], 'fit_intercept':[True],'intercept_scaling':[1],
              'penalty':['l2'], 'tol':[0.00001,0.0001,0.000001],'solver':["lbfgs","liblinear"]}
log_primal_Grid = GridSearchCV(LogisticRegression(),param_grid, cv=10, refit=True, verbose=0)
log_primal_Grid.fit(x_train,y_train)
print(f"Base_Model: {best_model(log_primal_Grid)}")

0.6329092490641646
{'C': 1, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 250, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 1e-05}
LogisticRegression(C=1, max_iter=250, tol=1e-05)
Base_Model: None


In [159]:
print(classification_report(y_val, log_primal_Grid.predict(x_val)))

              precision    recall  f1-score   support

           0       0.62      0.60      0.61       880
           1       0.64      0.66      0.65       954

    accuracy                           0.63      1834
   macro avg       0.63      0.63      0.63      1834
weighted avg       0.63      0.63      0.63      1834



In [168]:
from sklearn.neighbors import KNeighborsClassifier
KNeighborsClassifier()
params = {
'n_neighbors':[5,6,7,8,9,10],
'weights': 'uniform', 
algorithm='auto', 
leaf_size=30, p=2, 
metric='minkowski', 
metric_params=None,
n_jobs=None
}

KNeighborsClassifier()

Unnamed: 0,Age,Gender,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Credit_Category,Is_Churn
0,2,0,1,1,4,0,1,0,Average,1
1,3,0,0,2,2,1,1,1,Poor,0
2,2,0,3,2,2,1,2,0,Poor,0
3,3,0,3,3,0,1,2,1,Poor,1
4,2,0,3,2,1,1,1,1,Good,1


1    3430
0    3220
Name: Transaction_Status, dtype: int64

In [31]:
def findGroupID(listPFR):
	sum = 0
	for i in range(0,listPFR_size):
		lst = []
		for j in range(i+1,listPFR_size):
			if listPFR[i]>listPFR[j]:
				lst.append("Greater")
			elif listPFR[i]<listPFR[j]:
				lst.append("Smaller")
			else:
				lst.append("Equal")
		if ("Greater" in lst) and ("Smaller" in lst):
			sum = sum+5
		elif ("Greater" in lst) and ("Smaller" not in lst):
			sum =sum+10
		else:
			sum = sum +15
	return sum

In [None]:
arr = [4,2,5,7,1,3]

