In [39]:
import pandas as pd
import sklearn 
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [40]:
df = pd.read_csv('Dataset.csv', sep=(';'), na_values='Unknown')

In [41]:
df = df.drop('CLIENTNUM', axis = 1)

In [42]:
df.fillna(value = {'Education_Level':'Graduate'}, inplace = True)

df.fillna(value = {'Marital_Status':'Married'}, inplace = True)

df.fillna(value = {'Income_Category':'Less than $40K'}, inplace = True)

In [43]:
df['Credit_Limit'],_ = stats.boxcox(df['Credit_Limit'], lmbda= None)
df['Avg_Open_To_Buy'],_ = stats.boxcox(df['Avg_Open_To_Buy'], lmbda= None)
df['Total_Trans_Amt'],_ = stats.boxcox(df['Total_Trans_Amt'], lmbda= None)

In [44]:
code = LabelEncoder()
df['Attrition_Flag'] = code.fit_transform(df['Attrition_Flag'])

In [45]:
df['Gender'].replace({'F':0, 'M':1}, inplace = True)   
df['Marital_Status'].replace({'Married':1, 'Single':2, 'Divorced':3}, inplace = True)

In [46]:
def encode(data):
    code1 = {
        'Uneducated':0,      
        'College':1,
        'High School':2,
        'Graduate':3,        
        'Post-Graduate':4,
        'Doctorate':5, 
        'Less than $40K':0,
        '$40K - $60K':1,
        '$60K - $80K':2,
        '$80K - $120K':3,
        '$120K +':4,
        'Blue':0,
        'Silver':1,
        'Gold':2,
        'Platinum':3}

    for col in data.select_dtypes('object'):
        data[col] = data[col].map(code1)
    return data

encode(df)

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,1,45,1,3,2,1,2,0,39,5,1,3,3.248734,777,10.994246,1.335,4.511115,42,1.625,0.061
1,1,49,0,5,3,2,0,0,44,6,1,2,3.218468,864,10.349511,1.541,4.556639,33,3.714,0.105
2,1,51,1,3,3,1,3,0,36,4,1,0,3.143353,0,9.328766,2.594,4.694759,20,2.333,0.000
3,1,40,0,4,2,1,0,0,34,3,4,1,3.140331,2517,7.469609,1.405,4.519958,20,2.333,0.760
4,1,40,1,3,0,1,2,0,21,5,1,0,3.173000,0,9.751611,2.175,4.379790,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,1,50,1,2,3,2,1,0,40,3,2,3,3.158246,1851,8.728817,0.703,5.342461,117,0.857,0.462
10123,0,41,1,2,3,3,1,0,25,4,2,3,3.164290,2186,8.691829,0.804,5.185480,69,0.683,0.511
10124,0,44,0,1,2,1,0,0,36,5,3,4,3.184818,0,9.933068,0.819,5.231071,60,0.818,0.000
10125,0,30,1,2,3,1,1,0,36,4,3,3,3.182787,0,9.901313,0.535,5.173098,62,0.722,0.000


In [47]:
X = df.drop(['Attrition_Flag'], axis=1)
y = df['Attrition_Flag'] 

X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.4, random_state = 0,stratify = y)

In [48]:
X2 = X_train
X2['Attrition_Flag'] = y_train.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X2['Attrition_Flag'] = y_train.values


In [49]:
Mino = X2[X2.Attrition_Flag == 0 ]
Majo = X2[X2.Attrition_Flag == 1]

echant = resample(Majo, replace = False, n_samples = len(Mino))
equilibre = pd.concat([Majo, echant])
X_train = equilibre.drop(['Attrition_Flag'], axis=1)
y_train = equilibre['Attrition_Flag']

In [50]:
variable_n_imp = ['Card_Category', 'Gender', 'Marital_Status', 'Income_Category', 'Education_Level', 'Dependent_count', 'Months_on_book', 'Contacts_Count_12_mon', 'Credit_Limit', 'Avg_Open_To_Buy', 'Months_Inactive_12_mon', 'Customer_Age']

In [51]:
X_train = X_train.drop(variable_n_imp, axis = 1)
X_test = X_test.drop(variable_n_imp, axis = 1)

In [52]:
m_fa = RandomForestClassifier(random_state = 0)

m_fa_hyp = {'n_estimators' : [5, 10, 20, 50, 100, 200],
           'max_depth': [None, 2, 5, 10, 15, 20]}

m_fa_cv = GridSearchCV(m_fa, m_fa_hyp, cv = 5)

m_fa_cv.fit(X_train, y_train)

print(m_fa_cv.best_score_)
print(m_fa_cv.best_estimator_)

1.0
RandomForestClassifier(n_estimators=5, random_state=0)


In [53]:
X_train = np.array([['Total_Relationship_Count','Total_Revolving_Bal','Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt','Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']])

In [54]:
result = m_fa_cv.predict(X_test)
result

array([1, 1, 1, ..., 1, 1, 1])

In [55]:
from pickle import dump

dump(m_fa_cv,open('model.plk', 'wb'))