#### Importing Libraries for Churn Prediction

In [None]:
import os
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
%matplotlib inline

### Full EDA

In [None]:
# Location of the raw dataset. Set the CHURN_DATA_PATH environment variable to
# point at your own copy; the default is the original author's local path.
DATA_PATH = Path(os.environ.get(
    "CHURN_DATA_PATH",
    r"C:\Users\jaisw\Downloads\customer_churn_large_dataset.xlsx",
))
df = pd.read_excel(DATA_PATH)
df.head()

In [None]:
df.shape

In [None]:
df.columns.values

In [None]:
df.dtypes

In [None]:
# Call the method (a bare `df.describe` only shows the bound-method repr).
# include="all" also summarises the non-numeric columns.
df.describe(include="all")

In [None]:
df.describe()

In [None]:
# from scipy import stats
# z_score = np.abs(stats.zscore(df))
# df = df[(z_score<3).all(axis=1)]

From the description above we conclude that there are 100000 records in total, with an average monthly charge of 65.05, while 25% of customers pay more than 82.64.
We also conclude that the average customer age is 44 years, that only 25% of customers are older than 57, and that only 25% of customers use more than 387 GB of data.

In [None]:
# Horizontal bar chart showing how many rows fall into each Churn class.
churn_counts = df['Churn'].value_counts()
ax = churn_counts.plot(kind='barh', figsize=(8,6))
ax.set_xlabel("Count", labelpad=14)
ax.set_ylabel("Target Variable", labelpad=14)
ax.set_title("Count of TARGET Variable per category", y=1.02);

In [None]:
df['Churn'].value_counts()

In [None]:
100*df['Churn'].value_counts()/len(df['Churn'])

The target classes are almost evenly balanced (ratio about 50:49), so we analyse the data against the other features, taking the target values separately, to get some insights.

In [None]:
df.info(verbose=True)

In [None]:
# Percentage of missing values per column. After reset_index() the frame has
# two columns: 'index' (the original column name) and 0 (the percentage).
missing = pd.DataFrame((df.isnull().sum())*100/df.shape[0]).reset_index()
missin = missing  # keep the earlier (misspelled) name available
plt.figure(figsize=(16,5))
# Draw the points; without this call the figure is empty.
ax = sns.pointplot(x='index', y=0, data=missing)
plt.xticks(rotation=90, fontsize=7)
plt.title("Percentage of missing values")
plt.ylabel("PERCENTAGE")
plt.show()

We don't have any missing values here.

General Thumb Rules:

For features with less missing values- can use regression to predict the missing values or fill with the mean of the values present, depending on the feature.

For features with very high number of missing values- it is better to drop those columns as they give very less insight on analysis.

There is no fixed rule for when to delete columns with a high number of missing values, but generally you can drop a column if more than 30-40% of its values are missing.

#### Data Cleaning 

In [None]:
df_copy= df.copy()

In [None]:
# Force Total_Usage_GB to a numeric dtype (unparseable entries become NaN),
# then recount the nulls per column.
usage_numeric = pd.to_numeric(df_copy['Total_Usage_GB'], errors='coerce')
df_copy['Total_Usage_GB'] = usage_numeric
df_copy.isnull().sum()

In [None]:
print(df_copy['Subscription_Length_Months'].max())

In [None]:
# Bucket the subscription length (in months) into four 6-month groups:
# "1-6", "7-12", "13-18" and "19-24".
labels = [f"{start}-{start + 5}" for start in range(1, 24, 6)]
bin_edges = range(1, 30, 6)  # 1, 7, 13, 19, 25 -> half-open bins [1,7), [7,13), ...
df_copy['Subscription_Length_Months_group'] = pd.cut(
    df_copy['Subscription_Length_Months'],
    bin_edges,
    right=False,
    labels=labels,
)

In [None]:
df_copy['Subscription_Length_Months_group'].value_counts()

Removing the columns that are not required.

In [None]:
# Drop the identifier columns and the raw subscription length (its binned
# version is kept). Re-assigning with errors='ignore' instead of inplace=True
# lets the cell be re-run without raising KeyError on the already-removed columns.
df_copy = df_copy.drop(
    columns=['CustomerID', 'Subscription_Length_Months', 'Name'],
    errors='ignore',
)
df_copy.head()

#### Data Exploration

 Plot distribution of individual predictors by churn.

*UNIVARIATE ANALYSIS*

In [None]:
# One count plot per predictor (every column except the target and
# Monthly_Bill), with the bars split by Churn.
predictor_columns = df_copy.drop(columns=['Churn', 'Monthly_Bill']).columns
for i, predictor in enumerate(predictor_columns):
    plt.figure(i)
    sns.countplot(data=df_copy, x=predictor, hue='Churn')
    

 Convert the target variable 'Churn' into a binary numeric variable, i.e. Yes=1; No=0

In [None]:
# df_copy['Churn'] = np.where(df_copy.Churn == 'Yes',1,0)

In [None]:
# df_copy.head(5)

 Convert all the categorical variables into dummy variables

In [None]:
# One-hot encode the categorical columns of df_copy into indicator columns;
# pd.get_dummies leaves the numeric columns as they are.
df_copy_dummy = pd.get_dummies(df_copy)
df_copy_dummy.head()

In [None]:
# Overlaid density curves of Monthly_Bill for non-churners (Churn == 0, red)
# and churners (Churn == 1, blue).
# `fill=True` replaces the `shade=True` keyword, which seaborn has deprecated
# since v0.11 and has announced will become an error.
Mth = sns.kdeplot(df_copy_dummy.Monthly_Bill[(df_copy_dummy["Churn"] == 0) ],
                color="Red", fill=True)
Mth = sns.kdeplot(df_copy_dummy.Monthly_Bill[(df_copy_dummy["Churn"] == 1) ],
                ax =Mth, color="Blue", fill=True)
Mth.legend(["No Churn","Churn"],loc='upper right')
Mth.set_ylabel('Density')
Mth.set_xlabel('Monthly Charges')
Mth.set_title('Monthly charges by churn')

Insights:
          Churn shows only slight differences across monthly charges; there is no major change.

Build a correlation of all predictors with churn.

In [None]:
# Bar chart of every column's correlation with Churn, largest value first.
plt.figure(figsize=(24,10))
churn_correlations = df_copy_dummy.corr()['Churn']
churn_correlations.sort_values(ascending=False).plot(kind='bar')

Insights:
    There is no major deviation in churn through other properties.

In [None]:
plt.figure(figsize=(12,12))
# Correlations are continuous values in [-1, 1], so use a diverging colormap
# centred on 0 with a fixed range. A qualitative palette such as "Paired"
# assigns unrelated colours to neighbouring values and hides the sign/magnitude.
sns.heatmap(df_copy_dummy.corr(), cmap="coolwarm", vmin=-1, vmax=1, center=0)

Insights: Here from HeatMap we get a lot of details.

In [None]:
df_copy_dummy.to_csv('new_churn.csv')

In [None]:
dp= pd.read_csv("new_churn.csv")
dp.head()

In [None]:
# The CSV was written with its index, which read_csv returns as an
# 'Unnamed: 0' column. errors='ignore' keeps this cell safe to re-run once
# the column is already gone.
dp = dp.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Feature matrix for the models: every column of dp except the target.
x = dp.drop(columns='Churn')
x

In [None]:
y=dp['Churn']
y

#### Decision Tree Classifier

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
model_dt=DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=6, min_samples_leaf=8)

In [None]:
model_dt.fit(x_train,y_train)

In [None]:
y_pred=model_dt.predict(x_test)
y_pred

In [None]:
model_dt.score(x_test,y_test)

In [None]:
print(classification_report(y_test, y_pred, labels=[0,1]))

We can see that for churn (1) the report does not show great results, so we will now use SMOTE to oversample the data and train the model again (note that only SMOTE is applied below, not SMOTE combined with ENN).

In [None]:
sm = SMOTE()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [None]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [None]:
model_dt_smote=DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)

In [None]:
# Fit the tree on the resampled training rows, then report the accuracy and
# the per-class precision/recall/F1 on the resampled test rows.
model_dt_smote.fit(xr_train, yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = metrics.accuracy_score(yr_test, yr_predict)
print(model_score_r)
print(classification_report(yr_test, yr_predict))

In [None]:
print(metrics.confusion_matrix(yr_test, yr_predict))

#### Random Forest Classifier 

In [None]:
model_rf = RandomForestClassifier(n_estimators=100, criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)

In [None]:
model_rf.fit(x_train,y_train)

In [None]:
y_pred =model_rf.predict(x_test)

In [None]:
model_rf.score(x_test,y_test)

In [None]:
print(classification_report(y_test, y_pred, labels=[0,1]))

In [None]:
sm= SMOTE()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [None]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1, test_size=0.2)

In [None]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)

In [None]:
model_rf_smote.fit(xr_train1,yr_train1)

In [None]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [None]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [None]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

In [None]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

In [None]:
sns.lmplot(data=df_copy_dummy, x='Monthly_Bill', y='Age', fit_reg= False)

In [None]:
df_copy_dummy.head(1)

In [None]:
# Price paid per GB of data. Zero usage is mapped to NaN first so the ratio is
# missing instead of +/-inf.
# NOTE(review): `x` and the train/test splits are built from `dp` in earlier
# cells, before this column exists, so none of the models in this notebook
# are trained on it.
dp['Bill_Per_GB'] = dp['Monthly_Bill'] / dp['Total_Usage_GB'].replace(0, np.nan)

#### Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn each feature's min/max from the training split only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with default hyper-parameters, fitted on the
# scaled training split. `fit` returns the estimator itself.
model = LogisticRegression().fit(x_train, y_train)
model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Score the logistic regression on the scaled test split.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Show the scores as the cell output.
pd.Series(
    {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1},
    name='logistic_regression',
)

In [None]:
dp.head(1)

In [None]:
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated search over the regularisation strength C.
param_grid = {'C': [0.1, 1, 10]}
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(x_train, y_train)
# The estimator refitted with the best-scoring C.
best_model = grid_search.best_estimator_

#### Model Deployment

In [None]:
import joblib
# Persist the estimator selected by the grid search. Saving the untuned
# baseline `model` would discard the result of the hyper-parameter search.
joblib.dump(best_model, 'churn_model.pkl')

In [None]:
joblib.dump(scaler, 'data_scaler.pkl')

In [None]:
from flask import Flask, request, jsonify
app = Flask(__name__)
# Reload the persisted artefacts. `joblib` is a module and cannot be called
# directly; the loader is `joblib.load(...)`.
model = joblib.load('churn_model.pkl')
scaler = joblib.load('data_scaler.pkl')
@app.route('/predict_churn', methods=['POST'])
def predict_churn():
    """Predict churn for one customer posted as a JSON object.

    The request body must be a JSON object with the raw customer fields; at
    least ``Gender``, ``Location``, ``Monthly_Bill`` and ``Total_Usage_GB``
    are required. The record is pre-processed, aligned to the columns the
    scaler was fitted on, scaled, and passed to the model.

    Returns:
        A JSON response ``{"churn_prediction": <int>}``, or a 400 response
        with an ``error`` message when the body is not a JSON object or a
        required field is missing.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    required_fields = ['Gender', 'Location', 'Monthly_Bill', 'Total_Usage_GB']
    missing_fields = [field for field in required_fields if field not in payload]
    if missing_fields:
        return jsonify({'error': 'Missing fields: ' + ', '.join(missing_fields)}), 400

    # Build the single-row frame from the request payload, not from the
    # notebook-level training frame `dp`.
    new_df = pd.DataFrame(payload, index=[0])

    new_df = new_df.ffill()
    new_df['Bill_Per_GB'] = new_df['Monthly_Bill'] / new_df['Total_Usage_GB']

    dummy_columns = ['Gender', 'Location']
    # The training frame replaced Subscription_Length_Months with its 6-month
    # group before one-hot encoding; reproduce that when the raw field is sent.
    if 'Subscription_Length_Months' in new_df.columns:
        group_labels = ["{0}-{1}".format(i, i + 5) for i in range(1, 24, 6)]
        new_df['Subscription_Length_Months_group'] = pd.cut(
            new_df['Subscription_Length_Months'],
            range(1, 30, 6),
            right=False,
            labels=group_labels,
        )
        new_df = new_df.drop(columns=['Subscription_Length_Months'])
        dummy_columns.append('Subscription_Length_Months_group')
    new_df = pd.get_dummies(new_df, columns=dummy_columns)

    # A single row only produces dummy columns for its own categories. Align to
    # the exact columns (and order) the scaler was fitted on: absent columns
    # are filled with 0 and fields the scaler never saw are dropped.
    expected_columns = getattr(scaler, 'feature_names_in_', None)
    if expected_columns is not None:
        new_df = new_df.reindex(columns=expected_columns, fill_value=0)
    new_df_scaled = scaler.transform(new_df)

    prediction = model.predict(new_df_scaled)

    # numpy integers are not JSON serialisable; convert to a plain int.
    return jsonify({'churn_prediction': int(prediction[0])})
# The guard must compare against '__main__' (with the trailing underscores);
# the misspelled '__main' never matches, so the server is never started.
# The reloader is switched off because it restarts the launching process,
# which is not expected to work from inside a notebook kernel.
if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)