In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Import data: read the e-commerce churn CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
churn = pd.read_csv('data/ecommerce_churn.csv')

In [None]:
# Observe the data: show the first rows of the freshly loaded frame.
churn.head()

In [None]:
# Number of columns in the raw frame (second entry of the (rows, columns) shape).
churn.shape[1]

In [None]:
# Number of rows in the raw frame (first entry of the (rows, columns) shape).
churn.shape[0]

# Column Definitions:

- CustomerID: Unique customer ID
- Churn: Churn flag
- Tenure: Tenure of the customer in the organization
- PreferredLoginDevice: Preferred login device of the customer
- CityTier: City tier
- WarehouseToHome: Distance between the warehouse and the customer's home
- PreferredPaymentMode: Preferred payment method of the customer
- Gender: Gender of the customer
- HourSpendOnApp: Number of hours spent on the mobile application or website
- NumberOfDeviceRegistered: Total number of devices registered to the customer
- PreferedOrderCat: Preferred order category of the customer in the last month
- SatisfactionScore: Customer's satisfaction score for the service
- MaritalStatus: Marital status of the customer
- NumberOfAddress: Total number of addresses added for the customer
- Complain: Whether a complaint was raised in the last month
- OrderAmountHikeFromlastYear: Percentage increase in orders from last year
- CouponUsed: Total number of coupons used in the last month
- OrderCount: Total number of orders placed in the last month
- DaySinceLastOrder: Days since the customer's last order
- CashbackAmount: Average cashback in the last month

## Data Cleaning/EDA:

In [None]:
# The customer identifier is dropped from the frame before any analysis.
churn = churn.drop('CustomerID', axis=1)

In [None]:
# Names of every numeric column currently in the frame.
# 'number' matches all integer and float widths. The previous ['int', 'float']
# resolves 'int' to the platform's C long, which can silently skip int64
# columns on platforms where that type is 32-bit.
numerical = churn.select_dtypes(include='number').columns.tolist()

In [None]:
# Check for null values: print one "<column> : <missing count>" line per column.
for column_name, n_missing in churn.isna().sum().items():
    print(column_name + ' : ' + str(n_missing))

In [None]:
# Fraction of missing fields per row, then how many rows share each fraction.
row_missing_fraction = churn.isna().mean(axis=1)
row_missing_fraction.value_counts()

In [None]:
# Columns that get imputed in the next step.
missing_cols = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
]



In [None]:
### Fill it in: replace NaNs in `missing_cols` with each column's mean.
# NOTE(review): the imputer is fit on the whole frame, before the train/test
# split further down, so test rows contribute to the imputed means - confirm
# this is acceptable for the exercise.
mean_imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
imputed_values = mean_imputer.fit_transform(churn[missing_cols])
churn[missing_cols] = imputed_values

#### What can we infer from the heatmap?

In [None]:
## Pearson correlation heatmap over the numeric columns
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title('Pearson Correlation of Features', size=15)

# Diverging palette so negative and positive correlations get distinct hues.
colormap = sns.diverging_palette(10, 220, as_cmap=True)
correlations = churn[numerical].corr()

sns.heatmap(
    correlations,
    cmap=colormap,
    square=True,
    annot=True,
    linewidths=0.1,
    vmax=1.0,
    linecolor='white',
    annot_kws={'fontsize': 12},
    ax=ax,
)
plt.show()

#### Let's make some visualizations!

In [None]:
##To-Do as exercise!

#### Dealing with categorical data


In [None]:
## categorical data
# Count of rows per value of the churn flag.
churn['Churn'].value_counts()

In [None]:
# Count of rows per gender label.
churn['Gender'].value_counts()

In [None]:
# Count of rows per marital-status label.
churn['MaritalStatus'].value_counts()

In [None]:
# Count of rows per preferred-login-device label.
churn['PreferredLoginDevice'].value_counts()

In [None]:
# Count of rows per preferred-order-category label.
churn['PreferedOrderCat'].value_counts()

### Can we reduce the cardinality of the features? 

In [None]:
# Data Cleaning
# Fold the 'Mobile Phone' label into 'Phone' (presumably two spellings of the
# same device - verify against the value counts).
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection: that form is chained assignment, which warns in recent
# pandas and leaves `churn` unchanged once Copy-on-Write is enabled.
churn['PreferredLoginDevice'] = churn['PreferredLoginDevice'].replace({'Mobile Phone': 'Phone'})

In [None]:
# Fold 'Divorced' into 'Single' to reduce the number of marital-status levels.
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection: that form is chained assignment, which warns in recent
# pandas and leaves `churn` unchanged once Copy-on-Write is enabled.
churn['MaritalStatus'] = churn['MaritalStatus'].replace({'Divorced': 'Single'})

## Feature Engineering:

In [None]:
def create_interaction(df, var1, var2):
    """Add an interaction column derived from two existing columns, in place.

    The new column is named ``"<var1>*<var2>"`` and holds the element-wise
    ``df[var1] + df[var2]`` (concatenation for string columns, addition for
    numeric ones).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the new column is written directly into it.
    var1, var2 : str
        Names of the two columns to combine.

    Returns
    -------
    None
    """
    interaction_col = "*".join((var1, var2))
    combined = df[var1] + df[var2]
    df[interaction_col] = combined.rename(interaction_col)

In [None]:
# Adds a 'Gender*MaritalStatus' column to `churn` combining the two columns.
create_interaction(churn,'Gender','MaritalStatus')

In [None]:
# One-hot encoding: one indicator column per preferred order category,
# appended to the right of the existing frame.
poc_ohe = pd.get_dummies(churn['PreferedOrderCat'])
churn = pd.concat((churn, poc_ohe), axis='columns')

In [None]:
# One-hot encoding: one indicator column per Gender*MaritalStatus combination,
# appended to the right of the existing frame.
gm_ohe = pd.get_dummies(churn['Gender*MaritalStatus'])
churn = pd.concat((churn, gm_ohe), axis='columns')

In [None]:
# We forgot something: the original categorical columns are still in the frame
# next to their one-hot indicators, so remove them.
encoded_source_columns = [
    'PreferedOrderCat',
    'Gender',
    'MaritalStatus',
    'Gender*MaritalStatus',
]
churn = churn.drop(columns=encoded_source_columns)

In [None]:
# # ALTERNATIVE: label encoding
# le = preprocessing.LabelEncoder()
# churn['Gender'] = le.fit_transform(churn['Gender'])
# churn['MaritalStatus'] = le.fit_transform(churn['MaritalStatus'])

In [None]:
# Encode the login-device labels as integer codes.
le = preprocessing.LabelEncoder()
le.fit(churn['PreferredLoginDevice'])
churn['PreferredLoginDevice'] = le.transform(churn['PreferredLoginDevice'])

In [None]:
# Display the frame after cleaning and encoding.
churn

## Model Training + Evaluation

In [None]:
# List the available column names before choosing model features.
churn.columns

In [None]:
# Columns used as model inputs.
features = [
    # customer attributes
    'Tenure',
    'PreferredLoginDevice',
    'CityTier',
    'WarehouseToHome',
    'SatisfactionScore',
    'Complain',
    'DaySinceLastOrder',
    'CashbackAmount',
    # Gender*MaritalStatus indicators
    'FemaleMarried',
    'FemaleSingle',
    'MaleMarried',
    'MaleSingle',
    # preferred-order-category indicators
    'Fashion',
    'Grocery',
    'Laptop & Accessory',
    'Mobile',
    'Mobile Phone',
]

In [None]:
# Design matrix (selected feature columns) and target (churn flag).
X = churn.loc[:, features]
y = churn.loc[:, 'Churn']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### First, cross-validation!

In [None]:
# Candidate hyperparameter values for the randomized random-forest search.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is no longer accepted by RandomForestClassifier in current
# scikit-learn (for classifiers it was an alias of 'sqrt'), and candidates
# that use it fail to fit. None means "consider all features".
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [None]:
# Display the search space that the randomized search samples from.
random_grid

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# First create the base model to tune. It is seeded so that the forest's own
# randomness does not change which candidate wins from one run to the next
# (the seed on RandomizedSearchCV only fixes which candidates are sampled).
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)


In [None]:
# Hyperparameter combination with the best cross-validated score in the search.
rf_random.best_params_

### Model Training, it's as easy as 3 lines of code!

In [None]:
# Final model. Seeded so that the predictions and metrics below are
# reproducible across runs.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from an
# earlier rf_random.best_params_ output - confirm they still match the search.
rfc = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=50,
    bootstrap=False,
    random_state=42,
)
rfc.fit(X_train, y_train)

# making predictions on the testing set
y_pred = rfc.predict(X_test)


In [None]:
# Predicted labels for the test rows.
y_pred

### Model Evaluation, Inference

In [None]:
from sklearn import metrics

# Accuracy and macro-averaged F1 of the default-threshold predictions on the test set.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
test_f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
print("Random Forest model accuracy:", test_accuracy)
print("Random Forest F1 Score:", test_f1_macro)

In [None]:
from sklearn import metrics

# Confusion matrix, precision and recall of the default-threshold predictions.
test_confusion = metrics.confusion_matrix(y_test, y_pred)
test_precision = metrics.precision_score(y_test, y_pred)
test_recall = metrics.recall_score(y_test, y_pred)
print("Random Forest Confusion Matrix:\n", test_confusion)
print("Random Forest Precision", test_precision)
print("Random Forest Recall", test_recall)


### What do we know, and what can we do?

In [None]:
# Class probabilities for the test rows; column 1 is used as the ROC score.
scores = rfc.predict_proba(X_test)
positive_class_scores = scores[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, positive_class_scores)

In [None]:
# Per-class predicted probabilities for the test rows (one column per class).
scores

In [None]:
# plot the roc curve for the model
# The diagonal is the chance-level reference line.
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
# The curve comes from the random forest's predicted probabilities, so the
# legend names that model (it was previously mislabelled 'Logistic').
plt.plot(fpr, tpr, marker='.', label='Random Forest')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

In [None]:
# Pick the ROC threshold that maximises J = TPR - FPR.
# NOTE(review): the threshold is chosen from test-set scores and later scored
# on the same test set, so the updated metrics are likely optimistic - confirm
# whether a separate validation split is wanted.
J = np.subtract(tpr, fpr)
ix = np.argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % best_thresh)

In [None]:
### Exercise: how do we obtain updated predictions?
# A row is labelled 1 when its column-1 probability reaches the tuned threshold.
meets_threshold = scores[:, 1] >= best_thresh
y_predstar = meets_threshold.astype(int)


In [None]:
# Predicted labels after applying the tuned threshold.
y_predstar

In [None]:
# Accuracy and macro-averaged F1 of the tuned-threshold predictions on the test set.
tuned_accuracy = metrics.accuracy_score(y_test, y_predstar)
tuned_f1_macro = metrics.f1_score(y_test, y_predstar, average='macro')
print("Random Forest model accuracy:", tuned_accuracy)
print("Random Forest F1 Score:", tuned_f1_macro)

In [None]:
# Confusion matrix, precision and recall of the tuned-threshold predictions.
tuned_confusion = metrics.confusion_matrix(y_test, y_predstar)
tuned_precision = metrics.precision_score(y_test, y_predstar)
tuned_recall = metrics.recall_score(y_test, y_predstar)
print("Random Forest Confusion Matrix:\n", tuned_confusion)
print("Random Forest Precision", tuned_precision)
print("Random Forest Recall", tuned_recall)


In [None]:
# Forest-level feature importances, plus their spread across the individual trees.
importances = rfc.feature_importances_
per_tree_importances = np.array([tree.feature_importances_ for tree in rfc.estimators_])
std = per_tree_importances.std(axis=0)

In [None]:
# Bar chart of the importances, labelled by feature name, with the
# across-tree standard deviation as error bars.
forest_importances = pd.Series(data=importances, index=features)

fig, ax = plt.subplots()
forest_importances.plot(kind='bar', yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
