In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


#### Functions


In [None]:
def plot_missing_values(df,fig_x,fig_y):
    """Plot the percentage of null values per column as a horizontal bar chart.

    Only columns that contain at least one null are drawn, ordered from the
    most to the least incomplete. When no column contains nulls, the message
    "No data available" is printed and nothing is plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    fig_x, fig_y : number
        Width and height handed to ``plt.figure(figsize=...)``.
    """
    # Share of nulls per column, expressed in percent.
    null_share = df.isnull().sum() * 100 / len(df)
    summary = pd.DataFrame({'column_name': df.columns, 'percent_missing': null_share})
    summary = summary.sort_values('percent_missing', ascending=False)
    summary = summary.loc[summary['percent_missing'] > 0]

    # Guard clause: nothing to draw when every column is complete.
    if summary.empty:
        print("No data available")
        return

    plt.figure(figsize=(fig_x,fig_y))
    sns.barplot(x = 'percent_missing',y = 'column_name',data = summary)
    plt.show()

#### Load the CSV file

In [None]:
# Read sample.csv from the working directory and preview its first rows.
df_sample = pd.read_csv("sample.csv")
df_sample.head()

#### Load the training set


In [None]:
# Allow up to 230 columns to be rendered when a frame is displayed.
pd.set_option('display.max_columns',230)
# pandas' chained-assignment warning is deliberately left at its default
# ('warn'): silencing it hides assignments of the form df[mask][col] = value,
# which write into a temporary copy and leave the frame unchanged.
df_train = pd.read_csv("train.csv")
df_train.head(5)



In [None]:

# Read test.csv from the working directory and preview its first rows.
df_test = pd.read_csv("test.csv")
df_test.head()


In [None]:
# List the columns that exist in the training frame but not in the test frame
# (the original author noted that the churn probability is absent from the test data).
train_only_columns = [name for name in df_train.columns if name not in df_test.columns]
for name in train_only_columns:
    print(name)

In [None]:
# Chart the percentage of nulls per column of the training frame;
# 25 and 50 are the figure width and height.
plot_missing_values(df_train,25,50)


In [None]:
# Function to impute values
def impute_columns (df, imputes=False, mising_columns=False):
    """Replace NaN with 0, in place, in selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its columns are reassigned, nothing is returned.
    imputes : list of str, optional
        Column-name stems. Each stem is expanded with the monthly suffixes
        ``_6``, ``_7`` and ``_8`` and every resulting column is filled.
        Takes precedence over ``mising_columns`` when truthy.
    mising_columns : list of str, optional
        Exact column names to fill; used only when ``imputes`` is falsy.

    Raises
    ------
    KeyError
        If a selected column does not exist in ``df``.

    Calling the function with neither argument is a no-op.
    """
    if imputes:
        targets = [stem + suffix for suffix in ('_6','_7','_8') for stem in imputes]
    elif mising_columns:
        targets = list(mising_columns)
    else:
        # Nothing requested: previously this path tried to iterate over False.
        return

    for col in targets:
        # Reassign the column instead of calling fillna(inplace=True) on the
        # selected Series: the latter operates on an intermediate object and
        # does not update the frame when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(0)

# Business-related columns: these are imputed with 0 rather than dropped.
# The entries are name stems; they are passed as the `imputes` argument, so
# impute_columns expands each one with the _6/_7/_8 suffixes.
biz_columns = ['arpu_3g','count_rech_2g','night_pck_user','arpu_2g','total_rech_data','av_rech_amt_data','max_rech_data','count_rech_3g','fb_user']
impute_columns(df_train,biz_columns)
impute_columns(df_test,biz_columns)


In [None]:
# Re-chart the null percentages of the training frame after the
# business-column imputation (figure size 25 x 50).
plot_missing_values(df_train,25,50)



In [None]:
# Impute the usage-statistics columns with 0.
# The list holds name stems; impute_columns expands each stem with the
# _6/_7/_8 suffixes before filling, and the same list is applied to both frames.
interesting_columns = ['loc_og_t2c_mou','std_ic_t2t_mou','loc_og_t2t_mou','loc_og_t2m_mou',
                       'std_ic_t2f_mou','loc_og_t2f_mou','std_ic_t2m_mou','loc_og_mou',
                       'std_og_t2t_mou','std_og_t2m_mou','std_og_t2f_mou','std_ic_t2o_mou',
                       'std_og_t2c_mou','std_og_mou','loc_ic_mou','isd_og_mou',
                       'spl_og_mou','og_others','loc_ic_t2f_mou','loc_ic_t2m_mou',
                       'roam_og_mou','loc_ic_t2t_mou','spl_ic_mou','ic_others',
                       'roam_ic_mou','onnet_mou','isd_ic_mou','offnet_mou','std_ic_mou']

impute_columns(df_train,imputes=interesting_columns)
impute_columns(df_test,imputes=interesting_columns)

In [None]:
# Re-chart the null percentages of the training frame after the
# usage-statistics imputation (figure size 5 x 5).
plot_missing_values(df_train,5,5)


In [None]:
# date column treatment
# Missing "last data recharge" dates are replaced with the month-end date
# that the original statements paired with each column.
last_day_data_columns = ['date_of_last_rech_data_6','date_of_last_rech_data_7','date_of_last_rech_data_8']
month_end_fill = dict(zip(last_day_data_columns, ['6/30/2014', '7/31/2014', '8/31/2014']))

# The previous form, df[mask][col] = value, assigned into a temporary copy
# produced by df[mask] and therefore never changed df_train / df_test.
# Reassigning the filled column updates the frames themselves.
for frame in (df_train, df_test):
    for date_col, month_end in month_end_fill.items():
        frame[date_col] = frame[date_col].fillna(month_end)


In [None]:
# Print the distinct values (NaN included) of a handful of columns in the
# training frame; the same list is reused by the following cells.
unique_value_columns = ['last_date_of_month_8','loc_og_t2o_mou','std_og_t2o_mou','loc_ic_t2o_mou','last_date_of_month_7']
for col in unique_value_columns:
    print(df_train[col].unique())
    

In [None]:
# Same distinct-value check as above, this time on the test frame.
for col in unique_value_columns:
    print(df_test[col].unique())

In [None]:
# Fill the nulls of each listed column with that column's own most frequent value.
# The previous df.fillna(<mode of col>, inplace=True) was applied to the whole
# frame, so the first column's mode (a date string) was written into every
# remaining NaN of every column, whatever its dtype.
# NOTE(review): columns outside unique_value_columns now keep their nulls;
# confirm with plot_missing_values and impute them explicitly if needed.
for col in unique_value_columns:
    df_train[col] = df_train[col].fillna(df_train[col].mode()[0])
    df_test[col] = df_test[col].fillna(df_test[col].mode()[0])

In [None]:
# Chart what is still null in the training frame, then display its (rows, columns) shape.
plot_missing_values(df_train,5,5)
df_train.shape

In [None]:
# Chart what is still null in the test frame, then display its (rows, columns) shape.
plot_missing_values(df_test,5,5)
df_test.shape

In [None]:
# Columns of the training frame with at most one distinct value
# (nunique() <= 1); the resulting Index is displayed as the cell output.
single_valued_columns = df_train.columns[df_train.nunique() <= 1]
single_valued_columns

In [None]:
# Hard-coded list of columns removed from both frames.
# NOTE(review): presumably copied from the single_valued_columns output of the
# previous cell; the list is not derived from it in code, so confirm they match.
single_value_drop_columns = ['std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8',
                             'std_ic_t2o_mou_6','std_ic_t2o_mou_7','std_ic_t2o_mou_8','circle_id']
# In-place drops: re-running this cell raises KeyError because the columns are already gone.
df_train.drop(single_value_drop_columns,axis=1,inplace=True)
df_test.drop(single_value_drop_columns,axis=1,inplace=True)

In [None]:
# Display the (rows, columns) shape of the training frame after the column drops.
df_train.shape

In [None]:
# Number of distinct id values. Because this expression is not the last
# statement of the cell, its value is computed but not displayed.
len(df_train.id.value_counts())
# Remove the id column from both frames (in place, so the cell is not re-runnable).
df_train.drop(['id'],axis=1,inplace=True)
df_test.drop(['id'],axis=1,inplace=True)

In [None]:
# Replace the index of both frames with a fresh 0..n-1 range, discarding the old index.
df_train.reset_index(inplace=True,drop=True)
# Not the last statement of the cell, so this shape is not displayed.
df_train.shape
df_test.reset_index(inplace=True,drop=True)
# Displayed as the cell output.
df_test.shape

In [None]:
# Names of all training-frame columns whose name matches the regex 'date';
# the list is reused below for parsing and for dropping these columns.
dates = list(df_train.filter(regex='date').columns)
dates

In [None]:
# Parse every date column of both frames from month/day/4-digit-year strings
# into datetimes. The column list comes from the training frame, so the test
# frame must contain the same date columns.
for date_column in dates:
    df_train[date_column] = pd.to_datetime(df_train[date_column], format='%m/%d/%Y')
    df_test[date_column] = pd.to_datetime(df_test[date_column], format='%m/%d/%Y')

In [None]:
# Summarise the training frame, then list every column still stored with the
# generic object dtype.
df_train.info()

object_typed = [name for name in df_train.columns if df_train[name].dtype == object]
for name in object_typed:
    print(name)

In [None]:
# Distinct values of loc_ic_t2o_mou; not the last statement of the cell, so
# the result is computed but not displayed.
df_train.loc_ic_t2o_mou.unique()
# Drop these three columns from both frames (in place).
object_columns = ['loc_og_t2o_mou','std_og_t2o_mou','loc_ic_t2o_mou']
df_train.drop(object_columns,axis=1,inplace=True)
df_test.drop(object_columns,axis=1,inplace=True)


In [None]:
# Print the column/dtype summary of the training frame after the drops above.
df_train.info()

In [None]:
# Columns whose name contains 'vbc'; the next cell renames the month-prefixed
# ones to the _6/_7/_8 suffix convention.
unconventional_columns = list(df_train.filter(regex='vbc').columns)
unconventional_columns


In [None]:
# Bring the month-prefixed vbc columns in line with the _6/_7/_8 suffix
# convention, using one shared mapping for both frames.
vbc_renames = {'aug_vbc_3g' : 'vbc_3g_8', 'jul_vbc_3g' : 'vbc_3g_7', 'jun_vbc_3g' : 'vbc_3g_6'}
for frame in (df_train, df_test):
    frame.rename(columns=vbc_renames, inplace=True)

In [None]:
# Print every remaining column name of the training frame, one per line.
for col in df_train.columns:
    print(col)

In [None]:
# Check Skewness
# NOTE(review): no skewness statistic is computed here. The loop prints the
# value counts of every column with at most 8 distinct values (NaN counted as
# a value by unique()), followed by a 35-dash separator line.
for col_name in df_train.columns:
    if (len(df_train[col_name].unique()) <= 8):
        print(df_train[col_name].value_counts())
        print(f"\n{35 * '-'}")

In [None]:
# Snapshot of the cleaned training frame; the modelling cells below start from it.
df_train_2 = df_train.copy()

# List strongly correlated column pairs (|r| > 0.60). Nothing is dropped here;
# the pairs are only displayed, strongest positive correlation first.
cor = df_train_2.corr()
# Zero the diagonal and upper triangle so each pair is reported once.
cor.loc[:, :] = np.tril(cor, k=-1)
cor = cor.stack()
cor[cor.abs() > 0.60].sort_values(ascending=False)


## PCA

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# Remove the date columns from df_train in place. The modelling cells that
# follow are built from df_train_2 (copied before this drop), so this does not
# affect them; re-running the cell raises KeyError.
df_train.drop(dates,axis=1,inplace=True)




In [None]:
# Keep only non-datetime columns for modelling.
df_3 = df_train_2.select_dtypes(exclude=['datetime64'])
df_3.info()

# Features and target.
X = df_3.drop(['churn_probability'], axis=1)
y = df_3['churn_probability']

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training part only, otherwise statistics of the held-out rows leak into
# training and the validation metrics become optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# X is kept as a scaled array (as before), now using the train-only statistics.
X = scaler.transform(X)

# Unlabelled test data, non-datetime columns only.
# NOTE(review): X1_test is left unscaled exactly as before; it has to go through
# scaler.transform (and the fitted PCA) before being passed to any model.
df_4 = df_test.select_dtypes(exclude=['datetime64'])
X1_test = df_4.copy()



In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only. A fixed seed makes
# the synthetic rows, and therefore every downstream score, reproducible.
sm = SMOTE(random_state=100)
X_tr,y_tr = sm.fit_resample(X_train,y_train)
print(X_tr.shape)
print(y_tr.shape)

# Instantiate PCA keeping all components
pca = PCA(random_state=100)

# Fit PCA on the resampled training data and project it in one step
# (a separate pca.fit call beforehand would just repeat the same fit).
X_tr_pca = pca.fit_transform(X_tr)
print(X_tr_pca.shape)

# Project the validation split with the components learned on the training data
X_test_pca = pca.transform(X_test)
print(X_test_pca.shape)

# Independent copies used by the tuned logistic regression (_1)
# and the random forest (_2) further down.
X_tr_pca_1 = X_tr_pca.copy()
X_test_pca_1 = X_test_pca.copy()

X_tr_pca_2 = X_tr_pca.copy()
X_test_pca_2 = X_test_pca.copy()

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression on all principal components of the resampled training data.
lr_pca = LogisticRegression(max_iter=500)
lr_pca.fit(X_tr_pca, y_tr)

# Predicted class labels for the validation split (predict returns labels, not probabilities)
y_pred = lr_pca.predict(X_test_pca)

# Wrap the prediction array in a DataFrame
y_pred_df = pd.DataFrame(y_pred)

In [None]:
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Confusion matrix of the baseline model on the validation split
# (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_test,y_pred)
print(conf_mat)

In [None]:
# Unpack the confusion-matrix cells (rows: true class, columns: predicted class).
TP, TN = conf_mat[1,1], conf_mat[0,0]   # true positives, true negatives
FP, FN = conf_mat[0,1], conf_mat[1,0]   # false positives, false negatives

# Sensitivity: share of actual positives that were predicted positive.
sens_log_pca = TP/(TP+FN)
# Specificity: share of actual negatives that were predicted negative.
spec_log_pca = TN/(TN+FP)

for label, value in (("Sensitivity:", sens_log_pca), ("Specificity:", spec_log_pca)):
    print(label, value)


In [None]:

# Overall accuracy of the baseline model on the validation split.
print("Logistic Regression accuracy with PCA: ",accuracy_score(y_test,y_pred))

In [None]:
# Feature frame without the target, used only for its column names.
df_5 = df_3.drop('churn_probability', axis=1)
col = list(df_5.columns)

# Loadings of the first three principal components, one row per original feature.
loadings = {f'PCA{rank + 1}': pca.components_[rank] for rank in range(3)}
loadings['Feature'] = col
df_pca = pd.DataFrame(loadings)
df_pca.head(10)



In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Scree plot: cumulative share of variance explained as components are added.
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# Cumulative explained variance in percent, each component's ratio rounded to 4 decimals first.
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

In [None]:
# PCA restricted to 54 components. The seed matches the full PCA fitted earlier
# so that the decomposition is reproducible when scikit-learn selects a
# randomized SVD solver for a truncated fit.
pca_54 = PCA(n_components=54, random_state=100)

# Fit on the resampled training data and project it
df_tr_pca_54 = pca_54.fit_transform(X_tr)
print(df_tr_pca_54.shape)

# Project the validation split with the same components
df_test_pca_54 = pca_54.transform(X_test)
print(df_test_pca_54.shape)

In [None]:
# Logistic regression on the 54-component projection
lr_pca1 = LogisticRegression(max_iter=500)
lr_pca1.fit(df_tr_pca_54, y_tr)

# Predicted class labels for the validation split (predict returns labels, not probabilities)
y_pred54 = lr_pca1.predict(df_test_pca_54)

# Wrap the prediction array in a DataFrame
df_y_pred = pd.DataFrame(y_pred54)

# Rows: true class, columns: predicted class
conf_matrices = confusion_matrix(y_test,y_pred54)

# check sensitivity and specificity

# True positives
TP = conf_matrices[1,1]
# True negatives
TN = conf_matrices[0,0]
# False positives
FP = conf_matrices[0,1] 
# False negatives
FN = conf_matrices[1,0]

# Sensitivity: share of actual positives predicted positive
# (overwrites the value computed for the baseline model)
sens_log_pca=TP/(TP+FN)

# Specificity: share of actual negatives predicted negative
spec_log_pca=TN/(TN+FP)

print("Sensitivity:" ,sens_log_pca)
print("Specificity:" ,spec_log_pca)

In [None]:
# Overall accuracy of the 54-component model on the validation split.
print("Logistic Regression accuracy with PCA: ",accuracy_score(y_test,y_pred54))

In [None]:
# Tuning hyper parameters 
# Importing libraries for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Creating KFold object with 5 splits
folds = KFold(n_splits=5, shuffle=True, random_state=4)

# Candidate inverse regularisation strengths
params = {"C": [0.01, 0.1, 1, 10, 100, 1000]}

# Score on recall because sensitivity matters more than accuracy here.
# max_iter=500 matches the other logistic models in this notebook, so the
# weakly regularised candidates (large C) get the same iteration budget
# instead of the library default.
# NOTE(review): X_tr_pca_1 was oversampled with SMOTE before this
# cross-validation, so the validation folds contain synthetic rows and the CV
# recall is likely optimistic compared with the held-out split.
model_cv = GridSearchCV(estimator = LogisticRegression(max_iter=500),
                        param_grid = params, 
                        scoring= 'recall', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

# Fit the model
model_cv.fit(X_tr_pca_1, y_tr)


In [None]:
# results of grid search CV
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

In [None]:
# Mean cross-validated recall (test folds vs. training folds) for each C,
# drawn on a logarithmic x axis.
plt.figure(figsize=(8, 6))
plt.plot(cv_results['param_C'], cv_results['mean_test_score'])
plt.plot(cv_results['param_C'], cv_results['mean_train_score'])
plt.xlabel('C')
plt.ylabel('sensitivity')
# Legend order follows the plotting order above: test curve first, then train.
plt.legend(['test result', 'train result'], loc='upper left')
plt.xscale('log')

In [None]:
# Best mean cross-validated recall and the C value that achieved it.
best_score = model_cv.best_score_
best_C = model_cv.best_params_['C']

print(" The highest test sensitivity is {0} at C = {1}".format(best_score, best_C))


#### Logistic regression with optimal C


In [None]:


def _print_binary_metrics(y_true, y_hat):
    """Print the confusion matrix, accuracy, sensitivity and specificity.

    Parameters
    ----------
    y_true : array-like of true binary labels.
    y_hat : array-like of predicted binary labels.

    Returns
    -------
    The 2x2 confusion matrix (rows: true class, columns: predicted class).
    """
    matrix = metrics.confusion_matrix(y_true, y_hat)
    print(matrix)

    tp, tn = matrix[1,1], matrix[0,0]   # true positives, true negatives
    fp, fn = matrix[0,1], matrix[1,0]   # false positives, false negatives

    print("Accuracy:-",metrics.accuracy_score(y_true, y_hat))
    print("Sensitivity:-",tp / float(tp+fn))
    print("Specificity:-", tn / float(tn+fp))
    return matrix


# Instantiate the model with best C; max_iter=500 matches the other logistic
# models in this notebook rather than relying on the library default.
logistic_pca = LogisticRegression(C=best_C, max_iter=500)

# Fit the model on the (resampled, PCA-projected) train set
log_pca_model = logistic_pca.fit(X_tr_pca_1, y_tr)

# Predictions and metrics on the train set
y_train_pred = log_pca_model.predict(X_tr_pca_1)
confusion = _print_binary_metrics(y_tr, y_train_pred)

# Predictions and metrics on the held-out validation split
y_test_pred = log_pca_model.predict(X_test_pca)
confusion = _print_binary_metrics(y_test, y_test_pred)

# Keep the validation-split counts available under their original names.
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

### Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold

# random forest - the class weight is used to handle class imbalance - it adjusts the cost function
# NOTE(review): y_tr has already been oversampled with SMOTE, so this weighting
# is applied on top of the resampling; confirm that both are intended.
# Parallelism is left to the grid search below: requesting 500 workers on both
# the forest and the search nests two worker pools and oversubscribes the CPU.
# A fixed random_state makes the fitted forests reproducible.
forest = RandomForestClassifier(class_weight={0:0.1, 1: 0.9}, random_state=4)

# hyperparameter space
params = {"criterion": ['gini', 'entropy'], "max_features": ['sqrt', 0.4]}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object; n_jobs=-1 uses all available processors
model = GridSearchCV(estimator=forest, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)


# fit model
model.fit(X_tr_pca_2, y_tr)


