In [207]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer as s
from sklearn.impute import KNNImputer as knn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [208]:
# Load the customer-churn table from a CSV located in the working directory.
data=pd.read_csv("telecom_customer_churn.csv")


In [209]:
# Preview the first rows to sanity-check what was loaded.
data.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,1,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,1,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,0,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,0,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,0,Dissatisfaction,Network reliability


In [210]:
class Utils_Suite():
    """Feature-screening diagnostics computed on a DataFrame.

    Every method returns a small diagnostic table; none of them modifies
    ``self.data``.
    """

    def __init__(self,data):
        # Only a reference is stored; the frame is not copied here.
        self.data=data

    def compute_correlation(self,threshold=0.3,target="Customer Status"):
        """Return correlations with ``target`` whose magnitude is below ``threshold``.

        Parameters
        ----------
        threshold : float
            Exclusive bound; a column is kept when ``-threshold < r < threshold``.
        target : str
            Name of the column the other columns are correlated against.

        Returns
        -------
        pandas.Series
            Named ``target`` and indexed by the weakly correlated column names.
            Undefined (NaN) correlations never satisfy the bounds and are omitted.

        Raises
        ------
        KeyError
            If ``target`` is not among the numeric/boolean columns.
        """
        # Select numeric columns explicitly: DataFrame.corr() raises on string
        # columns in pandas >= 2.0, while older releases silently skipped them.
        numeric=self.data.select_dtypes(include=["number","bool"])
        target_corr=numeric.corr()[target]
        return target_corr[(target_corr<threshold)&(target_corr>-threshold)]

    def compute_mutual_information(self,thresh=0.1,target='Customer Status'):
        """Return the columns whose mutual-information score with ``target`` is below ``thresh``.

        Every column of ``self.data`` is ordinal-encoded and scored with
        ``mutual_info_regression``. The result is a one-column frame
        (``'Score'``) indexed by column name.
        """
        # NOTE(review): the target column is itself part of the encoded feature
        # matrix, so it is scored against itself - confirm this is intended.
        # NOTE(review): no random_state is passed, so scores may differ between
        # runs - confirm against the estimator's documentation.
        enc = OrdinalEncoder()
        df_encoded = enc.fit_transform(self.data)
        mi_scores = mutual_info_regression(df_encoded, self.data[target])
        mi_scores_df = pd.DataFrame(mi_scores, index=self.data.columns, columns=['Score'])
        return mi_scores_df[mi_scores_df['Score']<thresh]

    def compute_vif(self):
        """Return a variance-inflation-factor table for all columns except the last.

        The last column of ``self.data`` is left out (it is assumed to be the
        target; its position is not checked) and a constant ``'intercept'``
        column is appended before the factors are computed.

        Returns
        -------
        pandas.DataFrame
            Columns ``'variable'`` and ``'vif'``, one row per feature plus one
            row for ``'intercept'``.
        """
        # Explicit copy: adding the constant column must never touch self.data
        # or trigger a chained-assignment warning on a slice of it.
        features=self.data.iloc[:,:-1].copy()
        features['intercept']=1
        # Build the design matrix once instead of once per column.
        design=features.values
        vif=pd.DataFrame()
        vif['variable']=features.columns
        vif['vif']=[variance_inflation_factor(design,i) for i in range(design.shape[1])]
        return vif
    


In [211]:
# Confirm the dtype of the 'Customer Status' column.
data['Customer Status'].dtypes

dtype('int64')

In [212]:
class PreProcess():
    """Preprocessing pipeline for the churn table; the pipeline runs on construction.

    The working frame lives in ``self.data``. Each step replaces or updates it,
    and ``write_df`` returns the result. ``'Customer Status'`` is treated as the
    target column throughout.
    """

    def __init__(self,data):
        """Copy ``data`` and immediately run the full pipeline on the copy."""
        # Copy first: StdScale and Label_Encoding assign columns in place, which
        # would otherwise modify the caller's frame whenever no earlier step had
        # already replaced self.data with a new object.
        self.data=data.copy()
        self.run()

    def run(self):
        """Execute the preprocessing steps in their fixed order."""
        self.ClearNull(threshold=0.5)
        # Report-only: lists float columns that still contain nulls. knn_impute,
        # outlier_remove and one_hot_encoding exist but are not part of this
        # schedule.
        self.get_all_Null(dtype='float64')
        self.StdScale()
        self.drop_correlation()
        self.drop_uniq_thresh(thresh=5)
        self.Label_Encoding()
        self.drop_vif(thresh=3)

    def _object_columns(self):
        """Return the names of the columns whose dtype is ``object``."""
        dtypes=self.data.dtypes
        return list(dtypes[dtypes=="object"].index)

    def drop_correlation(self,thresh=0.1):
        """Drop columns whose correlation with 'Customer Status' lies strictly inside (-thresh, thresh).

        Only numeric/boolean columns are considered; object columns are left
        for the later encoding steps.
        """
        # Hand over numeric columns only, so the correlation helper never has
        # to deal with string columns.
        numeric=self.data.select_dtypes(include=["number","bool"])
        weak=Utils_Suite(numeric).compute_correlation(thresh)
        self.data=self.data.drop(columns=list(weak.index))

    def ClearNull(self,threshold):
        """Drop every column whose fraction of nulls exceeds ``threshold``.

        Each dropped column is printed together with its null count.
        """
        null_counts=self.data.isna().sum()
        n_rows=len(self.data)
        to_drop=[]
        for col,n_null in null_counts.items():
            # The n_null > 0 guard also avoids dividing by zero on an empty frame.
            if n_null>0 and n_null/n_rows>threshold:
                print(col,n_null)
                to_drop.append(col)
        self.data=self.data.drop(columns=to_drop)

    def knn_impute(self,n_neighbors,col_list):
        """Fill nulls in ``col_list`` with a KNN imputer fitted on those columns together.

        Parameters
        ----------
        n_neighbors : int
            Number of neighbours passed to the imputer.
        col_list : list of str
            Numeric columns to impute; an empty list is a no-op.
        """
        cols=list(col_list)
        if not cols:
            return
        imputer=knn(n_neighbors=n_neighbors)
        # Assign the whole imputed block back. The previous per-column version
        # kept only element [0][0] of the result and broadcast that single
        # number over the entire column.
        # NOTE(review): assumes none of the columns is entirely null; the
        # imputer may drop such a column from its output - confirm.
        self.data[cols]=imputer.fit_transform(self.data[cols])

    def get_all_Null(self,dtype=""):
        """Return (and print) the columns of ``self.data`` that contain nulls and have dtype ``dtype``.

        Each match is printed together with its null count.
        """
        null_counts=self.data.isna().sum()
        found=[]
        for col,n_null in null_counts.items():
            # Inspect self.data, not a notebook-level variable, so the result
            # reflects the frame this instance is actually processing.
            if n_null>0 and self.data[col].dtypes==dtype:
                print(col,n_null)
                found.append(col)
        return found

    def outlier_remove(self,col):
        """Keep only the rows whose ``col`` value lies within 1.5 * IQR of the quartiles."""
        q1=self.data[col].quantile(0.25)
        q3=self.data[col].quantile(0.75)
        iqr=q3-q1
        l_whis=q1-1.5*iqr
        u_whis=q3+1.5*iqr
        self.data=self.data[(self.data[col]>=l_whis)&(self.data[col]<=u_whis)]

    def one_hot_encoding(self):
        """Replace every object column with ``get_dummies`` indicators (first level dropped).

        The list of encoded columns is printed before encoding starts.
        """
        obj_list=self._object_columns()
        print(obj_list)
        for col in obj_list:
            dummy=pd.get_dummies(self.data[col],prefix=col,drop_first=True)
            self.data=self.data.drop(col,axis=1).join(dummy)

    def drop_uniq_thresh(self,thresh=5):
        """Drop object columns that have more than ``thresh`` distinct values.

        Null counts as one distinct value. Every inspected column name is
        printed, followed by the list of dropped columns.
        """
        droplist=[]
        for col in self._object_columns():
            print(col)
            if len(self.data[col].unique())>thresh:
                droplist.append(col)
        print(droplist)
        self.data=self.data.drop(columns=droplist)

    def Label_Encoding(self):
        """Integer-encode every remaining object column with a LabelEncoder."""
        label_encoder=preprocessing.LabelEncoder()
        for col in self._object_columns():
            # fit_transform refits the encoder for each column.
            self.data[col]=label_encoder.fit_transform(self.data[col])

    def StdScale(self):
        """Standardise every non-object column except 'Customer Status'."""
        for col in self.data.columns:
            if self.data[col].dtypes!='object' and col!='Customer Status':
                scaled=StandardScaler().fit_transform(self.data[[col]])
                # The scaler returns an (n, 1) array; flatten it for assignment.
                self.data[col]=scaled.ravel()

    def drop_vif(self,thresh=5,col_Spare=['Customer Status','intercept']):
        """Repeatedly drop the column with the highest VIF above ``thresh``.

        After every drop the factors are recomputed on the remaining columns.
        Columns named in ``col_Spare`` are never dropped. The loop ends when no
        droppable column exceeds ``thresh``.

        The VIF table is recomputed from scratch on each pass, so this step is
        expensive on wide frames. ``col_Spare`` is only read, never modified.
        Keep ``'intercept'`` in it: that row is added by the VIF helper and is
        not a column of ``self.data``.
        """
        while True:
            vif=Utils_Suite(self.data).compute_vif()
            droppable=vif[(vif["vif"]>thresh)&(~vif["variable"].isin(col_Spare))]
            if droppable.empty:
                break
            # Stable sort: ties are resolved in column order.
            droppable=droppable.sort_values(by='vif',kind='mergesort',ascending=False)
            worst=droppable.iloc[0,0]
            self.data=self.data.drop(worst,axis=1)

    def write_df(self):
        """Return the processed frame held by this instance."""
        return self.data
    

In [213]:
# Constructing PreProcess runs the whole pipeline immediately (its __init__ calls run()).
# NOTE(review): the name `x` is rebound to the feature matrix in a later cell.
x=PreProcess(data)

Churn Category 4720
Churn Reason 4720
Avg Monthly Long Distance Charges 644
Avg Monthly GB Download 1344
Customer ID
Gender
Married
City
Offer
Phone Service
Multiple Lines
Internet Service
Internet Type
Online Security
Online Backup
Device Protection Plan
Premium Tech Support
Streaming TV
Streaming Movies
Streaming Music
Unlimited Data
Contract
Paperless Billing
Payment Method
['Customer ID', 'City', 'Offer']


Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Multiple Lines,Internet Type,Online Backup,Streaming TV,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Long Distance Charges,Customer Status
0,0,-0.579600,1,-0.491619,-0.006926,0,0,1,1,1,1,1,1,0.018307,-0.487965,1
1,1,-0.045161,0,-0.491619,-0.668349,1,0,0,0,0,0,0,1,-2.219753,-0.822156,1
2,1,0.192368,0,-0.491619,-0.668349,0,2,0,0,1,0,1,0,0.285202,-0.777187,0
3,1,1.855067,1,-0.491619,-0.337638,0,2,1,1,1,0,1,0,1.060162,-0.511217,0
4,0,1.676921,1,-0.491619,0.323785,0,2,0,1,1,0,1,1,0.606762,-0.908919,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,0,-0.638982,0,-0.491619,-0.668349,0,3,2,2,2,0,0,0,-1.417460,-0.925436,0
6585,0,-1.589096,0,-0.491619,-0.668349,0,1,0,0,1,1,0,1,-0.317724,-0.224021,1
6586,1,-0.401453,1,-0.491619,-0.337638,1,2,0,0,1,0,1,0,0.645349,-0.517378,0
6587,1,-1.529714,1,-0.491619,0.985207,0,0,0,0,1,2,0,1,0.090658,-0.768472,1


In [198]:
# Rebind `data` to the processed frame held by the PreProcess instance.
# NOTE(review): this overwrites the loaded frame, so re-running the PreProcess
# cell afterwards would process already-processed data; reload the CSV first.
data=x.write_df()

In [199]:
# List the columns that remain after preprocessing.
data.columns

Index(['Gender', 'Age', 'Married', 'Number of Dependents',
       'Number of Referrals', 'Multiple Lines', 'Internet Type',
       'Online Backup', 'Streaming TV', 'Unlimited Data', 'Contract',
       'Paperless Billing', 'Payment Method', 'Monthly Charge',
       'Total Long Distance Charges', 'Customer Status'],
      dtype='object')

In [200]:
from sklearn.model_selection import train_test_split

In [201]:


# Features are every column but the last; the last column is the target.
x = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)



In [202]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with its default settings on the training split.
reg=LogisticRegression()
reg.fit(x_train,y_train)

LogisticRegression()

In [203]:


from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

# Predict on the held-out split, then tabulate actual classes (rows) against
# predicted classes (columns).
y_pred=reg.predict(x_test)
confusion_matrix(y_test,y_pred)



array([[ 384,  169],
       [ 200, 1224]])

In [204]:
# Fraction of held-out rows whose predicted label matches the actual label.
accuracy_score(y_test,y_pred)

0.8133535660091047