In [29]:
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
# Load the raw customer table from a CSV file; the relative path is resolved
# against the notebook's working directory.
dataset=pd.read_csv('dataset1.csv')

In [30]:
# Display the column labels of the freshly loaded frame.
dataset.columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value',
       'Churn Score', 'CLTV', 'Churn Reason'],
      dtype='object')

In [31]:
# Remove the customer identifier, the coordinate columns and every churn-related
# column other than 'Churn Value' (which is kept as the prediction target).
dataset = dataset.drop(
    columns=[
        'CustomerID',
        'Lat Long',
        'Latitude',
        'Longitude',
        'Churn Label',
        'Churn Score',
        'CLTV',
        'Churn Reason',
    ]
)

In [32]:
# Give the target column the shorter name 'Churn'.
# The result is reassigned instead of using inplace=True so the cell produces a
# new frame from its input and can be re-run safely (labels that are absent are
# ignored by rename's default errors='ignore').
dataset = dataset.rename(columns={'Churn Value': 'Churn'})


In [33]:
# NOTE(review): `clf` is reassigned with different hyperparameters in the later
# training cell and is neither fitted nor used before that in the visible
# cells, so this instance looks like a leftover. It also sets no random_state,
# so it would not be reproducible if it were ever fitted.
clf=RandomForestClassifier(n_estimators=100)


In [34]:
# NOTE(review): this repeats the rename performed two cells earlier. Once that
# cell has run, 'Churn Value' no longer exists and this call changes nothing
# (rename ignores missing labels by default), so the cell is a candidate for
# removal.
# Reassignment is used instead of inplace=True so the cell stays re-runnable.
dataset = dataset.rename(columns={'Churn Value': 'Churn'})

In [35]:
# Treat cells that contain exactly one space (' ') as missing, then discard
# every row that has at least one missing value.
dataset = dataset.replace({' ': np.nan}).dropna(axis=0)


In [36]:
# Report the number of distinct values per column: the column name, its count
# and an empty separator line, each on its own line.
for name in dataset.columns:
    print(name, dataset[name].nunique(), '', sep='\n')

Count
1

Country
1

State
1

City
1129

Zip Code
1652

Gender
2

Senior Citizen
2

Partner
2

Dependents
2

Tenure Months
72

Phone Service
2

Multiple Lines
3

Internet Service
3

Online Security
3

Online Backup
3

Device Protection
3

Tech Support
3

Streaming TV
3

Streaming Movies
3

Contract
3

Paperless Billing
2

Payment Method
4

Monthly Charges
1584

Total Charges
6530

Churn
2



In [37]:
# A column with exactly one distinct value cannot help separate the classes,
# so collect all such columns first and remove them in a single drop.
constant_columns = [name for name in dataset.columns if dataset[name].nunique() == 1]
dataset = dataset.drop(columns=constant_columns)

In [38]:
# Standardise the two charge columns with StandardScaler (zero mean, unit
# variance by default).
scaling_columns = ["Monthly Charges", "Total Charges"]

# "Total Charges" appears to still be string-typed at this point: blank strings
# were replaced by NaN in an earlier cell and the float cast only happens in a
# later cell. Convert explicitly here so the scaler receives numeric input
# instead of relying on implicit string-to-float coercion.
dataset = dataset.astype({name: float for name in scaling_columns})

# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed in a later cell, so held-out rows contribute to the mean and
# variance used for scaling.
scaler = StandardScaler()
dataset[scaling_columns] = scaler.fit_transform(dataset[scaling_columns])

In [39]:
# Store zip codes as strings so they are handled as labels rather than numbers
# (the later encoding loop selects columns by object dtype).
dataset = dataset.astype({"Zip Code": str})

In [40]:
# Ensure "Total Charges" has a float dtype.
# NOTE(review): this cast comes after the scaling cell that already consumed the
# column - confirm whether it is still needed at this position.
dataset = dataset.astype({"Total Charges": float})

In [90]:
# Separate the target from the candidate features.
y = dataset['Churn']
X = dataset.drop(columns=['Churn'])

# Discard the features this model does not use *before* encoding, so no work is
# spent encoding columns that are thrown away immediately afterwards.
X = X.drop(columns=['Phone Service', 'Gender', 'Senior Citizen', 'Partner',
                    'Multiple Lines', 'Paperless Billing'])

# Ordinal-encode every remaining object-dtype column with ONE fitted encoder.
# Fitting a fresh encoder per column re-processed the full frame each time and
# kept only the last column's mapping in `encoder`; a single encoder yields the
# same encoded values and retains all mappings for transforming new data.
categorical_columns = [name for name in X.columns if X[name].dtype == 'object']
if categorical_columns:
    encoder = OrdinalEncoder(cols=categorical_columns)
    X = encoder.fit_transform(X)

# NOTE(review): the encoder is fitted on all rows before the split below, so
# categories that occur only in the held-out rows are already known to it.
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
# Fit a seeded, depth-limited random forest on the training split (fit returns
# the estimator itself), then show its mean accuracy on the held-out split.
clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
clf.score(X_test, y_test)

0.8407960199004975

In [94]:
# Pair every feature name with the importance the fitted forest assigned to it
# and list the pairs from most to least important.
zip_importance = sorted(
    zip(X.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
zip_importance

[('Zip Code', 0.22438316933077412),
 ('Tenure Months', 0.12460147274686122),
 ('Contract', 0.12182202607645302),
 ('City', 0.10326912217839834),
 ('Total Charges', 0.08670216534438084),
 ('Monthly Charges', 0.08081361908499536),
 ('Tech Support', 0.0610149366430161),
 ('Dependents', 0.04965386771842792),
 ('Internet Service', 0.03373188403379362),
 ('Online Security', 0.02654025401441371),
 ('Payment Method', 0.023999092099365987),
 ('Device Protection', 0.019407336604764045),
 ('Online Backup', 0.017272565383057343),
 ('Streaming Movies', 0.013630557991707119),
 ('Streaming TV', 0.013157930749591415)]

In [49]:
# Accuracy of the fitted classifier on the held-out split, computed from its
# predictions (the same quantity a classifier's score() reports).
# NOTE(review): this cell's execution count (49) is lower than the training
# cell's (93) and its saved output differs from the score shown there, so the
# stored result may stem from an earlier model - re-run to confirm.
accuracy_score(y_test, clf.predict(X_test))

0.8415067519545132