# Description
This is a Python program that predicts customer churn using scikit-learn (sklearn).

In [1]:
#import the libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the Telco customer-churn CSV (path is relative to the working directory)
# into a DataFrame.
churn_csv = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(churn_csv)

# Preview the first seven rows.
df.head(n=7)

FileNotFoundError: [Errno 2] File WA_Fn-UseC_-Telco-Customer-Churn.csv does not exist: 'WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [None]:
# Show the dimensions of the data set as a (rows, columns) tuple
df.shape

# 7043 customers as rows and 21 data points as columns

In [None]:
# List the names of all columns in the data set
df.columns.values

In [None]:
# Count the missing (NA) values in each column
df.isnull().sum()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the data set
df.describe()

In [None]:
# Count how many customers fall into each Churn category
df.Churn.value_counts()

In [None]:
# Visualize the count of customers who churned vs. stayed.
# The column is named through x= and the frame passed through data=, matching
# the other count plots in this notebook: newer seaborn releases accept only
# `data` positionally, so a bare positional Series is not read as the x variable.
sns.countplot(x='Churn', data=df)

In [None]:
# What percentage of customers stayed, and what percentage left?
num_retained = df[df.Churn == 'No'].shape[0]
num_churned = df[df.Churn == 'Yes'].shape[0]

# Both percentages share the same denominator, so compute it once.
num_customers = num_retained + num_churned

# Print the percentage of customers that stayed
print(num_retained / num_customers * 100, '% of customers stayed with the company')

# Print the percentage of customers that left
# (the message previously read "left with the company", which contradicts itself)
print(num_churned / num_customers * 100, '% of customers left the company')

In [None]:
# Visualize the churn count for male and female customers
sns.countplot(data=df, x='gender', hue='Churn')

It seems gender has **no impact on churn**, as the ratio of retained to churned customers is the same for males and females, so gender bias is not the reason customers leave.

In [None]:
# Visualize the churn count for each type of internet service
sns.countplot(data=df, x='InternetService', hue='Churn')

Here's our first insight that will make an impact when proving your case. It seems customers that were using **DSL service were not satisfied** and left while customers that were using **Fiber Optic stayed with the company**.

In [None]:
# Overlay, for each numerical feature, the distribution of customers who
# stayed against the distribution of customers who churned.
numerical_features = ['tenure', 'MonthlyCharges']
fig, ax = plt.subplots(1, 2, figsize=(28, 8))

# (Churn value, bar color, legend label) for the two groups being compared.
churn_groups = [('No', 'blue', 'Stayed'), ('Yes', 'orange', 'Churned')]

# Each feature is drawn on an explicitly chosen axis (left: first feature,
# right: second) instead of relying on how DataFrame.hist maps columns onto an
# array of axes, and every panel carries its own title, labels and legend so
# the figure can be read without the surrounding text.
for axis, feature in zip(ax, numerical_features):
    for churn_value, color, label in churn_groups:
        df.loc[df.Churn == churn_value, feature].hist(
            bins=20, color=color, alpha=0.5, ax=axis, label=label
        )
    axis.set_title(feature)
    axis.set_xlabel(feature)
    axis.set_ylabel('Number of customers')
    axis.legend(title='Churn')

- **Left is the tenure histogram; right is the monthly charges histogram.**
- **Blue bars are the customers that stayed and orange bars are the customers that churned.**

**When inspecting Monthly Charges:**
When inspecting the monthly charges, we can conclude that customers paying between 20 and 30 rarely churn, whereas the churn count is a lot higher for customers paying between 70 and 100.

**When inspecting Tenure:**
When inspecting the tenure, we can conclude that most customers who churn do so in the first month, whereas the fewest customers churn at long tenures of between 65 and 70 months.

In [None]:
# Remove the customerID column, which is not needed for the analysis
cleaned_df = df.drop(columns=['customerID'])

# Preview the first five rows of the cleaned data set
cleaned_df.head()

In [None]:
# Convert every non-numeric column to integer codes so the model can use it.
for column in cleaned_df.columns:
    # Comparing a dtype with the abstract `np.number` is not a reliable numeric
    # test (NumPy deprecated treating it as a concrete dtype), so integer or
    # float columns could be label-encoded by mistake and lose their numeric
    # meaning. pandas' dtype predicate covers all numeric dtypes.
    if pd.api.types.is_numeric_dtype(cleaned_df[column]):
        continue
    # NOTE(review): a column holding numbers stored as text (object dtype) is
    # still label-encoded here -- confirm no column needs pd.to_numeric first.
    cleaned_df[column] = LabelEncoder().fit_transform(cleaned_df[column])

# Look at the data types of the cleaned data set
cleaned_df.dtypes

In [None]:
# Preview the first five rows of the encoded data set
cleaned_df.head()

In [None]:
# Separate the target from the features, then scale the features

y = cleaned_df['Churn']  # target labels
x = cleaned_df.drop(columns=['Churn'])  # feature data set

# Replace the feature frame with its standardized values
x = StandardScaler().fit_transform(x)

In [None]:
# Split the data into 80% training and 20% testing.
# The split is made on the *unscaled* features (re-derived from cleaned_df)
# and the scaler is then fit on the training rows only. Fitting the scaler on
# every row before splitting lets test-set statistics leak into the data the
# model is trained on. The row partition itself is unchanged: same number of
# rows, same test_size and random_state.
features = cleaned_df.drop('Churn', axis=1)
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training split, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(features_train)
x_train = scaler.transform(features_train)
x_test = scaler.transform(features_test)

In [None]:
# Create the model: a logistic-regression classifier with default settings
model = LogisticRegression()

# Train the model on the training split
model.fit(x_train, y_train)

In [None]:
# Predict the churn label for every row of the test data
predictions = model.predict(x_test)

# Print the predictions
print(predictions)

In [None]:
# Report the precision, recall and F1-score of the predictions against the true test labels
print(classification_report(y_test, predictions))