In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt

In [None]:
# Location of the raw churn CSV (presumably the Colab working directory -- adjust when running elsewhere).
TELCO_CSV_PATH = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
telco_base_data = pd.read_csv(TELCO_CSV_PATH)

In [None]:
# Preview the first rows of the raw data.
telco_base_data.head()

In [None]:
# Report the dtype pandas inferred for the TotalCharges column.
print(telco_base_data['TotalCharges'].dtype)

In [None]:
 # Shape of the raw data as (rows, columns).
 telco_base_data.shape

In [None]:
# A notebook cell only echoes its final expression, so the column names are
# printed explicitly; as a bare expression they were silently discarded.
print(telco_base_data.columns.values)

# dtype of every column (echoed as the cell's output).
telco_base_data.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
telco_base_data.describe()

In [None]:
# Visualise the churn ratio: horizontal bars with the row count of each Churn class.
churn_class_counts = telco_base_data['Churn'].value_counts()
churn_ax = churn_class_counts.plot(kind='barh', figsize=(7, 5))
churn_ax.set_xlabel("Count", labelpad=14)
churn_ax.set_ylabel("Target Variable", labelpad=14)
churn_ax.set_title("Count of Target Variable per category", y=1.02)

In [None]:
# Number of rows in each Churn class, most frequent first.
cout= telco_base_data["Churn"].value_counts()

In [None]:
# Print each Churn class's share of all rows, as a percentage.
total_rows = len(telco_base_data["Churn"])
for class_count in cout:
    print((class_count / total_rows) * 100)

In [None]:
# For imbalanced data we use upsampling and downsampling

In [None]:
# Full per-column summary: dtype and non-null count for every column.
telco_base_data.info(verbose=True)

In [None]:
# Percentage of missing values per column, as a two-column frame:
#   'index'              -> original column name
#   'missing_percentage' -> share of null entries in that column (0-100)
# The value column is named up front so no in-place rename (and no second
# debug print of the half-finished frame) is needed.
missing = (
    (telco_base_data.isnull().sum() * 100 / telco_base_data.shape[0])
    .rename('missing_percentage')
    .reset_index()
)
print(missing)

fig, ax = plt.subplots(figsize=(16, 5))
sns.pointplot(x='index', y='missing_percentage', data=missing, ax=ax)
# Column names are long; rotate them so the tick labels do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Column')
ax.set_ylabel('Missing values (%)')
ax.set_title('Percentage of missing values per column')
plt.show()


In [None]:
import pandas as pd

# Scratch example illustrating DataFrame.reset_index; it does not use the Telco data.
# Sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 28]}
df = pd.DataFrame(data, index=['a', 'b', 'c'])

# The snippets below are commented out and do not run; they are kept as reference only.

# # 1. Resetting the index, keeping the original index as a column
# df_reset = df.reset_index()
# print(df_reset)

# # 2. Resetting the index, dropping the original index
# df_reset_drop = df.reset_index(drop=True)
# print(df_reset_drop)

# # 3. Resetting a MultiIndex
# # (Assume df has a MultiIndex)
# df_reset_level = df.reset_index(level=1)

# # 4. Setting the name of the new column
# # NOTE: `name=` is only accepted by Series.reset_index; DataFrame.reset_index
# # takes `names=` instead (pandas >= 1.5), so this call would raise a TypeError as written.
# df_reset_name = df.reset_index(name='OriginalIndex')
# print(df_reset_name)

# # 5. Modifying the original DataFrame directly
# df.reset_index(inplace=True)
# print(df)

In [None]:
# Data cleaning

In [None]:
# Create a copy of the base data for manipulation & processing,
# so the raw frame stays untouched by the cleaning steps.
telco_data= telco_base_data.copy()

In [None]:
# Convert TotalCharges to numbers; entries that cannot be parsed become NaN.
telco_data['TotalCharges'] = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')
# Per-column count of missing values after the conversion.
telco_data.isna().sum()

In [None]:
# Show only the rows whose TotalCharges is missing. The filtered frame is the
# cell's last expression so the notebook renders it; previously the selection
# was discarded and the entire DataFrame was printed instead.
telco_data.loc[telco_data['TotalCharges'].isnull()]

In [None]:
 # Drop every row that contains at least one missing value, modifying telco_data in place.
 telco_data.dropna(how= 'any', inplace= True)

In [None]:
# df.fillna(0)  # Fills all NaN values with 0
# df['Age'].fillna(df['Age'].mean())  # Fills NaN in 'Age' with the mean age

In [None]:
# Largest tenure value present in the cleaned data.
print(telco_data['tenure'].max())

In [None]:
# Bucket tenure into 12-wide groups labelled "1 - 12", "13 - 24", ..., "61 - 72".
# The template must reference both format arguments: "{0} - {0}" repeated the
# lower bound and produced "1 - 1", "13 - 13", ... as labels.
labels = ["{0} - {1}".format(i, i + 11) for i in range(1, 72, 12)]
# right=False makes every bin half-open, [start, start + 12), so the six bin
# edges 1, 13, ..., 73 line up with the six labels above.
telco_data['tenure_group'] = pd.cut(telco_data.tenure, range(1, 80, 12), right=False, labels=labels)

In [None]:
# Number of rows that fall into each tenure group.
telco_data['tenure_group'].value_counts()

In [None]:
# Remove the identifier column and the raw tenure column (tenure_group stays).
# `columns=` already selects the axis, so no separate `axis` argument is passed.
telco_data.drop(columns=['customerID', 'tenure'], inplace=True)

In [None]:
# Preview the first rows of the cleaned data.
telco_data.head()

In [None]:
# One count plot per predictor, split by Churn. The target itself and the two
# charge columns are left out of the loop.
predictor_columns = telco_data.drop(columns=['Churn', 'TotalCharges', 'MonthlyCharges']).columns
for figure_number, column_name in enumerate(predictor_columns):
    plt.figure(figure_number)
    gen = sns.countplot(data=telco_data, y=column_name, hue='Churn')


In [None]:
# Encode the target as integers: 1 where Churn is 'Yes', 0 for anything else.
telco_data['Churn'] = np.where(telco_data['Churn'].eq('Yes'), 1, 0)

In [None]:
# Preview the data after the Churn column was converted to 0/1.
telco_data.head()

In [None]:
# One-hot encoding --> used to convert categorical data into numeric columns.
# Taking geography as an example: there may be many countries; for each country we create a new column and put 1 if it is the row's location, otherwise 0.
# A related pitfall is the dummy variable trap.

In [None]:
# One-hot encode the object/category columns; columns of other dtypes are passed through unchanged.
telco_data_dumies= pd.get_dummies(telco_data)

In [None]:
# Preview the first rows of the one-hot-encoded data.
telco_data_dumies.head()

In [None]:
# Scatter MonthlyCharges against TotalCharges; fit_reg=False leaves out the regression line.
sns.lmplot(data= telco_data_dumies, x= 'MonthlyCharges', y='TotalCharges', fit_reg= False)

In [None]:
# Compare the MonthlyCharges density of non-churned (Churn == 0) and churned
# (Churn == 1) customers on a single set of axes.
plt.figure()
# `fill=True` is the current spelling of the deprecated `shade=True` argument,
# which seaborn warns about and has scheduled for removal.
Mth = sns.kdeplot(telco_data_dumies.MonthlyCharges[(telco_data_dumies['Churn'] == 0)], color='Red', fill=True)
Mth = sns.kdeplot(telco_data_dumies.MonthlyCharges[(telco_data_dumies['Churn'] == 1)], color='Blue', fill=True)
# Legend entries follow the drawing order above: non-churned first, churned second.
Mth.legend(["No Churn", "Churn"], loc='upper right')
Mth.set_ylabel('Density')
Mth.set_xlabel('Monthly Charges')
Mth.set_title('Monthly charges by churn')

In [None]:
# Bar chart of every encoded feature's correlation with Churn, strongest positive first.
plt.figure(figsize=(20, 8))
churn_correlations = telco_data_dumies.corr()['Churn']
churn_correlations.sort_values(ascending=False).plot.bar()

In [None]:
# 2x2 Pearson correlation matrix of MonthlyCharges and TotalCharges;
# the off-diagonal entries hold the correlation between the two columns.
correlation_matrix= np.corrcoef(telco_data_dumies['MonthlyCharges'], telco_data_dumies['TotalCharges'])
print(correlation_matrix)

In [None]:
# Heatmap of the full correlation matrix of the encoded features.
plt.figure(figsize=(12, 12))
# Correlations are continuous values in [-1, 1] with a meaningful midpoint at 0,
# so a diverging colormap pinned to that range is used. The qualitative
# "Paired" palette assigns unrelated colours to neighbouring values and hides
# both the sign and the magnitude of the correlation.
sns.heatmap(telco_data_dumies.corr(), cmap="coolwarm", vmin=-1, vmax=1, center=0)

In [None]:
# Split the cleaned data by target value: 0 = not churned, 1 = churned.
new_df1_target0 = telco_data[telco_data["Churn"].eq(0)]
new_df1_target1 = telco_data[telco_data["Churn"].eq(1)]

In [None]:

def uniplot(df, col, title, hue=None):
    """Draw a log-scaled count plot of ``df[col]``, optionally split by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    col : str
        Column whose categories are placed on the x-axis; the bars are
        ordered by descending frequency.
    title : str
        Title drawn above the axes.
    hue : str, optional
        Column used to split every category into coloured sub-bars.

    Notes
    -----
    The function changes seaborn's global style/context and three matplotlib
    ``rcParams`` entries, so figures drawn afterwards inherit those settings.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    sns.set_style('whitegrid')
    sns.set_context('talk')
    plt.rcParams["axes.labelsize"] = 20
    plt.rcParams['axes.titlesize'] = 22
    plt.rcParams['axes.titlepad'] = 30

    # Count the levels actually present in the hue column. Wrapping the column
    # *name* in a Series always yielded exactly one unique value, so the figure
    # width never grew with the number of hue levels.
    n_hue_levels = df[hue].nunique() if hue is not None else 0
    width = len(df[col].unique()) + 7 + 4 * n_hue_levels

    fig, ax = plt.subplots(figsize=(width, 8))
    # Axis-level tick rotation persists when seaborn installs the category ticks.
    ax.tick_params(axis='x', labelrotation=45)
    # Log scale keeps rare categories visible next to dominant ones.
    ax.set_yscale('log')
    ax.set_title(title)
    sns.countplot(data=df, x=col, order=df[col].value_counts().index,
                  hue=hue, palette='bright', ax=ax)

    plt.show()


In [None]:
# Partner counts among churned customers, split by gender. The title names the
# plotted column (Partner), matching the other uniplot calls; it previously
# said "Gender", which is only the hue.
uniplot(new_df1_target1, col='Partner', title='Distribution of Partner for Churned Customers', hue='gender')

In [None]:
# Partner counts among non-churned customers, split by gender. The title names
# the plotted column (Partner), matching the other uniplot calls; it previously
# said "Gender", which is only the hue.
uniplot(new_df1_target0, col='Partner', title='Distribution of Partner for Non Churned Customers', hue='gender')

In [None]:
# Payment-method counts among churned customers, split by gender.
uniplot(
    new_df1_target1,
    col='PaymentMethod',
    title='Distribution of PaymentMethod for Churned Customers',
    hue='gender',
)

In [None]:
# Contract-type counts among churned customers, split by gender.
uniplot(
    new_df1_target1,
    col='Contract',
    title='Distribution of Contract for Churned Customers',
    hue='gender',
)


In [None]:
# Tech-support counts among churned customers, split by gender.
uniplot(
    new_df1_target1,
    col='TechSupport',
    title='Distribution of TechSupport for Churned Customers',
    hue='gender',
)


In [None]:
# Senior-citizen flag counts among churned customers, split by gender.
uniplot(
    new_df1_target1,
    col='SeniorCitizen',
    title='Distribution of SeniorCitizen for Churned Customers',
    hue='gender',
)


In [None]:
# Save the one-hot-encoded frame to the working directory. With the default
# index=True the row index is written as an unnamed first column.
telco_data_dumies.to_csv('tel_churn.csv')
