In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import os

# Directory the notebook was launched from (not used by the cells shown here).
base_dir = os.getcwd()

# Folder holding the raw CSV files. Setting the CHURN_DATA_DIR environment
# variable overrides the machine-specific default below, so the notebook can
# run on another machine without editing this cell.
data_dir = os.environ.get(
    'CHURN_DATA_DIR',
    'C:/Users/HP/Desktop/Uni work/Projets/Prediction du Churn/data',
)

df_bigml_20 = pd.read_csv(os.path.join(data_dir, 'churn-bigml-20.csv'))
df_bigml_80 = pd.read_csv(os.path.join(data_dir, 'churn-bigml-80.csv'))

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it each file keeps its own row labels starting at 0, so the stacked
# frame carries duplicate index labels, which makes label-based selection and
# index alignment ambiguous.
df = pd.concat([df_bigml_20, df_bigml_80], axis=0, ignore_index=True)

# Check the shape
print("Combined dataset shape:", df.shape)
df.head()

Combined dataset shape: (3333, 20)


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [3]:
# Encode the Churn target as integer class labels.
# The encoder is fitted first and applied second, so `le` keeps the learned
# classes for anyone who needs to invert the encoding.
le = LabelEncoder()
le.fit(df['Churn'])
df['Churn'] = le.transform(df['Churn'])

# Show the class balance of the encoded target
churn_counts = df['Churn'].value_counts()
print("Churn variable encoded (False=0, True=1):")
print(churn_counts)

Churn variable encoded (False=0, True=1):
Churn
0    2850
1     483
Name: count, dtype: int64


In [4]:
# Handle categorical variables
# First, identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", list(categorical_cols))

# Convert 'Yes'/'No' features to binary (1/0)
yes_no_map = {'Yes': 1, 'No': 0}
for col in ['International plan', 'Voice mail plan']:
    # An already-numeric column means this cell has run before. Mapping it
    # again would turn every value into NaN, so skip it to keep the cell
    # safe to re-run.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    encoded = df[col].map(yes_no_map)

    # .map() silently yields NaN for any label missing from the mapping.
    # Fail loudly instead of letting a later fillna turn those into 0.
    unmapped = df.loc[encoded.isna().to_numpy(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {list(unmapped)}"
        )

    df[col] = encoded.astype('int64')

Categorical columns: ['State', 'International plan', 'Voice mail plan']


In [5]:
# One-hot encode the remaining categorical feature (State).
# The indicator columns are built separately, dropping the first level, and
# then appended after the other columns in place of the original State column.
state_dummies = pd.get_dummies(df['State'], prefix='State', drop_first=True)
df = pd.concat([df.drop(columns=['State']), state_dummies], axis=1)

# Check the updated dataframe
encoded_shape = df.shape
print("DataFrame shape after encoding:", encoded_shape)
df.head()

DataFrame shape after encoding: (3333, 69)


Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,117,408,0,0,0,184.5,97,31.37,351.6,80,...,False,False,False,False,False,False,False,False,False,False
1,65,415,0,0,0,129.1,137,21.95,228.5,83,...,False,False,False,False,False,False,False,False,False,False
2,161,415,0,0,0,332.9,67,56.59,317.8,97,...,False,False,False,False,False,False,False,False,False,False
3,111,415,0,0,0,110.4,103,18.77,137.3,102,...,False,False,False,False,False,False,False,False,False,False
4,49,510,0,0,0,119.3,117,20.28,215.1,109,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Feature engineering
# Create per-call and per-minute ratio features for daytime usage.
day_calls = df['Total day calls']
day_minutes = df['Total day minutes']

# Zero denominators are masked to NaN before dividing. Otherwise a nonzero
# numerator over zero yields +/-inf, which fillna() does not catch and which
# breaks later scaling.
df['Minutes_per_call'] = day_minutes / day_calls.where(day_calls != 0)
df['Charge_per_minute'] = df['Total day charge'] / day_minutes.where(day_minutes != 0)

# Replace the undefined ratios with 0, but only in the two new columns.
# A frame-wide fillna would also overwrite missing values in unrelated
# columns and hide upstream data problems.
# NOTE(review): 0 is far from the typical Charge_per_minute value, so these
# rows become extreme outliers after standardization. Consider a median fill.
ratio_cols = ['Minutes_per_call', 'Charge_per_minute']
for col in ratio_cols:
    df[col] = df[col].fillna(0)

# Check the new features
df[ratio_cols].describe()

Unnamed: 0,Minutes_per_call,Charge_per_minute
count,3333.0,3333.0
mean,1.871421,0.169901
std,0.732772,0.004164
min,0.0,0.0
25%,1.379592,0.169989
50%,1.790517,0.170004
75%,2.261765,0.170017
max,7.223333,0.170513


In [10]:
# Drop the per-period charge columns: each one is computed directly from the
# matching minutes column, so it carries no additional information.
cols_to_drop = [
    'Total day charge',
    'Total eve charge',
    'Total night charge',
    'Total intl charge',
]
df = df.drop(labels=cols_to_drop, axis='columns')

shape_after_drop = df.shape
print("Dataframe shape after dropping redundant features:", shape_after_drop)

Dataframe shape after dropping redundant features: (3333, 67)


In [11]:
# Standardize every int64/float64 column except the target.
# NOTE(review): the scaler is fitted on the full dataset here. If a train/test
# split happens later, fit on the training portion only to avoid leakage.
numeric_index = df.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = numeric_index.drop('Churn', errors='ignore').tolist()

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

# Check the scaled data (means should be ~0 and standard deviations ~1)
df[numerical_cols].describe()

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total eve minutes,Total eve calls,Total night minutes,Total night calls,Total intl minutes,Total intl calls,Customer service calls,Minutes_per_call,Charge_per_minute
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,1.449652e-16,4.263683e-16,-2.5582100000000002e-17,-6.928485e-17,7.674629e-17,-3.155125e-16,-1.982612e-16,-7.568037000000001e-17,3.309684e-16,8.100997e-17,-5.143067000000001e-17,-3.400287e-16,-6.395524e-18,3.197762e-18,1.684155e-16,6.42537e-15
std,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015,1.00015
min,-2.513172,-0.6888343,-0.3275805,-0.6183963,-0.5917599,-3.301096,-5.005247,-3.963622,-5.025911,-3.513648,-3.42987,-3.667413,-1.820289,-1.188218,-2.554275,-40.80963
25%,-0.6797448,-0.6888343,-0.3275805,-0.6183963,-0.5917599,-0.6624241,-0.6695701,-0.67803,-0.658361,-0.6698545,-0.669934,-0.622369,-0.6011951,-0.427932,-0.67129,0.02109521
50%,-0.001627644,-0.5236033,-0.3275805,-0.6183963,-0.5917599,-0.006887677,0.02812491,0.008276141,-0.00573863,0.006485803,-0.005505089,0.02246393,-0.1948306,-0.427932,-0.1104239,0.02471983
75%,0.651374,1.718817,-0.3275805,1.617086,0.8695542,0.6725198,0.6759846,0.676833,0.6970854,0.6808485,0.6589239,0.6672969,0.6178983,0.3323545,0.5327749,0.02774944
max,3.564766,1.718817,3.052685,1.617086,3.134591,3.140422,3.217588,3.209066,3.508382,3.839081,3.827739,3.497397,6.307001,5.65436,7.304748,0.1469036


In [12]:
# Separate the target column from the feature matrix
y = df['Churn']
X = df.drop(columns=['Churn'])

# Persist the full cleaned frame (features and target) without the row index
df.to_csv('cleaned_telecom_data.csv', index=False)

final_shape = df.shape
print("Data cleaning completed. Dataset saved as 'cleaned_telecom_data.csv'")
print("Final dataset shape:", final_shape)

Data cleaning completed. Dataset saved as 'cleaned_telecom_data.csv'
Final dataset shape: (3333, 67)
