In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the customer-churn CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Preview the top five rows
df.head(n=5)

In [None]:
# Report the (rows, columns) dimensions of the frame
print("Shape:", df.shape)

# Summarise each column's dtype and non-null count
df.info()

# Per-column tally of missing entries (isna is the same check as isnull)
df.isna().sum()

In [5]:
# Parse TotalCharges as numbers; anything that cannot be parsed
# (e.g. blank strings) is coerced to NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Remove, in place, every row whose TotalCharges ended up as NaN
df.dropna(axis=0, subset=['TotalCharges'], inplace=True)

In [None]:
# Bar chart with one bar per distinct Churn value
churn_ax = sns.countplot(x='Churn', data=df)
churn_ax.set_title("Churn Count")
plt.show()

# Share of rows per Churn value, expressed as fractions summing to 1
df['Churn'].value_counts(normalize=True)

In [None]:
# Row counts per Contract value, with separate bars for each Churn value
contract_ax = sns.countplot(x='Contract', hue='Churn', data=df)
contract_ax.set_title("Churn by Contract Type")
plt.xticks(rotation=45)  # tilt the category labels on the x axis
plt.show()

In [None]:
# Box plot of MonthlyCharges, one box per Churn value, on an 8x5 inch figure
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Churn', y='MonthlyCharges', data=df, ax=ax)
ax.set_title("Monthly Charges by Churn")
plt.show()

In [9]:
# Remove the customerID column in place
# (presumably a per-customer identifier with no predictive value - confirm).
df.drop(columns=['customerID'], inplace=True)


In [10]:
from sklearn.preprocessing import LabelEncoder

# Columns whose 'Yes'/'No' strings are re-coded as 1/0
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
yes_no_codes = {'Yes': 1, 'No': 0}

for column_name in binary_cols:
    # Values outside the mapping become NaN
    df[column_name] = df[column_name].map(yes_no_codes)

# gender is re-coded the same way: Male -> 1, Female -> 0
gender_codes = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].map(gender_codes)

# One-hot encode the remaining categorical columns, dropping the first
# level of each so the indicator columns are not redundant
df = pd.get_dummies(data=df, drop_first=True)


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[num_cols] = scaler.fit_transform(df[num_cols])


In [None]:
from sklearn.model_selection import train_test_split

# Features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check shapes
print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_train:", y_train.shape)
print("y_test :", y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier allowed up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)

# Learn the coefficients from the training split
model.fit(X=X_train, y=y_train)

In [14]:
# Class predictions of the fitted model for the held-out test features
y_pred = model.predict(X=X_test)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fraction of test labels that were predicted correctly
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", test_accuracy)

# Counts of true classes (rows) against predicted classes (columns);
# kept in `cm` for later use
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

# Text summary of per-class metrics
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report_text)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the confusion matrix `cm`, with integer cell labels
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report

# Candidate classifiers keyed by display name. random_state pins the
# estimators that draw random numbers during fitting, so repeated runs of
# this cell report the same scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Test-set accuracy per model name
accuracies = {}

# Evaluate each model. The loop uses `clf` / `clf_pred` rather than `model` /
# `y_pred` so it does not overwrite the notebook-level fitted model and its
# predictions; re-running an earlier cell would otherwise silently pick up
# whichever classifier the loop finished on.
for name, clf in models.items():
    clf.fit(X_train, y_train)                      # Train on the training split
    clf_pred = clf.predict(X_test)                 # Predict on the test split
    acc = accuracy_score(y_test, clf_pred)         # Fraction predicted correctly
    accuracies[name] = acc
    print(f"\n{name}")
    print("-" * len(name))
    print("Accuracy:", acc)
    print("Classification Report:\n", classification_report(y_test, clf_pred))


In [None]:
import matplotlib.pyplot as plt

# One bar per entry of `accuracies` (model name -> accuracy)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(accuracies.keys(), accuracies.values(), color='skyblue')
ax.set_title('Model Accuracy Comparison')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)  # accuracy lies in [0, 1]
plt.xticks(rotation=30)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score

# Seed the forest so the fold scores are the same on every run; an unseeded
# forest draws different random numbers each time it is fitted.
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, X, y, cv=5)  # 5-fold cross-validation

print("Cross-validation scores:", scores)
print("Average accuracy:", scores.mean())


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try: 3 * 4 * 3 = 36 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Grid search with 5-fold cross-validation on the training split.
# - random_state fixes the forest's randomness, so the chosen parameters and
#   scores are reproducible and the combinations are compared on equal terms.
# - n_jobs=-1 runs the 36 * 5 fits in parallel on all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been retrained on all of X_train with the winning parameters.
# Fitting it again would only repeat that work and, when the forest has no
# fixed random_state, replace the selected model with a different one.
best_model = grid_search.best_estimator_

# Evaluate on test data
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
