<a href="https://colab.research.google.com/github/jahannusrat8052/Learning-Data-Analysis---Telco-Customer-Churn-Focused-customer-retention-programs/blob/main/churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the Telco customer-churn dataset from the working directory.
df = pd.read_csv("Telco-Customer-Churn.csv")

# Quick look: first rows, column dtypes / non-null counts, and class balance.
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df["Churn"].value_counts())


In [None]:
# The customer identifier is not a feature; one-hot encoding it would add a
# dummy column for every distinct ID.
df = df.drop(columns=["customerID"], errors="ignore")

# TotalCharges is numeric but is read as object (text). Coerce it to numbers so
# that unparseable entries become NaN and are removed by dropna() below,
# instead of every distinct text value being one-hot encoded.
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Handle missing values
df = df.dropna()

# Convert the remaining categorical columns to numeric indicator columns
df = pd.get_dummies(df, drop_first=True)


In [None]:
from sklearn.model_selection import train_test_split

X = df.drop("Churn_Yes", axis=1)   # Features
y = df["Churn_Yes"]                # Target

# Hold out 20% of the rows for evaluation with a fixed seed. Stratifying on the
# target keeps the churn / no-churn ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the raw data so the categorical columns can be inspected before any
# one-hot encoding is applied.
df_original = pd.read_csv("Telco-Customer-Churn.csv")

# Object-typed columns that should not get a count plot: the row identifier,
# TotalCharges (numeric but read as object initially) and the target itself,
# which is already used as the hue. Filtering with a set (rather than
# list.remove) does not raise if one of these is absent from the selection.
excluded_cols = {'customerID', 'TotalCharges', 'Churn'}
categorical_cols = [
    col
    for col in df_original.select_dtypes(include='object').columns
    if col not in excluded_cols
]

# Plot count plots for each categorical feature, split by churn outcome
for col in categorical_cols:
    plt.figure(figsize=(8, 5))
    sns.countplot(data=df_original, x=col, hue='Churn')
    plt.title(f'Distribution of {col} and Churn')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## Visualize the distribution of numerical features

### Subtask:
Plot histograms for numerical features to understand their distributions.


**Reasoning**:
Plot histograms for the numerical features to visualize their distributions.



In [None]:
# Identify numerical columns in the original DataFrame
numerical_cols = df_original.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Convert 'TotalCharges' in place so it is numeric for this cell and for any
# cell that runs afterwards. With errors='coerce', values that cannot be parsed
# become NaN instead of raising, so no exception handling is required.
df_original['TotalCharges'] = pd.to_numeric(df_original['TotalCharges'], errors='coerce')

# Guard against a duplicate entry: when this cell is re-run the column is
# already numeric and has been picked up by select_dtypes above.
if 'TotalCharges' not in numerical_cols:
    numerical_cols.append('TotalCharges')

# Plot histograms (with a KDE overlay) for numerical features
for col in numerical_cols:
    plt.figure(figsize=(8, 5))
    sns.histplot(data=df_original, x=col, kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()


## Visualize the relationship between features and churn

### Subtask:
Create visualizations to explore how different features relate to customer churn.


**Reasoning**:
Create box plots, violin plots, a correlation heatmap, and stacked bar plots to visualize the relationship between different features and churn, as per the instructions.



In [None]:
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 1. and 2. For each numerical feature, draw its distribution per Churn class:
# first all box plots, then all violin plots.
for draw_plot, plot_label in ((sns.boxplot, 'Box plot'), (sns.violinplot, 'Violin plot')):
    for feature in numerical_features:
        plt.figure(figsize=(8, 5))
        draw_plot(data=df_original, x='Churn', y=feature)
        plt.title(f'{plot_label} of {feature} vs. Churn')
        plt.xlabel('Churn')
        plt.ylabel(feature)
        plt.show()

# 3. Correlation heatmap of the numerical features together with the target.
# The target is encoded as an integer column first: 1 where Churn is 'Yes',
# 0 for anything else.
df_original['Churn_numeric'] = df_original['Churn'].eq('Yes').astype('int64')
heatmap_cols = [*numerical_features, 'Churn_numeric']
correlation_matrix = df_original[heatmap_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features and Churn')
plt.show()

# 4. Stacked bar plots for a few categorical features: within each category,
# the share of rows (in percent) falling into each Churn class.
categorical_features_for_stacked = ['Contract', 'InternetService', 'PaymentMethod']

for feature in categorical_features_for_stacked:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Row-normalised cross-tabulation, scaled from fractions to percentages
    ct = pd.crosstab(
        index=df_original[feature],
        columns=df_original['Churn'],
        normalize='index',
    ).mul(100)
    ct.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
    ax.set_title(f'Churn Rate by {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Percentage')
    plt.xticks(rotation=45, ha='right')
    ax.legend(title='Churn')
    plt.tight_layout()
    plt.show()


## Summary:

### Data Analysis Key Findings

*   Customers with shorter tenure appear more likely to churn.
*   Customers with higher monthly charges seem to have a higher churn rate.
*   Customers with higher total charges tend to have a lower churn rate.
*   Customers with month-to-month contracts have a significantly higher churn rate compared to those with one-year or two-year contracts.
*   Customers using Fiber optic internet service have a higher churn rate than those using DSL or no internet service.
*   Customers using Electronic check payment methods show a higher propensity to churn.
*   There is a weak positive correlation between 'MonthlyCharges' and 'Churn\_numeric' (0.19).
*   There is a moderate negative correlation between 'tenure' and 'Churn\_numeric' (-0.35).
*   There is a weak negative correlation between 'TotalCharges' and 'Churn\_numeric' (-0.20).

### Insights or Next Steps

*   The visualizations highlight that contract type, internet service, and payment method are strong indicators of churn risk. These features should be considered important in churn prediction models.
*   Further investigation into the relationship between numerical features and churn, perhaps through feature engineering or non-linear modeling techniques, could provide deeper insights.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the training partition; the solver
# is allowed up to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true labels.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(test_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its accuracy and feature importances are reproducible
# between runs, matching the fixed random_state=42 used for the split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("RF Accuracy:", rf.score(X_test, y_test))


In [None]:
import matplotlib.pyplot as plt

importances = rf.feature_importances_
features = X.columns

# Rank the features by importance and keep only the strongest ones, so the
# chart stays legible when one-hot encoding has produced many columns.
top_n = 15
ranked_importances = pd.Series(importances, index=features).sort_values()
top_importances = ranked_importances.tail(top_n)

# Ascending order places the most important feature at the top of the chart.
plt.figure(figsize=(8, 6))
plt.barh(top_importances.index, top_importances.values)
plt.title(f"Top {len(top_importances)} Random Forest Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()