In [1]:
import pandas as pd

# Load the dataset
file_path = 'ypur_path'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Data Cleaning
def preprocess_data(df):
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df.dropna(inplace=True)
    df['SeniorCitizen'] = df['SeniorCitizen'].map({0: 'No', 1: 'Yes'})
    return df

# Prepare features and target
def prepare_features(df):
    X = df.drop(['Churn', 'customerID'], axis=1)
    y = df['Churn'].map({'Yes': 1, 'No': 0})
    
    numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return X, y, preprocessor

# Clean and prepare the data
df_cleaned = preprocess_data(df)
X, y, preprocessor = prepare_features(df_cleaned)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
model.fit(X_train, y_train)

# Predict churn probabilities for non-churned customers
non_churned_customers = df_cleaned[df_cleaned['Churn'] == 'No'].copy()
non_churned_features = non_churned_customers.drop('Churn', axis=1)
churn_probabilities = model.predict_proba(non_churned_features.drop('customerID', axis=1))[:, 1]

# Add churn probability to the dataset
non_churned_customers['Churn_Probability'] = churn_probabilities

# Sort by churn probability in descending order
non_churned_customers_sorted = non_churned_customers.sort_values('Churn_Probability', ascending=False)

# Save the dataset
non_churned_customers_sorted.to_csv('non_churned_customers_churn_probability.csv', index=False)

print("Dataset of non-churned customers with churn probabilities has been created and saved.")
print(f"Total non-churned customers: {len(non_churned_customers_sorted)}")
print("\nTop 5 customers with highest churn risk:")
print(non_churned_customers_sorted[['customerID', 'Churn_Probability']].head())

Dataset of non-churned customers with churn probabilities has been created and saved.
Total non-churned customers: 5163

Top 5 customers with highest churn risk:
      customerID  Churn_Probability
3346  2545-EBUPK               1.00
684   8040-MNRTF               1.00
1081  1751-NCDLI               0.95
3324  5043-TRZWM               0.95
5140  7577-SWIFR               0.95
