<h1 style="text-align: center;">White Swan Project</h1>

<h2 style="text-align: center;"> By: Ashka, Elga, Liam and Yassim </h2>

In [39]:
# Importing necessary libraries
import pandas as pd  # data manipulation
import matplotlib.pyplot as plt  # plotting graphs

# Sklearn models
from sklearn.model_selection import train_test_split  #splitting data into training and testing sets
from sklearn.linear_model import LogisticRegression  # logistic regression model

# For model evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc  


In [41]:
# Load the data
df = pd.read_csv('Swan_Project_Data.csv')

## Feature Engineering

In [44]:
# Define a function for feature engineering
def feature_eng(dataframe):
    # Make a copy of the dataframe to avoid changing the original data
    df = dataframe.copy()
    
    ##### Fill missing values #######
    # List of columns with 'Yes' and 'No' values
    yn_cols = ['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Online Security',
               'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies',
               'Paperless Billing']
    
    # Fill missing values in yn_cols with 'No'
    df[yn_cols] = df[yn_cols].fillna('No')
    
    # Map 'Yes' to 1 and 'No' to 0 and 0 for 'No phone service' and 'No internet service'
    for col in yn_cols:
        df[col] = df[col].map({'No': 0, 'Yes': 1, 'No phone service': 0, 'No internet service': 0}).astype(int)
    
    # Fill missing values for Gender and map 'Male' to 0 and 'Female' to 1
    df['Gender'] = df['Gender'].fillna('Male')
    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
    
    # Convert 'Total Charges' and 'Monthly Charges' to numeric, coercing errors to NaN
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
    df['Monthly Charges'] = pd.to_numeric(df['Monthly Charges'], errors='coerce')
    
    # Fill missing values for numerical columns with 0
    df['Total Charges'] = df['Total Charges'].fillna(0)
    df['Monthly Charges'] = df['Monthly Charges'].fillna(0)
    
    ###### Drop Columns #########
    # Drop columns that are not needed for the analysis
    df.drop(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
             'Lat Long', 'Latitude', 'Longitude', 'Churn Label'], axis=1, inplace=True)
    
    ###### One-Hot Encoding ###############
    # One-hot encode categorical columns and add them to the dataframe
    df = pd.get_dummies(data=df, columns=['Internet Service'], prefix='IS', drop_first=False, dtype=int)
    df = pd.get_dummies(data=df, columns=['Contract'], prefix='Contract', drop_first=False, dtype=int)
    df = pd.get_dummies(data=df, columns=['Payment Method'], prefix='PM', drop_first=False, dtype=int)
    
    # Fill missing values for 'Churn Reason' and one-hot encode it
    df['Churn Reason'] = df['Churn Reason'].fillna('None')
    df = pd.get_dummies(data=df, columns=['Churn Reason'], prefix='Reason', drop_first=False, dtype=int)
    
    ##### Handle hidden nulls in disguise #########
    # Ensure 'Total Charges' column is numeric and fill nulls with zeros
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
    df['Total Charges'] = df['Total Charges'].fillna(value=0)
    
    return df

# Apply feature engineering function to the dataframe
df = feature_eng(df)


## Prepare Features and Target

In [47]:
# Define features (X) and target (y)
X = df.drop('Churn Value', axis=1)  # Drop the target column from the features
y = df['Churn Value']  # Target column

# Fill any remaining NaNs in x with 0
X = X.fillna(0)

# Fill any remaining NaNs in y with the most frequent value (mode)
y = y.fillna(y.mode()[0])


## Split the data

In [50]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Train the Logistic Regression Model

In [95]:
# from sklearn.model_selection import cross_val_score

# # Initialize the model
# model = LogisticRegression(max_iter=1000)

# # Perform cross-validation
# cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# # Print the cross-validation scores
# print(f"Cross-Validation Scores: {cv_scores}")
# print(f"Mean Cross-Validation Score: {cv_scores.mean()}")
# print(f"Standard Deviation of Cross-Validation Scores: {cv_scores.std()}")


In [97]:
# Train the logistic regression model
model = LogisticRegression(max_iter=1000)  # Initialise the model with a max of 1000 iterations
model.fit(X_train, y_train)  # Train the model on the training data


## Make Predictions and Evaluate the Model

In [100]:
# Make predictions on the test set
y_pred = model.predict(X_test)  # Predicted class labels
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Predicted probabilities for the positive class

# Evaluate the model
print(classification_report(y_test, y_pred))  # Print classification report
print(confusion_matrix(y_test, y_pred))  # Print confusion matrix
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')  # Print accuracy score


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1526
         1.0       1.00      1.00      1.00       588

    accuracy                           1.00      2114
   macro avg       1.00      1.00      1.00      2114
weighted avg       1.00      1.00      1.00      2114

[[1526    0]
 [   0  588]]
Accuracy: 1.0


### Feature Importance
Print the feature importance to understand which features are most influential.

In [103]:
# Feature importance
feature_importance = pd.Series(model.coef_[0], index=X.columns).sort_values(ascending=False)
print(feature_importance)

Reason_Attitude of support person                   0.947440
Reason_Competitor offered higher download speeds    0.924234
Contract_Month-to-month                             0.876225
Reason_Competitor offered more data                 0.856050
Reason_Don't know                                   0.758687
Reason_Competitor made better offer                 0.753981
Reason_Attitude of service provider                 0.727042
Reason_Product dissatisfaction                      0.718239
Reason_Competitor had better devices                0.684199
IS_No                                               0.610031
Reason_Network reliability                          0.604245
Reason_Price too high                               0.557993
Reason_Service dissatisfaction                      0.517245
Reason_Lack of self-service on Website              0.463352
IS_DSL                                              0.420595
PM_Electronic check                                 0.387878
Reason_Lack of affordabl

### Predict Churn Probabilities for the Entire Dataset
Predict churn probabilities for the entire dataset and select customers at risk.
Identify the top 500 customers who are at risk of churning

In [106]:
# Predict churn probabilities for the entire dataset
df['Churn Probability'] = model.predict_proba(X)[:, 1]  # Add churn probabilities to the dataframe

In [108]:
# Select top 500 customers at risk of churning
at_risk_customers = df.sort_values(by='Churn Probability', 
                                   ascending=False).head(500)# Sort by churn probability and select top 500

In [110]:
at_risk_customers.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Online Security,Online Backup,Device Protection,...,Reason_Long distance charges,Reason_Moved,Reason_Network reliability,Reason_None,Reason_Poor expertise of online support,Reason_Poor expertise of phone support,Reason_Price too high,Reason_Product dissatisfaction,Reason_Service dissatisfaction,Churn Probability
1382,1,0,1,0,3.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.998257
1044,1,0,1,0,6.0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0.998206
584,1,1,1,0,1.0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.998204
449,0,0,0,0,1.0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.998086
1681,0,1,1,0,7.0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.998077


In [118]:
# Target the customers most at risk of churning with a marketing campaign
# Select 20% of those at risk of churning
mailing_list = at_risk_customers.sample(frac=0.2, 
                                        random_state=42)  # Randomly select 20% from the top 500

In [120]:
mailing_list.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Online Security,Online Backup,Device Protection,...,Reason_Long distance charges,Reason_Moved,Reason_Network reliability,Reason_None,Reason_Poor expertise of online support,Reason_Poor expertise of phone support,Reason_Price too high,Reason_Product dissatisfaction,Reason_Service dissatisfaction,Churn Probability
1806,0,0,0,0,7.0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0.996458
89,1,0,0,0,2.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.997453
1745,1,0,1,0,13.0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0.996418
419,0,1,0,0,1.0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.997046
637,1,1,0,0,14.0,1,1,0,1,1,...,0,0,0,0,0,0,0,1,0,0.997264


In [121]:
print(mailing_list)  # Print the mailing list

      Gender  Senior Citizen  Partner  Dependents  Tenure Months  \
1806       0               0        0           0            7.0   
89         1               0        0           0            2.0   
1745       1               0        1           0           13.0   
419        0               1        0           0            1.0   
637        1               1        0           0           14.0   
...      ...             ...      ...         ...            ...   
841        1               0        0           0            7.0   
625        1               0        0           0            5.0   
996        0               1        1           0           10.0   
1181       1               0        0           0            1.0   
1183       1               1        1           0           11.0   

      Phone Service  Multiple Lines  Online Security  Online Backup  \
1806              1               1                0              0   
89                1               0      