In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the demographics and usage tables; both paths are relative to the
# notebook's working directory.
data_d = pd.read_csv('telecom_demographics.csv')
data_u = pd.read_csv('telecom_usage.csv')

# Join the two tables on customer_id. pd.merge defaults to an inner join, so
# only customers present in both files survive.
data_combined = pd.merge(data_d, data_u, on='customer_id')

# Class balance of the target, reported as proportions rather than raw counts.
churn_data = data_combined['churn']
churn_distribution = churn_data.value_counts(normalize=True)
print(churn_distribution)

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the summary.
data_combined.info()

# One-hot encode the categorical columns; the remaining columns pass through
# get_dummies unchanged.
dummies = pd.get_dummies(data_combined, columns=['telecom_partner', 'gender', 'state', 'city', 'registration_event'])

# Features are everything except the row identifier and the label itself.
feature_set = dummies.drop(columns=['customer_id', 'churn'])
target_variable = dummies['churn']

# Split BEFORE scaling. Fitting StandardScaler on the full matrix would let the
# held-out rows contribute to the mean/std used to transform the training
# rows, leaking test-set information into training and biasing the evaluation.
# test_size and random_state are unchanged and the number of rows is the same,
# so the same rows land in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(feature_set, target_variable, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that reads `feature_set_scaled` still finds it; it is
# now scaled with the training-set statistics rather than full-data statistics.
feature_set_scaled = scaler.transform(feature_set)

# Linear baseline: fit() returns the estimator itself, so construction and
# training are chained into one statement.
ridge = RidgeClassifier(random_state=42).fit(X_train, y_train)
ridge_preds = ridge.predict(X_test)

# Held-out evaluation: confusion matrix first, then the per-class report,
# each on its own line exactly as two consecutive print() calls would emit.
print(
    confusion_matrix(y_test, ridge_preds),
    classification_report(y_test, ridge_preds),
    sep="\n",
)

# Tree-ensemble model trained on the same split; fit() returns the estimator,
# so construction and training are chained into one statement.
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
forest_preds = random_forest.predict(X_test)

# Held-out evaluation: confusion matrix first, then the per-class report,
# each on its own line exactly as two consecutive print() calls would emit.
print(
    confusion_matrix(y_test, forest_preds),
    classification_report(y_test, forest_preds),
    sep="\n",
)

# Overall accuracy of each model on the held-out split.
accuracy_ridge = accuracy_score(y_test, ridge_preds)
accuracy_forest = accuracy_score(y_test, forest_preds)

# The forest wins only when strictly more accurate; a tie goes to the ridge model.
best_model = "RandomForest" if accuracy_forest > accuracy_ridge else "RidgeClassifier"

print(f"Superior model: {best_model}")


churn
0    0.799538
1    0.200462
Name: proportion, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         6500 non-null   int64 
 1   telecom_partner     6500 non-null   object
 2   gender              6500 non-null   object
 3   age                 6500 non-null   int64 
 4   state               6500 non-null   object
 5   city                6500 non-null   object
 6   pincode             6500 non-null   int64 
 7   registration_event  6500 non-null   object
 8   num_dependents      6500 non-null   int64 
 9   estimated_salary    6500 non-null   int64 
 10  calls_made          6500 non-null   int64 
 11  sms_sent            6500 non-null   int64 
 12  data_used           6500 non-null   int64 
 13  churn               6500 non-null   int64 
dtypes: int64(9), object(5)
memory usage: 711.1+ KB
None
[