In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np

In [16]:
df=pd.read_csv("data_set_telecom.csv")

In [17]:
df

Unnamed: 0,S.No,Account ID,Avg_bill_payment,Cust_age,product_count,exchange,Curr_Data_rate,Conn_catg,cust_catg,no_SRs_in_3Yrs,...,Data_services,Avg_calls_2018,Avg_calls_2019,Avg_calls_2020,Avg_calls_in_3Yrs,avg_usage_2018,avg_usage_2019,avg_usage_2020,Avg_usage_in_3Yrs,churn
0,20512,1.000000e+11,2429,4.7,2,04KHTDAK,6 Mbps,Residential,Gold,2,...,partial,2,0,0,1,125,179,4321,1542,NO
1,32801,2.107212e+09,4390,4.1,2,04BNUMSH,6 Mbps,Forces,Gold,1,...,No,9381,0,0,3127,49,0,0,16,YES
2,32853,2.106854e+09,2623,4.9,2,04BNUMSH,4 Mbps,Forces,Gold,7,...,No,1236,5,0,414,61,0,0,20,YES
3,43286,2.106700e+09,1627,6.0,2,04BNUMSH,8 Mbps,Forces,Gold,2,...,No,963,305,1,423,65,208,802,358,YES
4,43364,2.106447e+09,3426,4.0,2,04BNUMSH,8 Mbps,Forces,Gold,1,...,No,4486,553,0,1680,0,109,0,36,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45126,27952,1.000000e+11,22610,1.4,2,04BNUMSH,15 Mbps,Govt_institutes,Platinum,3,...,No,0,0,3861,1287,0,0,894,298,NO
45127,19346,1.000000e+11,4138,5.4,2,04BNUBNU,6 Mbps,Residential,Platinum,2,...,Yes,1,0,0,0,0,0,1964,655,NO
45128,44461,1.000000e+11,2125,3.3,1,04DIKW0,0,Residential,silver,1,...,No,1803,240,68,704,0,0,0,0,YES
45129,40705,1.000000e+11,0,0.3,2,04DIKPHP,8 Mbps,Residential,silver,1,...,Yes,0,0,0,0,0,0,248,83,YES


In [18]:
df.head()

Unnamed: 0,S.No,Account ID,Avg_bill_payment,Cust_age,product_count,exchange,Curr_Data_rate,Conn_catg,cust_catg,no_SRs_in_3Yrs,...,Data_services,Avg_calls_2018,Avg_calls_2019,Avg_calls_2020,Avg_calls_in_3Yrs,avg_usage_2018,avg_usage_2019,avg_usage_2020,Avg_usage_in_3Yrs,churn
0,20512,100000000000.0,2429,4.7,2,04KHTDAK,6 Mbps,Residential,Gold,2,...,partial,2,0,0,1,125,179,4321,1542,NO
1,32801,2107212000.0,4390,4.1,2,04BNUMSH,6 Mbps,Forces,Gold,1,...,No,9381,0,0,3127,49,0,0,16,YES
2,32853,2106854000.0,2623,4.9,2,04BNUMSH,4 Mbps,Forces,Gold,7,...,No,1236,5,0,414,61,0,0,20,YES
3,43286,2106700000.0,1627,6.0,2,04BNUMSH,8 Mbps,Forces,Gold,2,...,No,963,305,1,423,65,208,802,358,YES
4,43364,2106447000.0,3426,4.0,2,04BNUMSH,8 Mbps,Forces,Gold,1,...,No,4486,553,0,1680,0,109,0,36,YES


In [19]:
 #Preprocessing
numeric_features = ['Avg_bill_payment', 'Cust_age', 'product_count', 'exchange', 'Curr_Data_rate', 'no_SRs_in_3Yrs', 
                    'Avg_calls_2018', 'Avg_calls_2019', 'Avg_calls_2020', 'Avg_usage_in_3Yrs']
categorical_features = ['Conn_catg', 'cust_catg', 'Data_services']

In [20]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [23]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [26]:

# Split data
X =df.drop('churn', axis=1)
y = df['churn'].apply(lambda x: 1 if x == 'YES' else 0)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])


In [33]:

# Define preprocessing steps for numeric and categorical data
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [34]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [38]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [39]:
# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

In [40]:
# Append classifier to preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

In [41]:
# Fit model
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['S.No', 'Account ID', 'Avg_bill_payment', 'Cust_age', 'product_count',
       'no_SRs_in_3Yrs', 'rep_SRs_in_3Yrs', 'MTTR', 'Denial in_3Yrs',
       'Avg_calls_2018', 'Avg_calls_2019', 'Avg_calls_2020',
       'Avg_ca...avg_usage_2019',
       'avg_usage_2020', 'Avg_usage_in_3Yrs'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                  

In [42]:
# Predictions
y_pred = pipeline.predict(X_test)

In [43]:

# Evaluation
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9853772017281489
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      5933
           1       0.96      1.00      0.98      3094

    accuracy                           0.99      9027
   macro avg       0.98      0.99      0.98      9027
weighted avg       0.99      0.99      0.99      9027



In [45]:
from sklearn.metrics import classification_report, confusion_matrix

# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      5933
           1       0.96      1.00      0.98      3094

    accuracy                           0.99      9027
   macro avg       0.98      0.99      0.98      9027
weighted avg       0.99      0.99      0.99      9027

Confusion Matrix:
[[5816  117]
 [  15 3079]]


In [46]:
from sklearn.model_selection import cross_val_score

# Example of 5-fold cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())


Cross-validation scores: [0.98587453 0.98518211 0.98698241 0.98462817 0.98642659]
Mean CV accuracy: 0.9858187626808697


In [66]:
df.columns

Index(['S.No', 'Account ID', 'Avg_bill_payment', 'Cust_age', 'product_count',
       'exchange', 'Curr_Data_rate', 'Conn_catg', 'cust_catg',
       'no_SRs_in_3Yrs', 'rep_SRs_in_3Yrs', 'MTTR', 'Denial in_3Yrs',
       'Data_services', 'Avg_calls_2018', 'Avg_calls_2019', 'Avg_calls_2020',
       'Avg_calls_in_3Yrs', 'avg_usage_2018', 'avg_usage_2019',
       'avg_usage_2020', 'Avg_usage_in_3Yrs', 'churn'],
      dtype='object')

In [86]:
df.columns


Index(['S.No', 'Account ID', 'Avg_bill_payment', 'Cust_age', 'product_count',
       'exchange', 'Curr_Data_rate', 'Conn_catg', 'cust_catg',
       'no_SRs_in_3Yrs', 'rep_SRs_in_3Yrs', 'MTTR', 'Denial in_3Yrs',
       'Data_services', 'Avg_calls_2018', 'Avg_calls_2019', 'Avg_calls_2020',
       'Avg_calls_in_3Yrs', 'avg_usage_2018', 'avg_usage_2019',
       'avg_usage_2020', 'Avg_usage_in_3Yrs', 'churn'],
      dtype='object')

In [98]:
# Example user input (replace with actual user input)
user_input = {
    'Avg_bill_payment': 200.0,
    'Cust_age': 30,
    'product_count': 1,
    'exchange': 'B',
    'Curr_Data_rate': 150,
    'Conn_catg': 'Category2',
    'cust_catg': 'CategoryB',
    'no_SRs_in_3Yrs': 0,
    'rep_SRs_in_3Yrs': 2,
    'MTTR': 3,
    'Denial in_3Yrs': 0,
    'Data_services': 'ServiceY',
    'Avg_calls_2018': 15,
    'Avg_calls_2019': 20,
    'Avg_calls_2020': 25,
    'Avg_calls_in_3Yrs': 20,
    'avg_usage_2018': 250,
    'avg_usage_2019': 300,
    'avg_usage_2020': 350,
    'Avg_usage_in_3Yrs': 300,
    'S.No': 1,  # Example, replace with actual values
    'Account ID': 'A12345'  # Example, replace with actual values
}

# Convert user input to DataFrame
import pandas as pd
user_df = pd.DataFrame([user_input])

# Ensure columns are in the same order and names as used during training
# Adjust if necessary to match `numeric_features` and `categorical_features`
user_df = user_df[['S.No', 'Account ID', 'Avg_bill_payment', 'Cust_age', 'product_count', 
                   'exchange', 'Curr_Data_rate', 'Conn_catg', 'cust_catg', 'no_SRs_in_3Yrs',
                   'rep_SRs_in_3Yrs', 'MTTR', 'Denial in_3Yrs', 'Data_services', 
                   'Avg_calls_2018', 'Avg_calls_2019', 'Avg_calls_2020', 'Avg_calls_in_3Yrs',
                   'avg_usage_2018', 'avg_usage_2019', 'avg_usage_2020', 'Avg_usage_in_3Yrs']]

# Make predictions
try:
    # Ensure pipeline is fitted before making predictions
    pipeline.fit(X_train, y_train)
    
    # Access the ColumnTransformer within pipeline
    preprocessor = pipeline.named_steps['preprocessor']
    
    # Make predictions using pipeline
    user_pred = pipeline.predict(user_df)
    
    print(f"Predicted churn label for user input: {user_pred[0]}")
except Exception as e:
    error = str(e)
    print(f"Error occurred: {error}")


Error occurred: Cannot use median strategy with non-numeric data:
could not convert string to float: '6 Mbps'


In [100]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Example dataset (replace with your actual dataset)
data = {
    'Avg_bill_payment': [200.0, 300.0, '400.0'],
    'Cust_age': [30, 25, 35],
    'product_count': [1, 2, 3],
    'exchange': ['A', 'B', 'C'],
    'Curr_Data_rate': ['6 Mbps', '8 Mbps', '10 Mbps'],
    'Conn_catg': ['Category1', 'Category2', 'Category3'],
    'cust_catg': ['CategoryA', 'CategoryB', 'CategoryC'],
    'no_SRs_in_3Yrs': [0, 1, 2],
    'rep_SRs_in_3Yrs': [1, 2, 3],
    'MTTR': [2, 3, 4],
    'Denial in_3Yrs': [0, 0, 1],
    'Data_services': ['ServiceX', 'ServiceY', 'ServiceZ'],
    'Avg_calls_2018': [20, 25, 30],
    'Avg_calls_2019': [22, 27, 32],
    'Avg_calls_2020': [24, 29, 34],
    'Avg_calls_in_3Yrs': [22, 26, 30],
    'avg_usage_2018': [250, 300, 350],
    'avg_usage_2019': [260, 310, 360],
    'avg_usage_2020': [270, 320, 370],
    'Avg_usage_in_3Yrs': [260, 310, 360],
    'churn': [0, 1, 0]
}

# Convert data to DataFrame
df = pd.DataFrame(data)

# Separate numeric and categorical features
numeric_features = ['Avg_bill_payment', 'Cust_age', 'product_count',
                    'no_SRs_in_3Yrs', 'rep_SRs_in_3Yrs', 'MTTR', 'Denial in_3Yrs',
                    'Avg_calls_2018', 'Avg_calls_2019', 'Avg_calls_2020', 'Avg_calls_in_3Yrs',
                    'avg_usage_2018', 'avg_usage_2019', 'avg_usage_2020', 'Avg_usage_in_3Yrs']

categorical_features = ['exchange', 'Conn_catg', 'cust_catg', 'Data_services']

# Clean numeric columns
for col in numeric_features:
    # Convert to numeric if possible, coerce errors to NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Define preprocessing steps for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Use median to impute missing values
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute with most frequent value for categorical
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Append classifier to preprocessing pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Example: Training the pipeline
X_train = df.drop('churn', axis=1)
y_train = df['churn']

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Example user input (replace with actual user input)
user_input = {
    'Avg_bill_payment': 400.0,
    'Cust_age': 28,
    'product_count': 2,
    'exchange': 'B',
    'Curr_Data_rate': '8 Mbps',
    'Conn_catg': 'Category2',
    'cust_catg': 'CategoryB',
    'no_SRs_in_3Yrs': 1,
    'rep_SRs_in_3Yrs': 2,
    'MTTR': 3,
    'Denial in_3Yrs': 0,
    'Data_services': 'ServiceY',
    'Avg_calls_2018': 23,
    'Avg_calls_2019': 28,
    'Avg_calls_2020': 33,
    'Avg_calls_in_3Yrs': 27,
    'avg_usage_2018': 280,
    'avg_usage_2019': 330,
    'avg_usage_2020': 380,
    'Avg_usage_in_3Yrs': 330,
}

# Convert user input to DataFrame
import pandas as pd
user_df = pd.DataFrame([user_input])

# Clean numeric columns in user input
for col in numeric_features:
    # Convert to numeric if possible, coerce errors to NaN
    user_df[col] = pd.to_numeric(user_df[col], errors='coerce')

# Ensure columns are in the same order and names as used during training
user_df = user_df[X_train.columns]

# Make predictions
try:
    # Predict probabilities for churn (label 1) and not churn (label 0)
    user_pred_proba = pipeline.predict_proba(user_df)
    
    # Determine predicted label based on threshold (e.g., 0.5 for binary classification)
    predicted_label = 1 if user_pred_proba[0][1] >= 0.5 else 0
    
    # Print predictions
    if predicted_label == 1:
        print("Predicted: Churn")
    else:
        print("Predicted: Not Churn")
except Exception as e:
    error = str(e)
    print(f"Error occurred: {error}")


Predicted: Churn


In [101]:
# Example user input (replace with actual user input)
user_input = {
    'Avg_bill_payment': 400.0,
    'Cust_age': 28,
    'product_count': 2,
    'exchange': 'B',
    'Curr_Data_rate': '8 Mbps',
    'Conn_catg': 'Category2',
    'cust_catg': 'CategoryB',
    'no_SRs_in_3Yrs': 1,
    'rep_SRs_in_3Yrs': 2,
    'MTTR': 3,
    'Denial in_3Yrs': 0,
    'Data_services': 'ServiceY',
    'Avg_calls_2018': 23,
    'Avg_calls_2019': 28,
    'Avg_calls_2020': 33,
    'Avg_calls_in_3Yrs': 27,
    'avg_usage_2018': 280,
    'avg_usage_2019': 330,
    'avg_usage_2020': 380,
    'Avg_usage_in_3Yrs': 330,
}

# Convert user input to DataFrame
import pandas as pd
user_df = pd.DataFrame([user_input])

# Clean numeric columns in user input
for col in numeric_features:
    # Convert to numeric if possible, coerce errors to NaN
    user_df[col] = pd.to_numeric(user_df[col], errors='coerce')

# Ensure columns are in the same order and names as used during training
user_df = user_df[X_train.columns]

# Make predictions
try:
    # Predict probabilities for churn (label 1) and not churn (label 0)
    user_pred_proba = pipeline.predict_proba(user_df)
    
    # Determine predicted label based on threshold (e.g., 0.5 for binary classification)
    predicted_label = 1 if user_pred_proba[0][1] >= 0.5 else 0
    
    # Print predictions
    if predicted_label == 1:
        print("Predicted: Churn")
    else:
        print("Predicted: Not Churn")
    
    # Print probabilities for both churn and not churn
    prob_churn = user_pred_proba[0][1]
    prob_not_churn = user_pred_proba[0][0]
    
    print(f"Probability of Churn: {prob_churn:.2f}")
    print(f"Probability of Not Churn: {prob_not_churn:.2f}")
    
except Exception as e:
    error = str(e)
    print(f"Error occurred: {error}")


Predicted: Churn
Probability of Churn: 0.56
Probability of Not Churn: 0.44
