In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('TelcoCustomerChurn.csv')

# Display the first few rows to get an overview of the data
data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# Get summary statistics for numerical features
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [3]:
# Get counts for categorical features
data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [4]:
# Handle missing values if any (if needed)
data.dropna(inplace=True)

# Encode categorical variables using one-hot encoding
data_encoded = pd.get_dummies(data)

# Split data into features (X) and target (y)
X = data_encoded.drop('Churn_Yes', axis=1)  # Churn_Yes is the encoded target
y = data_encoded['Churn_Yes']


In [5]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the training features using the scaler
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test features using the same scaler
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression Model on the standardized training data
model = LogisticRegression(max_iter=1000)  # Adjust max_iter if needed
model.fit(X_train_scaled, y_train)


LogisticRegression(max_iter=1000)

In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the testing set
y_pred = model.predict(X_test_scaled)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.94

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1036
           1       0.99      0.80      0.88       373

    accuracy                           0.94      1409
   macro avg       0.96      0.90      0.92      1409
weighted avg       0.95      0.94      0.94      1409


Confusion Matrix:
[[1032    4]
 [  74  299]]


In [8]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'newChurn_trained_model.pkl')


['newChurn_trained_model.pkl']

In [10]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_data(data):
    # Create a copy of the input DataFrame to avoid modifying the original data
    processed_data = data.copy()

    # Identify categorical and numerical columns
    categorical_columns = processed_data.select_dtypes(include=['object']).columns.tolist()
    numerical_columns = processed_data.select_dtypes(exclude=['object']).columns.tolist()

    # Encode categorical variables using label encoding
    label_encoders = {}
    for column in categorical_columns:
        label_encoders[column] = LabelEncoder()
        processed_data[column] = label_encoders[column].fit_transform(processed_data[column])

    # Scale numerical features using StandardScaler
    scaler = StandardScaler()
    processed_data[numerical_columns] = scaler.fit_transform(processed_data[numerical_columns])

    return processed_data


In [11]:
import pandas as pd

# Read the new data from a CSV file
new_data_df = pd.read_csv('testData.csv')  # Replace 'new_data.csv' with your new data file

# Preprocess the new data using the same preprocessing functions
new_data_processed = preprocess_data(new_data_df)


In [12]:
new_data_processed

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,2,0,0.0,1,0,-1.005302,0,1,0,0,1,0,0,0,0,0,1,1,-1.488389,-1.042907
1,1,1,0.0,0,0,0.695978,1,0,0,1,0,1,0,0,0,1,0,2,1.050421,1.026465
2,0,1,0.0,0,0,-0.953748,1,0,0,1,1,0,0,0,0,0,1,2,0.760004,-0.955776
3,3,1,0.0,0,0,1.263071,0,1,0,1,0,1,1,0,0,1,0,0,-0.322035,0.972217


## Load the trained model

In [14]:
import joblib

# Load the trained model
model = joblib.load('newChurn_trained_model.pkl')  # Replace with your model file path


In [15]:
model

LogisticRegression(max_iter=1000)

In [16]:
# Use the loaded model to make predictions on the preprocessed new data
predictions = model.predict(new_data_processed)

# The 'predictions' variable now contains the predicted labels (e.g., Churn_Yes or Churn_No) for the new data.




ValueError: X has 20 features, but LogisticRegression is expecting 13619 features as input.