In [275]:
!pip install scikit-learn
!pip install pytest




In [276]:
!pip install -r requirements.txt
# Install reportlab in your environment
!pip install reportlab




In [277]:
import os
import joblib
import logging
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pytest





In [278]:

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [279]:
df = pd.read_csv('Processed/MiniProjectS_Model_Evaluation.csv')
df.head(15)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,Churn_Prob,Retention_Segment,Retention_Priority,Risk_Segment
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,0.6656,0.7119,0.2654,0.4601,0.1189,0,0.353677,Medium-Risk (Engage),Medium,Low
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,0.1866,0.2416,0.186,0.275,0.08902,0,0.248558,Low-Risk (Retain),Low,Low
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,0.4245,0.4504,0.243,0.3613,0.08758,0,0.415959,Medium-Risk (Engage),Medium,Medium
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,0.8663,0.6869,0.2575,0.6638,0.173,0,0.159968,Low-Risk (Retain),Low,Low
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,0.205,0.4,0.1625,0.2364,0.07678,0,0.550283,Medium-Risk (Engage),High,Medium
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,0.5249,0.5355,0.1741,0.3985,0.1244,0,0.110945,Low-Risk (Retain),Low,Low
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,0.2576,0.3784,0.1932,0.3063,0.08368,0,0.509897,Medium-Risk (Engage),High,Medium
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,0.3682,0.2678,0.1556,0.3196,0.1151,0,0.17727,Low-Risk (Retain),Low,Low
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,0.5401,0.539,0.206,0.4378,0.1072,0,0.19829,Low-Risk (Retain),Low,Low
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,1.058,1.105,0.221,0.4366,0.2075,0,0.376237,Medium-Risk (Engage),Medium,Low


4.1.1 Data Processing Modules


4.1.2 Model Development Modules


4.1.3 Pipeline Orchestration

In [280]:


# Setup
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
np.random.seed(42)

# Directories
Path("output/charts").mkdir(parents=True, exist_ok=True)
Path("output/artifacts").mkdir(parents=True, exist_ok=True)


CONFIG = {
    "test_size": 0.2,
    "random_state": 42,
    "model_params": {"n_estimators": 200, "max_depth": 8, "random_state": 42},
    "target": "Churn"
}


class DataPipeline:
    def __init__(self, path: str):
        self.path = path
        self.df = None

    def load_data(self):
        logging.info("Loading dataset...")
        self.df = pd.read_csv(self.path)
        return self.df

    def handle_missing_values(self):
        self.df = self.df.replace(" ", np.nan).dropna()
        # Chart: Missing values
        missing_counts = self.df.isnull().sum()
        plt.figure(figsize=(8,4))
        sns.barplot(x=missing_counts.index, y=missing_counts.values)
        plt.xticks(rotation=90)
        plt.title("Missing Values per Feature")
        plt.savefig("output/charts/missing_values.png")
        plt.close()
        return self.df

    def detect_outliers(self, col):
        Q1, Q3 = self.df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        outliers = self.df[(self.df[col] < lower) | (self.df[col] > upper)]
        # Chart: Boxplot
        plt.figure(figsize=(6,4))
        sns.boxplot(x=self.df[col])
        plt.title(f"Outlier Detection: {col}")
        plt.savefig(f"output/charts/outliers_{col}.png")
        plt.close()
        return outliers

    def add_tenure_bins(self):
        bins = [0, 12, 24, 48, 72]
        labels = ["New", "Established", "Loyal", "Very Loyal"]
        self.df["TenureCategory"] = pd.cut(self.df["tenure"], bins=bins, labels=labels, right=False)
        # Chart: Tenure distribution
        plt.figure(figsize=(6,4))
        sns.countplot(x="TenureCategory", data=self.df, palette="Set2")
        plt.title("Tenure Categories")
        plt.savefig("output/charts/tenure_bins.png")
        plt.close()
        return self.df

    def split_data(self):
        X = self.df.drop(columns=[CONFIG["target"], "customerID"])
        y = self.df[CONFIG["target"]].apply(lambda x: 1 if str(x).lower()=="yes" else 0)
        # Chart: Target distribution
        plt.figure(figsize=(5,4))
        sns.countplot(x=y, palette="pastel")
        plt.title("Churn Distribution")
        plt.savefig("output/charts/churn_distribution.png")
        plt.close()
        return train_test_split(X, y, test_size=CONFIG["test_size"], random_state=CONFIG["random_state"], stratify=y)


class ModelPipeline:
    def __init__(self):
        self.pipeline = None

    def build_preprocessor(self, X):
        num = X.select_dtypes(include="number").columns.tolist()
        cat = X.select_dtypes(include="object").columns.tolist()
        num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
        cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))])
        return ColumnTransformer([("num", num_pipe, num), ("cat", cat_pipe, cat)])

    def build_pipeline(self, preprocessor):
        self.pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("model", RandomForestClassifier(**CONFIG["model_params"]))
        ])
        return self.pipeline

    def train(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)
        return self.pipeline

    def evaluate(self, X_test, y_test):
        preds = self.pipeline.predict(X_test)
        print(classification_report(y_test, preds))

        # Confusion Matrix
        cm = confusion_matrix(y_test, preds)
        plt.figure(figsize=(5,4))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No","Yes"], yticklabels=["No","Yes"])
        plt.title("Confusion Matrix")
        plt.savefig("output/charts/confusion_matrix.png")
        plt.close()

        # Feature Importance
        model = self.pipeline.named_steps["model"]
        importances = model.feature_importances_
        features = self.pipeline.named_steps["preprocessor"].get_feature_names_out()
        imp_df = pd.DataFrame({"feature": features, "importance": importances}).sort_values(by="importance", ascending=False).head(15)
        plt.figure(figsize=(10,6))
        sns.barplot(x="importance", y="feature", data=imp_df)
        plt.title("Top Features - Random Forest")
        plt.savefig("output/charts/feature_importance.png")
        plt.close()

    def predict_single(self, sample: dict):
        df = pd.DataFrame([sample])
        pred = self.pipeline.predict(df)[0]
        proba = self.pipeline.predict_proba(df)[0][1]
        # Chart: Prediction probability
        plt.figure(figsize=(4,4))
        sns.barplot(x=["Not Churn","Churn"], y=[1-proba, proba])
        plt.title("Prediction Probability")
        plt.savefig("output/charts/single_prediction.png")
        plt.close()
        return {"prediction": "Churn" if pred==1 else "Not Churn", "probability": proba}


class TrainingPipeline:
    def __init__(self, data_path):
        self.data_pipeline = DataPipeline(data_path)
        self.model_pipeline = ModelPipeline()
        self.pipeline = None

    def run(self):
        df = self.data_pipeline.load_data()
        df = self.data_pipeline.handle_missing_values()
        self.data_pipeline.detect_outliers("MonthlyCharges")
        df = self.data_pipeline.add_tenure_bins()
        X_train, X_test, y_train, y_test = self.data_pipeline.split_data()

        pre = self.model_pipeline.build_preprocessor(X_train)
        self.model_pipeline.build_pipeline(pre)
        self.model_pipeline.train(X_train, y_train)
        self.model_pipeline.evaluate(X_test, y_test)

        # Save model
        joblib.dump(self.model_pipeline.pipeline, "output/artifacts/churn_model.pkl")
        logging.info(" Model saved to output/artifacts/churn_model.pkl")

        # Chart: Churn Rate in Test
        churn_rate = y_test.mean()
        plt.figure(figsize=(5,4))
        sns.barplot(x=["Not Churn","Churn"], y=[1-churn_rate, churn_rate])
        plt.title("Churn Rate - Test Set")
        plt.savefig("output/charts/test_churn_rate.png")
        plt.close()

        self.pipeline = self.model_pipeline.pipeline
        return self.pipeline, (X_test, y_test)

    def streaming_inference(self, samples):
        results = []
        for s in samples:
            res = self.model_pipeline.predict_single(s)
            results.append(res)
            print(res)
        return results


def validate_pipeline(pipeline, X_test, y_test):
    assert hasattr(pipeline, "predict"), "Pipeline missing predict()"
    assert len(X_test) == len(y_test), "Mismatch in test shapes"
    preds = pipeline.predict(X_test)
    assert set(np.unique(preds)).issubset({0,1}), "Unexpected prediction values"
    logging.info(" Pipeline validation passed.")


# ... existing code ...

if __name__ == "__main__":
    data_file = ('Raw/MiniProjectS.csv')  # Changed from Processed/MiniProjectS_Model_Evaluation.csv
    assert os.path.exists(data_file), "CSV file not found!"

    trainer = TrainingPipeline(data_file)
    pipeline, test_data = trainer.run()

    # Streaming inference example
    sample = {
        "gender":"Female","SeniorCitizen":0,"Partner":"Yes","Dependents":"No","tenure":5,
        "PhoneService":"Yes","MultipleLines":"No","InternetService":"Fiber optic",
        "OnlineSecurity":"No","OnlineBackup":"No","DeviceProtection":"No","TechSupport":"No",
        "StreamingTV":"Yes","StreamingMovies":"Yes","Contract":"Month-to-month",
        "PaperlessBilling":"Yes","PaymentMethod":"Electronic check",
        "MonthlyCharges":70.0,"TotalCharges":300.0,"TenureCategory":"New"
    }
    trainer.streaming_inference([sample])


INFO:root:Loading dataset...
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


              precision    recall  f1-score   support

           0       0.73      1.00      0.85      1033
           1       0.00      0.00      0.00       374

    accuracy                           0.73      1407
   macro avg       0.37      0.50      0.42      1407
weighted avg       0.54      0.73      0.62      1407



INFO:root: Model saved to output/artifacts/churn_model.pkl


{'prediction': 'Not Churn', 'probability': np.float64(0.427662040901877)}


# 4.2 Pipeline Implementation Requirements

4.2.1 Code Structure Standards

Object-Oriented Design

In [281]:
import logging

class BasePipeline:
    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

    def log(self, message: str):
        self.logger.info(message)

    def handle_error(self, e: Exception):
        self.logger.error(f"Error in {self.__class__.__name__}: {str(e)}")
        raise e


Strategy Pattern (Encoding, Scaling, Models)

In [282]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

class EncodingStrategy:
    def one_hot(self): return OneHotEncoder(handle_unknown="ignore")
    def ordinal(self): return OrdinalEncoder()

class ScalingStrategy:
    def standard(self): return StandardScaler()
    def minmax(self): return MinMaxScaler()


In [283]:
encoder = EncodingStrategy().one_hot()
scaler = ScalingStrategy().standard()


Load Configuration Management

In [284]:
import yaml

with open("config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)


 Error Handling + Logging

In [285]:
class DataIngestion(BasePipeline):
    def load_data(self, path: str):
        try:
            self.log("Loading dataset...")
            df = pd.read_csv(path)
            return df
        except Exception as e:
            self.handle_error(e)
print(df.head())
print(df.shape)
df.info()

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst compactness  worst concavity  \
0                 0.07

Type Hints

In [286]:
def load_data(self, path: str) -> pd.DataFrame:
    print(df.head())
print(df.shape)
df.info()


(569, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error

# 4.2.2 Data Pipeline Features

Configurable Processing

In [287]:
import yaml

with open("config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)

    


 Data Validation

In [288]:
class DataValidation(BasePipeline):
    def validate(self, df: pd.DataFrame):
        assert not df.empty, "Dataset is empty!"
        assert "customerID" in df.columns, "customerID missing!"
        self.log("Data validation passed.")
        return True





Pipeline Persistence

In [289]:

preprocessor = Pipeline(steps=[
    ("encode", EncodingStrategy().one_hot()),
    ("scale", ScalingStrategy().standard())
])

joblib.dump(preprocessor, "output/artifacts/preprocessor.pkl")
pre = joblib.load("output/artifacts/preprocessor.pkl")


Reproducibility

In [290]:
import numpy as np
import random

np.random.seed(42)
random.seed(42)




# 4.2.3 Training Pipeline Features

Multiple Model Support

In [291]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

class ModelFactory:
    def get_model(self, model_name: str, params: dict):
        if model_name == "random_forest":
            return RandomForestClassifier(**params)
        elif model_name == "gradient_boost":
            return GradientBoostingClassifier(**params)
        elif model_name == "adaboost":
            return AdaBoostClassifier(**params)
        else:
            raise ValueError(f"Model {model_name} not supported")



 Hyperparameter Optimization

In [292]:


# Option 1: Use the existing pipeline
trainer = TrainingPipeline('Raw/MiniProjectS.csv')
pipeline, test_data = trainer.run()
X_test, y_test = test_data

# Get the training data from the data pipeline
df = trainer.data_pipeline.df
X = df.drop(columns=[CONFIG["target"], "customerID"])
y = df[CONFIG["target"]].apply(lambda x: 1 if str(x).lower()=="yes" else 0)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=CONFIG["test_size"], 
    random_state=CONFIG["random_state"], stratify=y
)

# Create preprocessing pipeline
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include="object").columns.tolist()

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")), 
    ("scaler", StandardScaler())
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols), 
    ("cat", cat_pipe, cat_cols)
])

# Create full pipeline with preprocessing
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

# use GridSearchCV with the full pipeline
param_grid = {
    "classifier__n_estimators": [100, 200], 
    "classifier__max_depth": [5, 10]
}

grid = GridSearchCV(full_pipeline, param_grid, cv=3, scoring="f1")
grid.fit(X_train, y_train)
best_model = grid.best_estimator_

print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_:.4f}")

INFO:root:Loading dataset...


INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


              precision    recall  f1-score   support

           0       0.73      1.00      0.85      1033
           1       0.00      0.00      0.00       374

    accuracy                           0.73      1407
   macro avg       0.37      0.50      0.42      1407
weighted avg       0.54      0.73      0.62      1407



INFO:root: Model saved to output/artifacts/churn_model.pkl


Best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Best score: 0.2062


Model Comparison

In [293]:
# Multiple Model Support with Proper Preprocessing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

class ModelFactory:
    def get_model(self, model_name: str, params: dict):
        if model_name == "random_forest":
            return RandomForestClassifier(**params)
        elif model_name == "gradient_boost":
            return GradientBoostingClassifier(**params)
        elif model_name == "adaboost":
            return AdaBoostClassifier(**params)
        else:
            raise ValueError(f"Model {model_name} not supported")

# Create preprocessing pipeline
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include="object").columns.tolist()

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")), 
    ("scaler", StandardScaler())
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols), 
    ("cat", cat_pipe, cat_cols)
])

# Test multiple models with preprocessing
results = {}
for model_name in ["random_forest", "gradient_boost", "adaboost"]:
    # Create full pipeline with preprocessing and model
    full_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", ModelFactory().get_model(model_name, {}))
    ])
    
    # Fit the pipeline
    full_pipeline.fit(X_train, y_train)
    
    # Make predictions
    preds = full_pipeline.predict(X_test)
    results[model_name] = f1_score(y_test, preds)

# Display results
for model_name, score in results.items():
    print(f"{model_name}: F1 Score = {score:.4f}")

# Find best model
best_model_name = max(results, key=results.get)
print(f"\nBest model: {best_model_name} with F1 Score: {results[best_model_name]:.4f}")

random_forest: F1 Score = 0.5416
gradient_boost: F1 Score = 0.5667
adaboost: F1 Score = 0.5602

Best model: gradient_boost with F1 Score: 0.5667


Model Persistence



In [294]:

joblib.dump(best_model, "output/artifacts/best_model.pkl")

with open("output/artifacts/evaluation_report.txt", "w") as f:
    f.write(str(results))


# 4.2.4 Inference Pipeline Features


Batch Prediction

Single Sample Prediction

Input Validation

Probability Outputs

In [295]:
import numpy as np
import joblib

# Load the saved model
model = joblib.load("best_model.pkl")


X_batch = np.random.rand(5, 5)  
batch_preds = model.predict(X_batch)
print("Batch Predictions:", batch_preds)


X_single = np.random.rand(1, 5) 
single_pred = model.predict(X_single)
print("Single Prediction:", single_pred)

try:
    X_wrong = np.random.rand(5) 
    model.predict(X_wrong)
except Exception as e:
    print("Input Validation Error:", str(e))


probs = model.predict_proba(X_batch)
print("Prediction Probabilities:\n", probs)


Batch Predictions: [0 1 0 0 0]
Single Prediction: [1]
Input Validation Error: Expected 2D array, got 1D array instead:
array=[0.4303059  0.20052473 0.49159455 0.06420894 0.5819714 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Prediction Probabilities:
 [[0.63 0.37]
 [0.32 0.68]
 [0.63 0.37]
 [0.88 0.12]
 [0.69 0.31]]


# 4.3 Pipeline Testing and Validation


Unit Tests


Integration Tests:

Data Validation Tests

Model Performance Tests

In [302]:
import unittest
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Dummy dataset
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)



# Unit test: check model is trained
def test_model_training():
    assert model is not None, "Model not initialized"
    assert hasattr(model, "predict"), "Model has no predict method"
    print("Model Training Test: PASSED")

test_model_training()



# Integration test: end-to-end pipeline
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Integration Test Accuracy:", acc)



# Data validation: check for NaN values
assert not np.isnan(X_train).any(), "NaN values in training data"
assert not np.isnan(X_test).any(), "NaN values in testing data"
print("Data Validation Test: PASSED")


# Model performance threshold
threshold = 0.6
if acc >= threshold:
    print("Model Performance Test: PASSED (Accuracy >= 0.6)")
else:
    print("Model Performance Test: FAILED (Accuracy < 0.6)")


Model Training Test: PASSED
Integration Test Accuracy: 0.65
Data Validation Test: PASSED
Model Performance Test: PASSED (Accuracy >= 0.6)


In [None]:
import pandas as pd

# Load the data from your existing processed file
df = pd.read_csv('Processed/MiniProjectS_Model_Evaluation.csv')


df.to_csv('Processed/MiniProjectS_Production_Ready.csv', index=False)


Data saved successfully! Shape: (569, 35)


In [4]:
# First, let's check what columns are actually available in your data
print("Available columns in the dataset:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nData shape:", df.shape)

Available columns in the dataset:
['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension', 'target', 'Churn_Prob', 'Retention_Segment', 'Retention_Priority', 'Risk_Segment']

First few rows:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0       