In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

#### Load dataset

In [None]:
# Read the raw survey export and print its schema (dtypes and non-null counts).
survey_csv = "survey_results.csv"
df = pd.read_csv(survey_csv)
df.info()

In [None]:
df[df.duplicated(keep=False)].shape

In [None]:
# There are 10 duplicates

In [None]:
# Keep the first occurrence of each fully duplicated row and discard the repeats.
df = df.loc[~df.duplicated(keep='first')]

In [None]:
df['age'].max()

In [None]:
df['age'].min()

In [None]:
df.loc[df['age'] > 100,:].shape

In [None]:
# Drop 9 records that have extremely high age values

In [None]:
# Drop the implausibly high ages counted by the `age > 100` check above.
# The filter has to be the complement of that check (<= 100); the previous
# strict `< 100` also discarded respondents who are exactly 100 years old.
# Rows with a missing age are still dropped, because NaN <= 100 is False.
df = df[df['age'] <= 100]
df.shape

In [None]:
df['income_levels'].value_counts()

In [None]:
df['income_levels'].isnull().sum()

In [None]:
# Treat a missing income level as its own explicit category.
df['income_levels'] = df['income_levels'].fillna("Not_reported")

In [None]:
df['income_levels'].value_counts()

In [None]:
# Missing income levels indicated as "Not_reported"

In [None]:
df["consume_frequency(weekly)"].value_counts()

In [None]:
df.loc[df['consume_frequency(weekly)'].isnull(),:].shape

In [None]:
df["consume_frequency(weekly)"].mode()[0]

In [None]:
# Replace null values in "consume_frequency(weekly)" with the column's mode.
# This should not have a big impact, as only 8 of the 30010 records are affected.

In [None]:
# Impute missing consumption frequencies with the most common category.
most_common_frequency = df["consume_frequency(weekly)"].mode()[0]
df["consume_frequency(weekly)"] = df["consume_frequency(weekly)"].fillna(most_common_frequency)

In [None]:
df["consume_frequency(weekly)"].value_counts()

In [None]:
df["purchase_channel"].value_counts()

In [None]:
df["purchase_channel"].mode()[0]

In [None]:
df.loc[df['purchase_channel'].isnull(),:].shape

In [None]:
# Replace null values in "purchase_channel" with the column's mode.
# This should not have a big impact, as only 10 of the 30010 records are affected.

In [None]:
# Impute missing purchase channels with the most common category.
most_common_channel = df["purchase_channel"].mode()[0]
df["purchase_channel"] = df["purchase_channel"].fillna(most_common_channel)

In [None]:
df['zone'].value_counts()

In [None]:
# Normalise the two misspelt zone labels to their canonical spelling.
zone_typo_fixes = {"Metor": "Metro", "urbna": "Urban"}
df['zone'] = df['zone'].replace(zone_typo_fixes)

df['zone'].value_counts()

In [None]:
df['current_brand'].value_counts()

In [None]:
# Normalise the misspelt / lower-cased brand labels to their canonical spelling.
brand_typo_fixes = {"Establishd": "Established", "newcomer": "Newcomer"}
df['current_brand'] = df['current_brand'].replace(brand_typo_fixes)

df['current_brand'].value_counts()

#### Feature engineering

In [None]:
# Age bands are right-closed; include_lowest makes the first band contain 18 itself:
# [18, 25], (25, 35], (35, 45], (45, 55], (55, inf].
bins = [18, 25, 35, 45, 55, float('inf')]
labels = ['18–25', '26–35', '36–45', '46–55', '56+']

age_bands = pd.cut(df['age'], bins, right=True, labels=labels, include_lowest=True)
df['age_group'] = age_bands


In [None]:
df['age_group'].value_counts()

In [None]:
df.shape

In [None]:
# The raw age is superseded by age_group, so remove it from the frame.
df = df.drop('age', axis=1)
df.shape

In [None]:
df['consume_frequency(weekly)'].value_counts()

In [None]:
# Ordinal score for weekly consumption frequency: lowest band -> 1, highest -> 3.
frequency_levels = ["0-2 times", "3-4 times", "5-7 times"]
cf_mapping = {level: score for score, level in enumerate(frequency_levels, start=1)}

df['cf_score'] = df['consume_frequency(weekly)'].map(cf_mapping)

df['cf_score'].value_counts()

In [None]:
df['awareness_of_other_brands'].value_counts()

In [None]:
# Ordinal score for awareness of other brands: lowest band -> 1, highest -> 3.
awareness_levels = ["0 to 1", "2 to 4", "above 4"]
ab_mapping = {level: score for score, level in enumerate(awareness_levels, start=1)}

df['ab_score'] = df['awareness_of_other_brands'].map(ab_mapping)

df['ab_score'].value_counts()

In [None]:
# Share of the combined (frequency + awareness) score that comes from frequency.
combined_score = df['cf_score'] + df['ab_score']
df['cf_ab_score'] = df['cf_score'].div(combined_score)
df[['cf_score', 'ab_score', 'cf_ab_score']].sample(5)

In [None]:
df['zone'].value_counts()

In [None]:
# Ordinal score for the zone, from Rural (1) up to Metro (4).
zone_levels = ["Rural", "Semi-Urban", "Urban", "Metro"]
zone_mapping = {level: score for score, level in enumerate(zone_levels, start=1)}

df['zone_score'] = df['zone'].map(zone_mapping)

df['zone_score'].value_counts()

In [None]:
df['income_levels'].value_counts()

In [None]:
# Ordinal score for income; the "Not_reported" placeholder scores 0,
# and the real income bands run from 1 to 5.
income_levels_ordered = ["Not_reported", "<10L", "10L - 15L", "16L - 25L", "26L - 35L", "> 35L"]
income_mapping = {level: score for score, level in enumerate(income_levels_ordered)}

df['income_score'] = df['income_levels'].map(income_mapping)

df['income_score'].value_counts()

In [None]:
# Interaction feature: zone score multiplied by income score
# (0 whenever income was not reported, since that level scores 0).
df['zas_score'] = df['zone_score'].mul(df['income_score'])
df[['zone_score', 'income_score', 'zas_score']].sample(5)

In [None]:
df['reasons_for_choosing_brands'].value_counts()

In [None]:
df['current_brand'].value_counts()

In [None]:
# bsi flag: 1 when the respondent is NOT on an "Established" brand and
# chose their brand for Price or Quality; 0 otherwise.
not_established = df['current_brand'].ne("Established")
price_or_quality = df['reasons_for_choosing_brands'].isin(['Price', 'Quality'])
df['bsi'] = np.where(not_established & price_or_quality, 1, 0)

df[['current_brand','reasons_for_choosing_brands','bsi']].sample(5)

In [None]:
pd.crosstab(df['age_group'],df['occupation'])

In [None]:
df.loc[(df['age_group'] == "56+") & (df['occupation'] == 'Student'), :]

In [None]:
df.shape

In [None]:
# Remove the rows that are both in the 56+ age band and recorded as students.
is_senior_student = (df['age_group'] == "56+") & (df['occupation'] == 'Student')
df = df[~is_senior_student]

In [None]:
df.shape

In [None]:
df['zas_score'].nunique()

In [None]:
df.loc[df['bsi'] == 0, :].shape

### MODEL BUILDING

In [None]:
df.info()

#### Exclude these columns from model building process. 

In [None]:
# Identifier, the raw columns already replaced by engineered scores, and the target
# itself are all kept out of the feature matrix.
columns_to_exclude = [
    'respondent_id',
    'income_levels',
    'awareness_of_other_brands',
    'consume_frequency(weekly)',
    'zone',
    'price_range',
]

y = df['price_range']
X = df.drop(columns_to_exclude, axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; stratify on the target so both splits
# have the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
X_train.info()

#### Perform label-encoding & one-hot encoding on respective columns

In [None]:
# Ordered categoricals that are mapped to integer codes via manual_mappings.
label_encode_columns = [
    'preferable_consumption_size',
    'health_concerns',
    'age_group',
]

# Nominal categoricals intended for one-hot encoding.
# NOTE(review): the get_dummies step further down selects columns by object dtype
# rather than reading this list - confirm the two stay in sync.
one_hot_encode_columns = [
    'typical_consumption_situations',
    'gender',
    'occupation',
    'current_brand',
    'reasons_for_choosing_brands',
    'flavor_preference',
    'purchase_channel',
]



In [None]:
df['zone_score'].value_counts()

In [None]:
df['income_score'].value_counts()

In [None]:
df['age_group'].value_counts()

In [None]:
df['typical_consumption_situations'].value_counts()

In [None]:
df['occupation'].value_counts()

In [None]:
df['current_brand'].value_counts()

In [None]:
df['preferable_consumption_size'].value_counts()

In [None]:
df['reasons_for_choosing_brands'].value_counts()

In [None]:
df['flavor_preference'].value_counts()

In [None]:
df['purchase_channel'].value_counts()

In [None]:
df['packaging_preference'].value_counts()

In [None]:
df['health_concerns'].value_counts()

In [None]:
# Levels of each ordinal column, listed from lowest to highest; a level's
# integer code is its position in the list (starting at 0).
_ordered_levels = {
    'age_group': ['18–25', '26–35', '36–45', '46–55', '56+'],
    'preferable_consumption_size': ['Small (250 ml)', 'Medium (500 ml)', 'Large (1 L)'],
    'health_concerns': [
        "Low (Not very concerned)",
        "Medium (Moderately health-conscious)",
        "High (Very health-conscious)",
    ],
}

manual_mappings = {
    column: {level: code for code, level in enumerate(levels)}
    for column, levels in _ordered_levels.items()
}

In [None]:
# Replace each ordinal column of the training features with its integer codes.
encoded_train_columns = {
    col: X_train[col].map(manual_mappings[col]) for col in label_encode_columns
}
X_train = X_train.assign(**encoded_train_columns)



In [None]:
# Apply the same ordinal codes to the test features.
encoded_test_columns = {
    col: X_test[col].map(manual_mappings[col]) for col in label_encode_columns
}
X_test = X_test.assign(**encoded_test_columns)



In [None]:
X_train.info()

In [None]:
X_train['preferable_consumption_size'].head()

In [None]:
# Cast the mapped age_group codes to plain int64 in both splits.
X_train = X_train.astype({'age_group': 'int64'})
X_test = X_test.astype({'age_group': 'int64'})


In [None]:
X_test.info()

In [None]:
# One-hot encode every remaining object-dtype column.
# The column list is taken from the training split once and reused for the test
# split, so both splits encode exactly the same source columns.
categorical_columns = list(X_train.select_dtypes(include='object').columns)

# The training split defines the feature space (first level of each column dropped).
X_train = pd.get_dummies(
    X_train,
    columns=categorical_columns,
    drop_first=True
)

# Encode the test split WITHOUT drop_first and then project it onto the training
# columns. Encoding the two splits independently can yield different column sets
# (a category absent from one split, or a different "first" level being dropped),
# which silently misaligns features at predict time. After the reindex:
#   - the baseline level dropped in training is dropped here too,
#   - categories never seen in training are discarded,
#   - training categories absent from the test split become all-zero columns,
#   - the column order matches X_train exactly.
X_test = pd.get_dummies(
    X_test,
    columns=categorical_columns,
    drop_first=False
).reindex(columns=X_train.columns, fill_value=0)

In [None]:
X_train.head()

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
# Price-range labels in ascending order; the integer class id is the position.
_price_ranges_ascending = ['50-100', '100-150', '150-200', '200-250']
manual_mappings_target = {
    price_range: class_id for class_id, price_range in enumerate(_price_ranges_ascending)
}


In [None]:
def _encode_price_range(target):
    """Map price-range labels to their integer class ids (int64)."""
    return target.map(manual_mappings_target).astype('int64')


y_train = _encode_price_range(y_train)
y_test = _encode_price_range(y_test)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

#### Gaussian NB

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Baseline: Gaussian Naive Bayes with default settings (fit returns the estimator).
model_nb = GaussianNB().fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_nb.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred)

print("Accuracy on Test Data using NB:", accuracy_nb)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
report_dict_gnb = classification_report(y_test, y_pred, output_dict=True)
report_dict_gnb

In [None]:
nb_params = model_nb.get_params()
nb_params

In [None]:
import mlflow

#### Use the code below to log models one by one

In [None]:
# mlflow.set_experiment("Beverage Price Predictor")
# mlflow.set_tracking_uri("http://127.0.0.1:5000")

# with mlflow.start_run(run_name='Gaussian NB'):
#     mlflow.log_params(nb_params)
#     mlflow.log_metrics(
#         {'accuracy' : report_dict_gnb['accuracy'],
#          'recall_class_0' : report_dict_gnb['0']['recall'],
#          'recall_class_1' : report_dict_gnb['1']['recall'],
#          'recall_class_2' : report_dict_gnb['2']['recall'],
#          'recall_class_3' : report_dict_gnb['3']['recall'],
#          'precision_class_0' : report_dict_gnb['0']['precision'],
#          'precision_class_1' : report_dict_gnb['1']['precision'],
#          'precision_class_2' : report_dict_gnb['2']['precision'],
#          'precision_class_3' : report_dict_gnb['3']['precision'],
#          'f1-score_macro' : report_dict_gnb['macro avg']['f1-score'],
#          'f1-score_weighted' : report_dict_gnb['weighted avg']['f1-score'],         
#         }
#     )
#     mlflow.sklearn.log_model(model_nb, "Gaussian Naive Bayes")

#### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 1000 to avoid convergence issues.
model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred)

print("Accuracy on Test Data using Logistic Regression:", accuracy_lr)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
report_dict_lr = classification_report(y_test, y_pred, output_dict=True)
# report_dict_lr

In [None]:
lr_params = model_lr.get_params()
lr_params

#### SVC

In [None]:
from sklearn.svm import SVC

##### Standardize the features using StandardScaler for SVC

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform to
# both splits so that no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Support-vector classifier on the standardised features, default kernel.
# Available kernels are 'linear', 'poly', 'rbf', 'sigmoid'.
model_svc = SVC(probability=True).fit(X_train_scaled, y_train)

# Score on the (scaled) held-out split.
y_pred = model_svc.predict(X_test_scaled)
accuracy_svc = accuracy_score(y_test, y_pred)

print("Accuracy on Test Data using SVC:", accuracy_svc)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
report_dict_svc = classification_report(y_test, y_pred, output_dict=True)
# report_dict_svc

In [None]:
svc_params = model_svc.get_params()
svc_params

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults; seed fixed, all CPU cores used (n_jobs=-1).
model_rf = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred)

print("Accuracy on Test Data using RandomForestClassifier:", accuracy_rf)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
report_dict_rf = classification_report(y_test, y_pred, output_dict=True)
# report_dict_rf

In [None]:
rf_params = model_rf.get_params()
rf_params

#### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost with library defaults; seed fixed, all CPU cores used (n_jobs=-1).
model_xgb = XGBClassifier(random_state=42, n_jobs=-1)
model_xgb.fit(X_train, y_train)

# Score on the held-out split.
y_pred = model_xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred)

print("Accuracy on Test Data using XGboost Classifier:", accuracy_xgb)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
report_dict_xgb = classification_report(y_test, y_pred, output_dict=True)
# report_dict_xgb

In [None]:
xgb_params = model_xgb.get_params()
xgb_params

#### Light GBM

In [None]:
!pip install lightgbm

In [None]:
import lightgbm as lgb

# LightGBM classifier with library defaults; only the seed is fixed.
model_lgb = lgb.LGBMClassifier(random_state=42)

In [None]:
# Train on the training split, then score on the held-out split.
model_lgb.fit(X_train, y_train)
y_pred = model_lgb.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred)

print("Accuracy on Test Data using Light GBM:", accuracy_lgb)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
report_dict_lgb = classification_report(y_test, y_pred, output_dict=True)
# report_dict_lgb

In [None]:
lgb_params = model_lgb.get_params()
lgb_params

#### Dagshub

In [None]:
import dagshub
dagshub.init(repo_owner='arjun.kopites', repo_name='CB_AT_I_1', mlflow=True)

In [None]:
import mlflow


#### Single function to log the models using MLFlow

In [None]:
# Display names of the candidate models. The ORDER matters: a model's position
# in this list is logged to MLflow as its numeric "model_name_id".
MODEL_NAMES = [
    "Gaussian NB",          # id 0
    "Logistic Regression",  # id 1
    "Random Forest",        # id 2
    "SVC",                  # id 3
    "XGBoost",              # id 4
    "LightGBM",             # id 5
]

In [None]:
def log_model_to_mlflow(model_name, model, params, report_dict):
    """Log one fitted model, its parameters and its test metrics as an MLflow run.

    Args:
        model_name: Display name of the model. Must be an entry of MODEL_NAMES;
            it is used as the run name, as the "model_name" tag and as the
            artifact path the model is saved under.
        model: Fitted estimator, saved with ``mlflow.sklearn.log_model``.
        params: Mapping of hyperparameters, passed to ``mlflow.log_params``.
        report_dict: Result of ``classification_report(..., output_dict=True)``.
            Must contain 'accuracy', 'macro avg' and 'weighted avg'; every other
            dict-valued entry is treated as a per-class row.

    Raises:
        ValueError: If ``model_name`` is not in MODEL_NAMES.
        KeyError: If ``report_dict`` lacks one of the expected entries.
    """
    model_id = MODEL_NAMES.index(model_name)

    # Per-class rows are the dict-valued entries that are not aggregate rows
    # ("macro avg", "weighted avg", ...). Deriving them from the report instead
    # of hard-coding '0'..'3' keeps the helper valid for any number of classes.
    class_labels = [
        label for label, row in report_dict.items()
        if isinstance(row, dict) and not label.endswith(' avg')
    ]

    with mlflow.start_run(run_name=model_name):

        # Human-readable tag (click run -> see model name)
        mlflow.set_tag("model_name", model_name)

        # Numeric model ID (position in MODEL_NAMES), usable on charts
        mlflow.log_param("model_name_id", model_id)

        # Log hyperparameters
        mlflow.log_params(params)

        # Headline metrics first, then precision / recall for each class
        metrics = {
            'accuracy': report_dict['accuracy'],
            'f1_macro': report_dict['macro avg']['f1-score'],
            'f1_weighted': report_dict['weighted avg']['f1-score'],
        }

        for cls in class_labels:
            metrics[f'precision_class_{cls}'] = report_dict[cls]['precision']
            metrics[f'recall_class_{cls}'] = report_dict[cls]['recall']

        mlflow.log_metrics(metrics)

        # Save model
        mlflow.sklearn.log_model(model, artifact_path=model_name)


In [None]:
# Set these ONCE, and in this order: the tracking URI must be configured BEFORE
# the experiment is selected, because set_experiment() looks up / creates the
# experiment on whichever tracking server is active at the time of the call.
mlflow.set_tracking_uri("https://dagshub.com/arjun.kopites/CB_AT_I_1.mlflow")
mlflow.set_experiment("Beverage Price Predictor")

# --- Gaussian NB ---
log_model_to_mlflow("Gaussian NB", model_nb, nb_params, report_dict_gnb)

# --- Logistic Regression ---
log_model_to_mlflow("Logistic Regression", model_lr, lr_params, report_dict_lr)

# --- Random Forest ---
log_model_to_mlflow("Random Forest", model_rf, rf_params, report_dict_rf)

# --- SVC ---
log_model_to_mlflow("SVC", model_svc, svc_params, report_dict_svc)




In [None]:
# XGBoost is logged outside the shared helper because it is saved with the
# mlflow.xgboost flavour instead of mlflow.sklearn.
model_name = "XGBoost"
model_name_id = MODEL_NAMES.index(model_name)

with mlflow.start_run(run_name=model_name):
    # Readable name as a tag, list position as a numeric param.
    mlflow.set_tag("model_name", model_name)
    mlflow.log_param("model_name_id", model_name_id)

    # Hyperparameters of the fitted booster.
    mlflow.log_params(xgb_params)

    # Headline scores, followed by precision / recall for each of the four classes.
    metrics = {
        'accuracy': report_dict_xgb['accuracy'],
        'f1_macro': report_dict_xgb['macro avg']['f1-score'],
        'f1_weighted': report_dict_xgb['weighted avg']['f1-score'],
    }
    for cls in ['0', '1', '2', '3']:
        class_row = report_dict_xgb[cls]
        metrics.update({
            f'precision_class_{cls}': class_row['precision'],
            f'recall_class_{cls}': class_row['recall'],
        })
    mlflow.log_metrics(metrics)

    # Persist the model under an artifact path equal to its display name.
    mlflow.xgboost.log_model(model_xgb, artifact_path=model_name)


In [None]:
# LightGBM is logged outside the shared helper because it is saved with the
# mlflow.lightgbm flavour instead of mlflow.sklearn.
model_name = "LightGBM"
model_name_id = MODEL_NAMES.index(model_name)

with mlflow.start_run(run_name=model_name):
    # Readable name as a tag, list position as a numeric param.
    mlflow.set_tag("model_name", model_name)
    mlflow.log_param("model_name_id", model_name_id)

    # Hyperparameters of the fitted model.
    mlflow.log_params(lgb_params)

    # Headline scores, followed by precision / recall for each of the four classes.
    metrics = {
        'accuracy': report_dict_lgb['accuracy'],
        'f1_macro': report_dict_lgb['macro avg']['f1-score'],
        'f1_weighted': report_dict_lgb['weighted avg']['f1-score'],
    }
    for cls in ['0', '1', '2', '3']:
        class_row = report_dict_lgb[cls]
        metrics.update({
            f'precision_class_{cls}': class_row['precision'],
            f'recall_class_{cls}': class_row['recall'],
        })
    mlflow.log_metrics(metrics)

    # Persist the model under an artifact path equal to its display name.
    mlflow.lightgbm.log_model(model_lgb, artifact_path=model_name)


#### Code to delete an already registered model

In [None]:
# import mlflow
# from mlflow.tracking import MlflowClient

In [None]:
# client = MlflowClient()

In [None]:
# model_name = "lightGBM_model"  # Replace with the actual name of your registered model
# client.delete_registered_model(name=model_name)

#### Register the models

In [None]:
# `model_name` is the name the model gets in the registry; `artifact_path` is
# where the model was stored inside its run. The XGBoost run saves the model
# with artifact_path="XGBoost", so the runs:/ URI must use that path - building
# it from the registry name would point at an artifact that does not exist.
model_name = "XGBoost_Classifier"
artifact_path = "XGBoost"
run_id = input("Enter Run ID:").strip()  # strip stray whitespace from a pasted id
model_uri = f"runs:/{run_id}/{artifact_path}"

result = mlflow.register_model(
    model_uri, model_name
)



In [None]:
# `model_name` is the name the model gets in the registry; `artifact_path` is
# where the model was stored inside its run. The LightGBM run saves the model
# with artifact_path="LightGBM", so the runs:/ URI must use that path - building
# it from the registry name would point at an artifact that does not exist.
model_name = "Light_GBM_Classifier"
artifact_path = "LightGBM"
run_id = input("Enter Run ID:").strip()  # strip stray whitespace from a pasted id
model_uri = f"runs:/{run_id}/{artifact_path}"

result = mlflow.register_model(
    model_uri, model_name
)



In [None]:
# `model_name` is the name the model gets in the registry; `artifact_path` is
# where the model was stored inside its run. log_model_to_mlflow saves the model
# under its display name ("Random Forest"), so the runs:/ URI must use that path -
# building it from the registry name would point at an artifact that does not exist.
model_name = "Random_Forest_Classifier"
artifact_path = "Random Forest"
run_id = input("Enter Run ID:").strip()  # strip stray whitespace from a pasted id
model_uri = f"runs:/{run_id}/{artifact_path}"

result = mlflow.register_model(
    model_uri, model_name
)



In [None]:
# `model_name` is the name the model gets in the registry; `artifact_path` is
# where the model was stored inside its run. log_model_to_mlflow saves the model
# under its display name ("SVC"), so the runs:/ URI must use that path -
# building it from the registry name would point at an artifact that does not exist.
model_name = "Support_Vector_Classifier"
artifact_path = "SVC"
run_id = input("Enter Run ID:").strip()  # strip stray whitespace from a pasted id
model_uri = f"runs:/{run_id}/{artifact_path}"

result = mlflow.register_model(
    model_uri, model_name
)



In [None]:
# `model_name` is the name the model gets in the registry; `artifact_path` is
# where the model was stored inside its run. log_model_to_mlflow saves the model
# under its display name ("Gaussian NB"), so the runs:/ URI must use that path -
# building it from the registry name would point at an artifact that does not exist.
model_name = "Gaussian_Naive_Bayes_Classifier"
artifact_path = "Gaussian NB"
run_id = input("Enter Run ID:").strip()  # strip stray whitespace from a pasted id
model_uri = f"runs:/{run_id}/{artifact_path}"

result = mlflow.register_model(
    model_uri, model_name
)



In [None]:
# `model_name` is the name the model gets in the registry; `artifact_path` is
# where the model was stored inside its run. log_model_to_mlflow saves the model
# under its display name ("Logistic Regression"), so the runs:/ URI must use that
# path - building it from the registry name would point at an artifact that does not exist.
model_name = "Logistic_Regression_Classifier"
artifact_path = "Logistic Regression"
run_id = input("Enter Run ID:").strip()  # strip stray whitespace from a pasted id
model_uri = f"runs:/{run_id}/{artifact_path}"

result = mlflow.register_model(
    model_uri, model_name
)



#### Load the champion model (LightGBM in this case)

In [None]:
# Load the LightGBM artifact straight from the run that produced it and
# sanity-check it on the first few test rows.
champion_run_id = 'f32289d3ec8242678211bc75996dd8a7'
model_uri = f'runs:/{champion_run_id}/LightGBM'
loaded_model = mlflow.lightgbm.load_model(model_uri)

y_pred = loaded_model.predict(X_test)
y_pred[:4]

#### Transition from development env to production env

In [None]:
client = mlflow.MlflowClient()
client.search_registered_models()

In [None]:
# Copy the version currently aliased "champion" in the development registry entry
# into the production registered model.
# (`production_model_uri` holds a registered-model NAME, not a URI.)
development_model_uri = "models:/Light_GBM_Classifier@champion"
production_model_uri = 'beverage_price_predictor'

client.copy_model_version(
    dst_name=production_model_uri,
    src_model_uri=development_model_uri,
)

#### Load production model & make prediction

In [None]:
# Load the production model from the MLflow Model Registry.
# A "models:/<registered name>/latest" URI is resolved through the registry and
# returns the newest version of the production entry; the previous "runs:/..."
# URI bypassed the registry and simply reloaded the development run artifact.
model_uri = "models:/beverage_price_predictor/latest"
loaded_model = mlflow.lightgbm.load_model(model_uri)

# Make predictions locally
y_pred = loaded_model.predict(X_test)
y_pred[:4]