In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

#### Load dataset

In [2]:
df = pd.read_csv("survey_results.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30010 entries, 0 to 30009
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   respondent_id                   30010 non-null  object
 1   age                             30010 non-null  int64 
 2   gender                          30010 non-null  object
 3   zone                            30010 non-null  object
 4   occupation                      30010 non-null  object
 5   income_levels                   21946 non-null  object
 6   consume_frequency(weekly)       30002 non-null  object
 7   current_brand                   30010 non-null  object
 8   preferable_consumption_size     30010 non-null  object
 9   awareness_of_other_brands       30010 non-null  object
 10  reasons_for_choosing_brands     30010 non-null  object
 11  flavor_preference               30010 non-null  object
 12  purchase_channel                30000 non-null

In [3]:
df[df.duplicated(keep=False)].shape

(20, 17)

In [4]:
# There are 10 duplicates

In [5]:
df = df.drop_duplicates(keep='first')

In [6]:
df['age'].max()

604

In [7]:
df['age'].min()

18

In [8]:
df.loc[df['age'] > 100,:].shape

(9, 17)

In [9]:
# Drop 9 records that have extremely high age values

In [10]:
df = df[df['age'] < 100]
df.shape

(29991, 17)

In [11]:
df['income_levels'].value_counts()

income_levels
16L - 25L    5897
10L - 15L    5251
<10L         4661
26L - 35L    3872
> 35L        2250
Name: count, dtype: int64

In [12]:
df['income_levels'].isnull().sum()

8060

In [13]:
df.loc[df['income_levels'].isnull(), 'income_levels'] = "Not_reported"

In [14]:
df['income_levels'].value_counts()

income_levels
Not_reported    8060
16L - 25L       5897
10L - 15L       5251
<10L            4661
26L - 35L       3872
> 35L           2250
Name: count, dtype: int64

In [15]:
# Missing income levels indicated as "Not_reported"

In [16]:
df["consume_frequency(weekly)"].value_counts()

consume_frequency(weekly)
3-4 times    11786
5-7 times     9774
0-2 times     8423
Name: count, dtype: int64

In [17]:
df.loc[df['consume_frequency(weekly)'].isnull(),:].shape

(8, 17)

In [18]:
df["consume_frequency(weekly)"].mode()[0]

'3-4 times'

In [19]:
# Simply replace null-values in "consume_frequency(weekly)" with mode.
# It's not going to have big impact as there are only 8 such records out of 30010.

In [20]:
df.loc[df['consume_frequency(weekly)'].isnull(),"consume_frequency(weekly)"] = df["consume_frequency(weekly)"].mode()[0]

In [21]:
df["consume_frequency(weekly)"].value_counts()

consume_frequency(weekly)
3-4 times    11794
5-7 times     9774
0-2 times     8423
Name: count, dtype: int64

In [22]:
df["purchase_channel"].value_counts()

purchase_channel
Online          16562
Retail Store    13419
Name: count, dtype: int64

In [23]:
df["purchase_channel"].mode()[0]

'Online'

In [24]:
df.loc[df['purchase_channel'].isnull(),:].shape

(10, 17)

In [25]:
# Simply replace null-values in "purchase_channel" with mode. 
# It's not going to have big impact as there are only 10 such records out of 30010.

In [26]:
df.loc[df['purchase_channel'].isnull(),"purchase_channel"] = df["purchase_channel"].mode()[0]

In [27]:
df['zone'].value_counts()

zone
Metro         11906
Urban         10686
Semi-Urban     5274
Rural          2116
urbna             5
Metor             4
Name: count, dtype: int64

In [28]:
df.loc[df['zone'] == "Metor", 'zone'] = "Metro"
df.loc[df['zone'] == "urbna", 'zone'] = "Urban"

df['zone'].value_counts()

zone
Metro         11910
Urban         10691
Semi-Urban     5274
Rural          2116
Name: count, dtype: int64

In [29]:
df['current_brand'].value_counts()

current_brand
Established    15442
Newcomer       14499
newcomer          30
Establishd        20
Name: count, dtype: int64

In [30]:
df.loc[df['current_brand'] == "Establishd", 'current_brand'] = "Established"
df.loc[df['current_brand'] == "newcomer", 'current_brand'] = "Newcomer"

df['current_brand'].value_counts()

current_brand
Established    15462
Newcomer       14529
Name: count, dtype: int64

#### Feature engineering

In [31]:
bins = [18, 25, 35, 45, 55, float('inf')]
labels = ['18–25', '26–35', '36–45', '46–55', '56+']

df['age_group'] = pd.cut(
    df['age'], 
    bins=bins, 
    labels=labels, 
    right=True,
    include_lowest=True
)


In [32]:
df['age_group'].value_counts()

age_group
18–25    10468
26–35     9093
36–45     5972
46–55     2966
56+       1492
Name: count, dtype: int64

In [33]:
df.shape

(29991, 18)

In [34]:
df = df.drop(columns=['age'])
df.shape

(29991, 17)

In [35]:
df['consume_frequency(weekly)'].value_counts()

consume_frequency(weekly)
3-4 times    11794
5-7 times     9774
0-2 times     8423
Name: count, dtype: int64

In [36]:
cf_mapping = {
    "0-2 times" : 1,
    "3-4 times" : 2,
    "5-7 times" : 3
}

df['cf_score'] = df['consume_frequency(weekly)'].map(cf_mapping)

df['cf_score'].value_counts()

cf_score
2    11794
3     9774
1     8423
Name: count, dtype: int64

In [37]:
df['awareness_of_other_brands'].value_counts()

awareness_of_other_brands
0 to 1     12966
2 to 4     11225
above 4     5800
Name: count, dtype: int64

In [38]:
ab_mapping = {
    "0 to 1" : 1,
    "2 to 4" : 2,
    "above 4" : 3
}

df['ab_score'] = df['awareness_of_other_brands'].map(ab_mapping)

df['ab_score'].value_counts()

ab_score
1    12966
2    11225
3     5800
Name: count, dtype: int64

In [39]:
df['cf_ab_score'] = df['cf_score'] / (df['cf_score'] + df['ab_score'])
df[['cf_score', 'ab_score', 'cf_ab_score']].sample(5)

Unnamed: 0,cf_score,ab_score,cf_ab_score
24253,1,1,0.5
5788,3,1,0.75
28846,3,1,0.75
12859,3,1,0.75
21598,2,3,0.4


In [40]:
df['zone'].value_counts()

zone
Metro         11910
Urban         10691
Semi-Urban     5274
Rural          2116
Name: count, dtype: int64

In [41]:
zone_mapping = {
    "Rural" : 1,
    "Semi-Urban" : 2,
    "Urban" : 3,
    "Metro" : 4
}

df['zone_score'] = df['zone'].map(zone_mapping)

df['zone_score'].value_counts()

zone_score
4    11910
3    10691
2     5274
1     2116
Name: count, dtype: int64

In [42]:
df['income_levels'].value_counts()

income_levels
Not_reported    8060
16L - 25L       5897
10L - 15L       5251
<10L            4661
26L - 35L       3872
> 35L           2250
Name: count, dtype: int64

In [43]:
income_mapping = {
    "Not_reported" : 0,
    "<10L" : 1,
    "10L - 15L" : 2,
    "16L - 25L" : 3,
    "26L - 35L" : 4,
    "> 35L" : 5
}

df['income_score'] = df['income_levels'].map(income_mapping)

df['income_score'].value_counts()

income_score
0    8060
3    5897
2    5251
1    4661
4    3872
5    2250
Name: count, dtype: int64

In [44]:
df['zas_score'] = df['zone_score'] * df['income_score']
df[['zone_score', 'income_score', 'zas_score']].sample(5)

Unnamed: 0,zone_score,income_score,zas_score
22545,3,2,6
16608,3,0,0
12321,4,4,16
18401,2,0,0
21363,4,0,0


In [45]:
df['reasons_for_choosing_brands'].value_counts()

reasons_for_choosing_brands
Price               14138
Availability         6590
Brand Reputation     4661
Quality              4602
Name: count, dtype: int64

In [46]:
df['current_brand'].value_counts()

current_brand
Established    15462
Newcomer       14529
Name: count, dtype: int64

In [47]:
df['bsi'] = np.where(
    (df['current_brand'] != "Established") & (df['reasons_for_choosing_brands'].isin(['Price','Quality'])),
    1,
    0
)

df[['current_brand','reasons_for_choosing_brands','bsi']].sample(5)

Unnamed: 0,current_brand,reasons_for_choosing_brands,bsi
14546,Established,Brand Reputation,0
20693,Established,Brand Reputation,0
7089,Newcomer,Price,1
23120,Newcomer,Price,1
17654,Established,Quality,0


In [48]:
pd.crosstab(df['age_group'],df['occupation'])

occupation,Entrepreneur,Retired,Student,Working Professional
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18–25,535,0,7328,2605
26–35,1826,0,697,6570
36–45,1619,0,0,4353
46–55,799,0,0,2167
56+,221,1130,35,106


In [49]:
df.loc[(df['age_group'] == "56+") & (df['occupation'] == 'Student'), :]

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,...,typical_consumption_situations,price_range,age_group,cf_score,ab_score,cf_ab_score,zone_score,income_score,zas_score,bsi
182,R00183,F,Urban,Student,Not_reported,5-7 times,Established,Large (1 L),above 4,Price,...,Casual (eg. At home),150-200,56+,3,3,0.5,3,0,0,0
3526,R03525,F,Semi-Urban,Student,Not_reported,0-2 times,Established,Medium (500 ml),0 to 1,Price,...,"Active (eg. Sports, gym)",50-100,56+,1,1,0.5,2,0,0,0
3527,R03526,M,Semi-Urban,Student,Not_reported,0-2 times,Newcomer,Small (250 ml),2 to 4,Price,...,Casual (eg. At home),100-150,56+,1,2,0.333333,2,0,0,1
3772,R03771,M,Metro,Student,Not_reported,5-7 times,Newcomer,Medium (500 ml),2 to 4,Availability,...,"Active (eg. Sports, gym)",150-200,56+,3,2,0.6,4,0,0,0
4033,R04032,M,Urban,Student,Not_reported,3-4 times,Established,Medium (500 ml),0 to 1,Price,...,Social (eg. Parties),150-200,56+,2,1,0.666667,3,0,0,0
6545,R06543,F,Urban,Student,Not_reported,5-7 times,Newcomer,Medium (500 ml),0 to 1,Price,...,Social (eg. Parties),150-200,56+,3,1,0.75,3,0,0,1
6594,R06592,M,Semi-Urban,Student,Not_reported,5-7 times,Established,Medium (500 ml),0 to 1,Availability,...,Casual (eg. At home),100-150,56+,3,1,0.75,2,0,0,0
6648,R06646,F,Semi-Urban,Student,Not_reported,0-2 times,Established,Medium (500 ml),2 to 4,Price,...,Casual (eg. At home),100-150,56+,1,2,0.333333,2,0,0,0
7420,R07418,F,Rural,Student,Not_reported,3-4 times,Established,Small (250 ml),0 to 1,Price,...,"Active (eg. Sports, gym)",100-150,56+,2,1,0.666667,1,0,0,0
7596,R07594,M,Urban,Student,Not_reported,5-7 times,Newcomer,Medium (500 ml),2 to 4,Price,...,"Active (eg. Sports, gym)",200-250,56+,3,2,0.6,3,0,0,1


In [50]:
df.shape

(29991, 24)

In [51]:
df = df.loc[~((df['age_group'] == "56+") & (df['occupation'] == 'Student')), :]

In [52]:
df.shape

(29956, 24)

In [53]:
df['zas_score'].nunique()

14

In [54]:
df.loc[df['bsi'] == 0, :].shape

(20796, 24)

### MODEL BUILDING

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29956 entries, 0 to 30009
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   respondent_id                   29956 non-null  object  
 1   gender                          29956 non-null  object  
 2   zone                            29956 non-null  object  
 3   occupation                      29956 non-null  object  
 4   income_levels                   29956 non-null  object  
 5   consume_frequency(weekly)       29956 non-null  object  
 6   current_brand                   29956 non-null  object  
 7   preferable_consumption_size     29956 non-null  object  
 8   awareness_of_other_brands       29956 non-null  object  
 9   reasons_for_choosing_brands     29956 non-null  object  
 10  flavor_preference               29956 non-null  object  
 11  purchase_channel                29956 non-null  object  
 12  packaging_preference   

#### Exclude these columns from model building process. 

In [56]:
columns_to_exclude = ['respondent_id','income_levels','awareness_of_other_brands','consume_frequency(weekly)','zone','price_range']

X = df.drop(columns=columns_to_exclude)
y = df['price_range']

In [57]:
from sklearn.model_selection import train_test_split

# train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,      # 25% test
    random_state=42,    
    stratify=y          
)

In [58]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22467 entries, 11016 to 11163
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   gender                          22467 non-null  object  
 1   occupation                      22467 non-null  object  
 2   current_brand                   22467 non-null  object  
 3   preferable_consumption_size     22467 non-null  object  
 4   reasons_for_choosing_brands     22467 non-null  object  
 5   flavor_preference               22467 non-null  object  
 6   purchase_channel                22467 non-null  object  
 7   packaging_preference            22467 non-null  object  
 8   health_concerns                 22467 non-null  object  
 9   typical_consumption_situations  22467 non-null  object  
 10  age_group                       22467 non-null  category
 11  cf_score                        22467 non-null  int64   
 12  ab_score           

#### Perform label-encoding & one-hot encoding on respective columns

In [59]:
label_encode_columns = ['preferable_consumption_size','health_concerns','age_group']

one_hot_encode_columns = ['typical_consumption_situations','gender','occupation','current_brand','reasons_for_choosing_brands',
                          'flavor_preference','purchase_channel']



In [60]:
df['zone_score'].value_counts()

zone_score
4    11901
3    10677
2     5265
1     2113
Name: count, dtype: int64

In [61]:
df['income_score'].value_counts()

income_score
0    8025
3    5897
2    5251
1    4661
4    3872
5    2250
Name: count, dtype: int64

In [62]:
df['age_group'].value_counts()

age_group
18–25    10468
26–35     9093
36–45     5972
46–55     2966
56+       1457
Name: count, dtype: int64

In [63]:
df['typical_consumption_situations'].value_counts()

typical_consumption_situations
Active (eg. Sports, gym)    11238
Casual (eg. At home)        10062
Social (eg. Parties)         8656
Name: count, dtype: int64

In [64]:
df['occupation'].value_counts()

occupation
Working Professional    15801
Student                  8025
Entrepreneur             5000
Retired                  1130
Name: count, dtype: int64

In [65]:
df['current_brand'].value_counts()

current_brand
Established    15444
Newcomer       14512
Name: count, dtype: int64

In [66]:
df['preferable_consumption_size'].value_counts()

preferable_consumption_size
Medium (500 ml)    13205
Small (250 ml)      9709
Large (1 L)         7042
Name: count, dtype: int64

In [67]:
df['reasons_for_choosing_brands'].value_counts()

reasons_for_choosing_brands
Price               14110
Availability         6583
Brand Reputation     4661
Quality              4602
Name: count, dtype: int64

In [68]:
df['flavor_preference'].value_counts()

flavor_preference
Traditional    15085
Exotic         14871
Name: count, dtype: int64

In [69]:
df['purchase_channel'].value_counts()

purchase_channel
Online          16561
Retail Store    13395
Name: count, dtype: int64

In [70]:
df['packaging_preference'].value_counts()

packaging_preference
Simple          14422
Premium         11692
Eco-Friendly     3842
Name: count, dtype: int64

In [71]:
df['health_concerns'].value_counts()

health_concerns
High (Very health-conscious)            11893
Medium (Moderately health-conscious)    10624
Low (Not very concerned)                 7439
Name: count, dtype: int64

In [72]:
manual_mappings = {
    'age_group': {
        '18–25': 0,
        '26–35': 1,
        '36–45': 2,
        '46–55': 3,
        '56+': 4
    },
    'preferable_consumption_size': {
        'Small (250 ml)': 0,
        'Medium (500 ml)': 1,
        'Large (1 L)': 2
    },
    'health_concerns': {
        "Low (Not very concerned)": 0,
        "Medium (Moderately health-conscious)": 1,
        "High (Very health-conscious)": 2
    }
}

In [73]:
for col in label_encode_columns:
    X_train[col] = X_train[col].map(manual_mappings[col])



In [74]:
for col in label_encode_columns:
    X_test[col] = X_test[col].map(manual_mappings[col])



In [75]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22467 entries, 11016 to 11163
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   gender                          22467 non-null  object  
 1   occupation                      22467 non-null  object  
 2   current_brand                   22467 non-null  object  
 3   preferable_consumption_size     22467 non-null  int64   
 4   reasons_for_choosing_brands     22467 non-null  object  
 5   flavor_preference               22467 non-null  object  
 6   purchase_channel                22467 non-null  object  
 7   packaging_preference            22467 non-null  object  
 8   health_concerns                 22467 non-null  int64   
 9   typical_consumption_situations  22467 non-null  object  
 10  age_group                       22467 non-null  category
 11  cf_score                        22467 non-null  int64   
 12  ab_score           

In [76]:
X_train['preferable_consumption_size'].head()

11016    0
25446    1
8271     0
18828    1
9399     0
Name: preferable_consumption_size, dtype: int64

In [77]:
X_train['age_group'] = X_train['age_group'].astype('int64')
X_test['age_group'] = X_test['age_group'].astype('int64')


In [78]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7489 entries, 28481 to 6945
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          7489 non-null   object 
 1   occupation                      7489 non-null   object 
 2   current_brand                   7489 non-null   object 
 3   preferable_consumption_size     7489 non-null   int64  
 4   reasons_for_choosing_brands     7489 non-null   object 
 5   flavor_preference               7489 non-null   object 
 6   purchase_channel                7489 non-null   object 
 7   packaging_preference            7489 non-null   object 
 8   health_concerns                 7489 non-null   int64  
 9   typical_consumption_situations  7489 non-null   object 
 10  age_group                       7489 non-null   int64  
 11  cf_score                        7489 non-null   int64  
 12  ab_score                        748

In [79]:
X_train = pd.get_dummies(
    X_train,
    columns=X_train.select_dtypes(include='object').columns,
    drop_first=True
)

X_test = pd.get_dummies(
    X_test,
    columns=X_test.select_dtypes(include='object').columns,
    drop_first=True
)

In [80]:
X_train.head()

Unnamed: 0,preferable_consumption_size,health_concerns,age_group,cf_score,ab_score,cf_ab_score,zone_score,income_score,zas_score,bsi,...,current_brand_Newcomer,reasons_for_choosing_brands_Brand Reputation,reasons_for_choosing_brands_Price,reasons_for_choosing_brands_Quality,flavor_preference_Traditional,purchase_channel_Retail Store,packaging_preference_Premium,packaging_preference_Simple,typical_consumption_situations_Casual (eg. At home),typical_consumption_situations_Social (eg. Parties)
11016,0,2,1,3,3,0.5,3,1,3,1,...,True,False,True,False,True,False,False,True,False,True
25446,1,1,3,1,3,0.25,4,3,12,0,...,True,False,False,False,False,True,False,False,False,True
8271,0,1,1,1,2,0.333333,2,3,6,0,...,False,True,False,False,True,True,True,False,True,False
18828,1,0,3,2,1,0.666667,1,1,1,0,...,False,False,True,False,True,False,True,False,True,False
9399,0,2,2,2,1,0.666667,4,4,16,0,...,False,False,False,True,False,False,False,True,False,True


In [81]:
y_train.value_counts()

price_range
200-250    7283
150-200    6598
100-150    5845
50-100     2741
Name: count, dtype: int64

In [82]:
y_test.value_counts()

price_range
200-250    2428
150-200    2199
100-150    1948
50-100      914
Name: count, dtype: int64

In [83]:
manual_mappings_target = {
    '50-100': 0,
    '100-150': 1,
    '150-200': 2,
    '200-250': 3,
}


In [84]:
y_train = y_train.map(manual_mappings_target).astype('int64')
y_test = y_test.map(manual_mappings_target).astype('int64')

In [85]:
y_train.value_counts()

price_range
3    7283
2    6598
1    5845
0    2741
Name: count, dtype: int64

In [86]:
y_test.value_counts()

price_range
3    2428
2    2199
1    1948
0     914
Name: count, dtype: int64

In [87]:
from sklearn.model_selection import RandomizedSearchCV

#### Gaussian NB

In [88]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

model_nb = GaussianNB()
model_nb.fit(X_train,y_train)

y_pred = model_nb.predict(X_test)

accuracy_nb = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data using NB:", accuracy_nb)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Data using NB: 0.5670984110028041

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.93      0.58       914
           1       0.48      0.26      0.33      1948
           2       0.57      0.33      0.42      2199
           3       0.69      0.90      0.78      2428

    accuracy                           0.57      7489
   macro avg       0.54      0.60      0.53      7489
weighted avg       0.57      0.57      0.53      7489



In [89]:
report_dict_gnb = classification_report(y_test, y_pred, output_dict=True)
report_dict_gnb

{'0': {'precision': 0.4175715695952616,
  'recall': 0.925601750547046,
  'f1-score': 0.5755102040816327,
  'support': 914.0},
 '1': {'precision': 0.47564469914040114,
  'recall': 0.25564681724845995,
  'f1-score': 0.3325542570951586,
  'support': 1948.0},
 '2': {'precision': 0.5707509881422925,
  'recall': 0.32833105957253295,
  'f1-score': 0.4168591224018476,
  'support': 2199.0},
 '3': {'precision': 0.6921612186607426,
  'recall': 0.8982701812191104,
  'f1-score': 0.7818605484853917,
  'support': 2428.0},
 'accuracy': 0.5670984110028041,
 'macro avg': {'precision': 0.5390321188846745,
  'recall': 0.6019624521467873,
  'f1-score': 0.5266960330160076,
  'support': 7489.0},
 'weighted avg': {'precision': 0.5666798171142683,
  'recall': 0.5670984110028041,
  'f1-score': 0.5326295421599913,
  'support': 7489.0}}

In [90]:
nb_params = model_nb.get_params()
nb_params

{'priors': None, 'var_smoothing': 1e-09}

In [91]:
import mlflow

  from google.protobuf import service as _service


#### Use below code to log models one-by-one

In [92]:
# mlflow.set_experiment("Beverage Price Predictor")
# mlflow.set_tracking_uri("http://127.0.0.1:5000")

# with mlflow.start_run(run_name='Gaussian NB'):
#     mlflow.log_params(nb_params)
#     mlflow.log_metrics(
#         {'accuracy' : report_dict_gnb['accuracy'],
#          'recall_class_0' : report_dict_gnb['0']['recall'],
#          'recall_class_1' : report_dict_gnb['1']['recall'],
#          'recall_class_2' : report_dict_gnb['2']['recall'],
#          'recall_class_3' : report_dict_gnb['3']['recall'],
#          'precision_class_0' : report_dict_gnb['0']['precision'],
#          'precision_class_1' : report_dict_gnb['1']['precision'],
#          'precision_class_2' : report_dict_gnb['2']['precision'],
#          'precision_class_3' : report_dict_gnb['3']['precision'],
#          'f1-score_macro' : report_dict_gnb['macro avg']['f1-score'],
#          'f1-score_weighted' : report_dict_gnb['weighted avg']['f1-score'],         
#         }
#     )
#     mlflow.sklearn.log_model(model_nb, "Gaussian Navive Bayes")

#### Logistic regression

In [93]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(max_iter=1000)  # increase max_iter to avoid convergence issues
model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)

accuracy_lr = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data using Logistic Regression:", accuracy_lr)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Data using Logistic Regression: 0.8353585258378956

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       914
           1       0.81      0.80      0.80      1948
           2       0.79      0.79      0.79      2199
           3       0.90      0.91      0.90      2428

    accuracy                           0.84      7489
   macro avg       0.83      0.83      0.83      7489
weighted avg       0.84      0.84      0.84      7489



In [94]:
report_dict_lr = classification_report(y_test, y_pred, output_dict=True)
# report_dict_lr

In [95]:
lr_params = model_lr.get_params()
lr_params

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 1000,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

#### SVC

In [96]:
from sklearn.svm import SVC

##### Standardize the features using StandardScaler for SVC

In [97]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [98]:
model_svc = SVC(probability=True)   # kernel params are 'linear', 'poly', 'rbf', 'sigmoid'
model_svc.fit(X_train_scaled, y_train)

# --------------------------------------------------
# 6. Predict on test data
# --------------------------------------------------
y_pred = model_svc.predict(X_test_scaled)

# --------------------------------------------------
# 7. Accuracy
# --------------------------------------------------
accuracy_svc = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data using SVC:", accuracy_svc)

# --------------------------------------------------
# 8. Classification Report
# --------------------------------------------------
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Data using SVC: 0.8782213913740152

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89       914
           1       0.86      0.86      0.86      1948
           2       0.83      0.85      0.84      2199
           3       0.93      0.92      0.93      2428

    accuracy                           0.88      7489
   macro avg       0.88      0.88      0.88      7489
weighted avg       0.88      0.88      0.88      7489



In [99]:
report_dict_svc = classification_report(y_test, y_pred, output_dict=True)
# report_dict_svc

In [100]:
svc_params = model_svc.get_params()
svc_params

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

#### Random Forest

In [101]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    # n_estimators=100,     # number of trees
    # max_depth=None,       # fully grown trees
    random_state=42,
    n_jobs=-1             # use all CPU cores
)

model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data using RandomForestClassifier:", accuracy_rf)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Data using RandomForestClassifier: 0.8967819468553879

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       914
           1       0.88      0.88      0.88      1948
           2       0.85      0.88      0.86      2199
           3       0.94      0.93      0.94      2428

    accuracy                           0.90      7489
   macro avg       0.90      0.90      0.90      7489
weighted avg       0.90      0.90      0.90      7489



In [102]:
report_dict_rf = classification_report(y_test, y_pred, output_dict=True)
# report_dict_rf

In [103]:
rf_params = model_rf.get_params()
rf_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

#### XGBoost

In [104]:
from xgboost import XGBClassifier

In [105]:
model_xgb = XGBClassifier(
    # n_estimators=200,
    # learning_rate=0.1,
    # max_depth=6,
    # subsample=0.8,
    # colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model_xgb.fit(X_train, y_train)

y_pred = model_xgb.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data using XGboost Classifier:", accuracy_xgb)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Data using XGboost Classifier: 0.927627186540259

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       914
           1       0.93      0.91      0.92      1948
           2       0.89      0.92      0.91      2199
           3       0.96      0.95      0.95      2428

    accuracy                           0.93      7489
   macro avg       0.93      0.93      0.93      7489
weighted avg       0.93      0.93      0.93      7489



In [106]:
report_dict_xgb = classification_report(y_test, y_pred, output_dict=True)
# report_dict_xgb

In [107]:
xgb_params = model_xgb.get_params()
xgb_params

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': -1,
 'num_parallel_tree': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

#### Light GBM

In [108]:
!pip install lightgbm




[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [109]:
import lightgbm as lgb

model_lgb = lgb.LGBMClassifier(
    # n_estimators=300,
    # learning_rate=0.05,
    # max_depth=-1,           # -1 means no limit
    # num_leaves=31,
    # subsample=0.8,
    # colsample_bytree=0.8,
    random_state=42
)

In [110]:
model_lgb.fit(X_train, y_train)

y_pred = model_lgb.predict(X_test)

accuracy_lgb = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data using Light GBM:", accuracy_lgb)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 22467, number of used features: 24
[LightGBM] [Info] Start training from score -2.103725
[LightGBM] [Info] Start training from score -1.346461
[LightGBM] [Info] Start training from score -1.225281
[LightGBM] [Info] Start training from score -1.126505
Accuracy on Test Data using Light GBM: 0.9285618907731339

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       914
           1       0.92      0.91      0.92      1948
           2       0.89      0.92      0.91      2199
           3       0.96      0.95      0.96      2428

    accuracy                           0.93      7489
   macro avg       0.93  

In [111]:
report_dict_lgb = classification_report(y_test, y_pred, output_dict=True)
# report_dict_lgb

In [112]:
lgb_params = model_lgb.get_params()
lgb_params

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

#### Single function to log the models using MLFlow

In [113]:
MODEL_NAMES = [
    "Gaussian NB",
    "Logistic Regression",
    "Random Forest",
    "SVC",
    "XGBoost",
    "LightGBM"
]

In [114]:
def log_model_to_mlflow(model_name, model, params, report_dict):

    model_id = MODEL_NAMES.index(model_name)

    with mlflow.start_run(run_name=model_name):

        # Human-readable tag (click run → see model name)
        mlflow.set_tag("model_name", model_name)

        # Numeric model ID (shown on charts)
        mlflow.log_param("model_name_id", model_id)

        # Log hyperparameters
        mlflow.log_params(params)

        # Log metrics
        metrics = {
            'accuracy': report_dict['accuracy'],
            'f1_macro': report_dict['macro avg']['f1-score'],
            'f1_weighted': report_dict['weighted avg']['f1-score'],
        }

        for cls in ['0', '1', '2', '3']:
            metrics[f'precision_class_{cls}'] = report_dict[cls]['precision']
            metrics[f'recall_class_{cls}'] = report_dict[cls]['recall']

        mlflow.log_metrics(metrics)

        # Save model
        mlflow.sklearn.log_model(model, artifact_path=model_name)


In [115]:
# Set these ONCE at the top
mlflow.set_experiment("Beverage Price Predictor")
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# --- Gaussian NB ---
log_model_to_mlflow("Gaussian NB", model_nb, nb_params, report_dict_gnb)

# --- Logistic Regression ---
log_model_to_mlflow("Logistic Regression", model_lr, lr_params, report_dict_lr)

# --- Random Forest ---
log_model_to_mlflow("Random Forest", model_rf, rf_params, report_dict_rf)

# --- SVC ---
log_model_to_mlflow("SVC", model_svc, svc_params, report_dict_svc)




2025/11/26 20:40:27 INFO mlflow.tracking.fluent: Experiment with name 'Beverage Price Predictor' does not exist. Creating a new experiment.


In [116]:
model_name = "XGBoost"
model_name_id = MODEL_NAMES.index(model_name)

with mlflow.start_run(run_name=model_name):

    # Human-readable model name (shows in Run Table)
    mlflow.set_tag("model_name", model_name)

    # Numeric ID used in Parallel Coordinates Plot
    mlflow.log_param("model_name_id", model_name_id)

    # Log hyperparameters
    mlflow.log_params(xgb_params)

    # Log metrics
    metrics = {
        'accuracy': report_dict_xgb['accuracy'],
        'f1_macro': report_dict_xgb['macro avg']['f1-score'],
        'f1_weighted': report_dict_xgb['weighted avg']['f1-score'],
    }

    # Add per-class precision & recall automatically
    for cls in ['0', '1', '2', '3']:
        metrics[f'precision_class_{cls}'] = report_dict_xgb[cls]['precision']
        metrics[f'recall_class_{cls}'] = report_dict_xgb[cls]['recall']

    mlflow.log_metrics(metrics)

    # Log XGBoost model
    mlflow.xgboost.log_model(model_xgb, artifact_path=model_name)


  self.get_booster().save_model(fname)


In [117]:
model_name = "LightGBM"
model_name_id = MODEL_NAMES.index(model_name)

with mlflow.start_run(run_name=model_name):

    # Human-readable model name tag
    mlflow.set_tag("model_name", model_name)

    # Numeric model name ID (used in MLflow visualizations)
    mlflow.log_param("model_name_id", model_name_id)

    # Log hyperparameters
    mlflow.log_params(lgb_params)

    # Log metrics
    metrics = {
        'accuracy': report_dict_lgb['accuracy'],
        'f1_macro': report_dict_lgb['macro avg']['f1-score'],
        'f1_weighted': report_dict_lgb['weighted avg']['f1-score'],
    }

    # Add per-class precision & recall
    for cls in ['0', '1', '2', '3']:
        metrics[f'precision_class_{cls}'] = report_dict_lgb[cls]['precision']
        metrics[f'recall_class_{cls}'] = report_dict_lgb[cls]['recall']

    mlflow.log_metrics(metrics)

    # Log LightGBM model
    mlflow.lightgbm.log_model(model_lgb, artifact_path=model_name)


#### Register the models

In [118]:
model_name = "XGBoost_Classifier"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)



Enter Run ID: 8a3cd7152c5148b0839cf303e4aba888


Successfully registered model 'XGBoost_Classifier'.
2025/11/26 20:45:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost_Classifier, version 1
Created version '1' of model 'XGBoost_Classifier'.


In [119]:
model_name = "Light_GBM_Classifier"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)



Enter Run ID: f32289d3ec8242678211bc75996dd8a7


Successfully registered model 'Light_GBM_Classifier'.
2025/11/26 20:48:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Light_GBM_Classifier, version 1
Created version '1' of model 'Light_GBM_Classifier'.


In [120]:
model_name = "Random_Forest_Classifier"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)



Enter Run ID: 6c3f7e5a124e48adb57c7fc0998aaff8


Successfully registered model 'Random_Forest_Classifier'.
2025/11/26 20:49:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random_Forest_Classifier, version 1
Created version '1' of model 'Random_Forest_Classifier'.


In [121]:
model_name = "Support_Vector_Classifier"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)



Enter Run ID: 2d44404182ce4f30bc824a009d64accc


Successfully registered model 'Support_Vector_Classifier'.
2025/11/26 20:50:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support_Vector_Classifier, version 1
Created version '1' of model 'Support_Vector_Classifier'.


In [122]:
model_name = "Gaussian_Naive_Bayes_Classifier"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)



Enter Run ID: d9755f7db78540b487885598a7ab5000


Successfully registered model 'Gaussian_Naive_Bayes_Classifier'.
2025/11/26 20:51:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Gaussian_Naive_Bayes_Classifier, version 1
Created version '1' of model 'Gaussian_Naive_Bayes_Classifier'.


In [123]:
model_name = "Logistic_Regression_Classifier"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/{model_name}"

result = mlflow.register_model(
    model_uri, model_name
)



Enter Run ID: 964cf6439c79460881f12e8a8ee91e02


Successfully registered model 'Logistic_Regression_Classifier'.
2025/11/26 20:52:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic_Regression_Classifier, version 1
Created version '1' of model 'Logistic_Regression_Classifier'.


#### Load the champion model viz., LightGBM in this case

In [124]:
model_uri = 'runs:/f32289d3ec8242678211bc75996dd8a7/LightGBM'
loaded_model = mlflow.lightgbm.load_model(model_uri=model_uri)

y_pred = loaded_model.predict(X_test)
y_pred[:4]

array([0, 1, 3, 1], dtype=int64)

#### Transition from development env to production env

In [127]:
client = mlflow.MlflowClient()
client.search_registered_models()

[<RegisteredModel: aliases={'naivebayes': '1'}, creation_timestamp=1764170482288, description=('This model was trained to predict the beverage price as part of CB '
  'Internship 1.'), last_updated_timestamp=1764170504060, latest_versions=[<ModelVersion: aliases=['naivebayes'], creation_timestamp=1764170482355, current_stage='None', description='', last_updated_timestamp=1764170482355, name='Gaussian_Naive_Bayes_Classifier', run_id='d9755f7db78540b487885598a7ab5000', run_link='', source='file:///D:/Arjun/CB_VI_1/Wk_3/project_resources1/mlruns/998689494779775513/d9755f7db78540b487885598a7ab5000/artifacts/Gaussian_Naive_Bayes_Classifier', status='READY', status_message='', tags={}, user_id='', version='1'>], name='Gaussian_Naive_Bayes_Classifier', tags={}>,
 <RegisteredModel: aliases={'champion': '1', 'lightgbm': '1'}, creation_timestamp=1764170326522, description=('This model was trained to predict the beverage price as part of CB '
  'Internship 1.'), last_updated_timestamp=17641706712

In [128]:
development_model_uri = "models:/Light_GBM_Classifier@champion"
production_model_uri = 'beverage_price_predictor'

client.copy_model_version(src_model_uri=development_model_uri,dst_name=production_model_uri)

<ModelVersion: aliases=[], creation_timestamp=1764171538848, current_stage='None', description='', last_updated_timestamp=1764171538848, name='beverage_price_predictor', run_id='f32289d3ec8242678211bc75996dd8a7', run_link='', source='models:/Light_GBM_Classifier/1', status='READY', status_message='', tags={}, user_id='', version='1'>

#### Load production model & make prediction

In [131]:
# Load the production model from the MLflow Model Registry
model_uri = "runs:/f32289d3ec8242678211bc75996dd8a7/LightGBM"
loaded_model = mlflow.lightgbm.load_model(model_uri)

# Make predictions locally
y_pred = loaded_model.predict(X_test)
y_pred[:4]

array([0, 1, 3, 1], dtype=int64)