In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/content/in-vehicle-coupon-recommendation.csv")

In [3]:
# TO UNDERSTAND THE DATA

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

In [5]:
# FOR MISSING VALUES
print(df.isnull().sum())

destination                 0
passanger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64


In [7]:
# CATEGORICAL COLUMNS
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_columns)
for col in categorical_columns:                    # UNIQUE VALUES FOR EACH CATEGORICAL COLUMNS
    print(f"{col}: {df[col].nunique()}")


Categorical Columns: Index(['destination', 'passanger', 'weather', 'time', 'coupon', 'expiration',
       'gender', 'age', 'maritalStatus', 'education', 'occupation', 'income',
       'car', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20',
       'Restaurant20To50'],
      dtype='object')
destination: 3
passanger: 4
weather: 3
time: 5
coupon: 5
expiration: 2
gender: 2
age: 8
maritalStatus: 5
education: 6
occupation: 25
income: 9
car: 5
Bar: 5
CoffeeHouse: 5
CarryAway: 5
RestaurantLessThan20: 5
Restaurant20To50: 5


In [8]:
missing_percentage = df.isnull().sum() / len(df) * 100
print(missing_percentage.sort_values(ascending=False))
# CHECKING MISSING VALUES PERCENTAGE

car                     99.148534
CoffeeHouse              1.710817
Restaurant20To50         1.490066
CarryAway                1.190476
RestaurantLessThan20     1.024913
Bar                      0.843582
weather                  0.000000
temperature              0.000000
destination              0.000000
passanger                0.000000
maritalStatus            0.000000
age                      0.000000
gender                   0.000000
expiration               0.000000
coupon                   0.000000
time                     0.000000
income                   0.000000
occupation               0.000000
has_children             0.000000
education                0.000000
toCoupon_GEQ5min         0.000000
toCoupon_GEQ15min        0.000000
toCoupon_GEQ25min        0.000000
direction_same           0.000000
direction_opp            0.000000
Y                        0.000000
dtype: float64


In [9]:
# AS CAR COLUMN HAS MANY MISSING VALUES (99%), IT'S BETTER TO REMOVE THE COLUMN

In [14]:
if 'car' in df.columns:
    df.drop(columns=["car"], inplace=True)

In [16]:
df.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [18]:
# MODE IMPUTATION (FILLING MISSING VALUES WITH MODE)
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [11]:
# WE USE LABEL ENCODING FOR ORDINAL DATA AND OHE(ONE HOT ENCODING) ON NOMINAL DATA

In [19]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Apply Label Encoding to all categorical columns
df_label_encoded = df.copy()
for col in categorical_columns:
    df_label_encoded[col] = label_encoder.fit_transform(df_label_encoded[col])

df_label_encoded.head()


Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,1,0,2,55,2,4,0,0,0,3,...,4,0,1,0,1,0,0,0,1,1
1,1,1,2,80,0,2,1,0,0,3,...,4,0,1,0,1,0,0,0,1,0
2,1,1,2,80,0,1,1,0,0,3,...,4,0,1,0,1,1,0,0,1,1
3,1,1,2,80,2,2,1,0,0,3,...,4,0,1,0,1,1,0,0,1,0
4,1,1,2,80,2,2,0,0,0,3,...,4,0,1,0,1,1,0,0,1,0


In [20]:
df_one_hot_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

df_one_hot_encoded.head()


Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_No Urgent Place,destination_Work,...,CarryAway_less1,CarryAway_never,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never
0,55,1,1,0,0,0,1,1,True,False,...,False,False,True,False,False,False,False,False,False,False
1,80,1,1,0,0,0,1,0,True,False,...,False,False,True,False,False,False,False,False,False,False
2,80,1,1,1,0,0,1,1,True,False,...,False,False,True,False,False,False,False,False,False,False
3,80,1,1,1,0,0,1,0,True,False,...,False,False,True,False,False,False,False,False,False,False
4,80,1,1,1,0,0,1,0,True,False,...,False,False,True,False,False,False,False,False,False,False


In [21]:
from sklearn.model_selection import train_test_split

target = "Y"

# SEPARATE FETURES AND TARGET COLUMNS
X_label = df_label_encoded.drop(target, axis=1)
y_label = df_label_encoded[target]

X_one_hot = df_one_hot_encoded.drop(target, axis=1)
y_one_hot = df_one_hot_encoded[target]

X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(X_label, y_label, test_size=0.2, random_state=42)
X_train_one_hot, X_test_one_hot, y_train_one_hot, y_test_one_hot = train_test_split(X_one_hot, y_one_hot, test_size=0.2, random_state=42)


In [22]:
# ANALYSING MODEL PERFOMANCES
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#MODEL ON LABEL ENCODED DATA
model_label = RandomForestClassifier(random_state=42)
model_label.fit(X_train_label, y_train_label)
y_pred_label = model_label.predict(X_test_label)
acc_label = accuracy_score(y_test_label, y_pred_label)
print("Accuracy with Label Encoding:", acc_label)


Accuracy with Label Encoding: 0.7445802128498227


In [23]:

model_one_hot = RandomForestClassifier(random_state=42)
model_one_hot.fit(X_train_one_hot, y_train_one_hot)

#MODEL ON ONE HOT ENCODED DATA
y_pred_one_hot = model_one_hot.predict(X_test_one_hot)
acc_one_hot = accuracy_score(y_test_one_hot, y_pred_one_hot)
print("Accuracy with One-Hot Encoding:", acc_one_hot)


Accuracy with One-Hot Encoding: 0.7469452108789909


In [24]:
print(f"Model Accuracy Comparison:")
print(f"🔹 Label Encoding: {acc_label:.4f}")
print(f"🔹 One-Hot Encoding: {acc_one_hot:.4f}")


Model Accuracy Comparison:
🔹 Label Encoding: 0.7446
🔹 One-Hot Encoding: 0.7469


In [26]:
# ONE HOT ENCODED DATA IS SLIGHTLY WORKING BETTER ==> CATEGORICAL DATA HAS NO/LESS ORDINAL RELATIONSHIP AND SHOULD BE ENCODED AS INDEPENDENT FEATURES