# Importing Libraries

In [229]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Importing Dataset

In [230]:
# Read the previously cleaned coupon dataset from disk and preview the first rows.
csv_path = 'evenmore-cleaned-in-vehicle-coupon-recommendation.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,...,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,Y
0,0,No Urgent Place,Alone,Sunny,0.375,2PM,Restaurant(<20),1d,Female,21,...,$37500 - $49999,never,never,1~3,4~8,1~3,0,0,0,1
1,1,No Urgent Place,Friend(s),Sunny,0.625,10AM,Coffee House,2h,Female,21,...,$37500 - $49999,never,never,1~3,4~8,1~3,0,0,0,0
2,2,No Urgent Place,Friend(s),Sunny,0.625,10AM,Carry out & Take away,2h,Female,21,...,$37500 - $49999,never,never,1~3,4~8,1~3,1,0,0,1
3,3,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,2h,Female,21,...,$37500 - $49999,never,never,1~3,4~8,1~3,1,0,0,0
4,4,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,1d,Female,21,...,$37500 - $49999,never,never,1~3,4~8,1~3,1,0,0,0


In [231]:
# Remove the leftover 'Unnamed: 0' column (raises KeyError if it is absent),
# then summarise the remaining columns and their dtypes.
data = data.drop(columns=['Unnamed: 0'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12610 entries, 0 to 12609
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   destination           12610 non-null  object 
 1   passanger             12610 non-null  object 
 2   weather               12610 non-null  object 
 3   temperature           12610 non-null  float64
 4   time                  12610 non-null  object 
 5   coupon                12610 non-null  object 
 6   expiration            12610 non-null  object 
 7   gender                12610 non-null  object 
 8   age                   12610 non-null  object 
 9   maritalStatus         12610 non-null  object 
 10  has_children          12610 non-null  int64  
 11  education             12610 non-null  object 
 12  occupation            12610 non-null  object 
 13  income                12610 non-null  object 
 14  Bar                   12610 non-null  object 
 15  CoffeeHouse        

#### Printing each column's name, number of unique values, and the unique values themselves, to analyse the categorical data

In [232]:
# Report, for every column: its name, how many distinct values it holds,
# and the distinct values themselves (one blank line between columns).
for name, series in data.items():
    print(f"{name} : {series.nunique()} : {series.unique()}\n")

destination : 3 : ['No Urgent Place' 'Home' 'Work']

passanger : 4 : ['Alone' 'Friend(s)' 'Kid(s)' 'Partner']

weather : 3 : ['Sunny' 'Rainy' 'Snowy']

temperature : 3 : [0.375 0.625 0.125]

time : 5 : ['2PM' '10AM' '6PM' '7AM' '10PM']

coupon : 5 : ['Restaurant(<20)' 'Coffee House' 'Carry out & Take away' 'Bar'
 'Restaurant(20-50)']

expiration : 2 : ['1d' '2h']

gender : 2 : ['Female' 'Male']

age : 8 : ['21' '46' '26' '31' '41' '50plus' '36' 'below21']

maritalStatus : 5 : ['Unmarried partner' 'Single' 'Married partner' 'Divorced' 'Widowed']

has_children : 2 : [1 0]

education : 6 : ['Some college - no degree' 'Bachelors degree' 'Associates degree'
 'High School Graduate' 'Graduate degree (Masters or Doctorate)'
 'Some High School']

occupation : 25 : ['Unemployed' 'Architecture & Engineering' 'Student'
 'Education&Training&Library' 'Healthcare Support'
 'Healthcare Practitioners & Technical' 'Sales & Related' 'Management'
 'Arts Design Entertainment Sports & Media' 'Computer & Mat

#### Encoded Columns with 2 unique values with LabelEncoder

In [233]:
# Label-encode every column that holds exactly two distinct values.
# The single encoder is refit per column, so afterwards `le` only remembers
# the classes of the last column it processed.
le = LabelEncoder()

binary_columns = [name for name in data.columns if data[name].nunique() == 2]
for name in binary_columns:
    data[name] = le.fit_transform(data[name])

In [234]:
# Preview the frame after label-encoding the two-valued columns.
data.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,Y
0,No Urgent Place,Alone,Sunny,0.375,2PM,Restaurant(<20),0,0,21,Unmarried partner,...,$37500 - $49999,never,never,1~3,4~8,1~3,0,0,0,1
1,No Urgent Place,Friend(s),Sunny,0.625,10AM,Coffee House,1,0,21,Unmarried partner,...,$37500 - $49999,never,never,1~3,4~8,1~3,0,0,0,0
2,No Urgent Place,Friend(s),Sunny,0.625,10AM,Carry out & Take away,1,0,21,Unmarried partner,...,$37500 - $49999,never,never,1~3,4~8,1~3,1,0,0,1
3,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,1,0,21,Unmarried partner,...,$37500 - $49999,never,never,1~3,4~8,1~3,1,0,0,0
4,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,0,0,21,Unmarried partner,...,$37500 - $49999,never,never,1~3,4~8,1~3,1,0,0,0


#### The columns 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20' and 'Restaurant20To50' hold ordered values, so I mapped each category to an integer that preserves that order

In [235]:
# Ordered categories, lowest to highest; each is encoded as its position in this list.
order = ['never', 'less1', '1~3', '4~8', 'gt8']
mapping = {category: i for i, category in enumerate(order)}

for column in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
    # Normalise case and surrounding whitespace before looking up the code.
    normalized = data[column].astype(str).str.lower().str.strip()
    codes = normalized.map(mapping)

    # .map() yields NaN for anything not in `mapping` (including missing values,
    # which astype(str) turns into 'nan'). Fail with the offending values named
    # instead of letting astype(int) raise an opaque NaN-cast error, and leave
    # the column untouched when validation fails.
    unmapped = normalized[codes.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values outside {order}: {list(unmapped)}"
        )

    data[column] = codes.astype(int)

In [236]:
# Re-check dtypes after mapping Bar, CoffeeHouse, CarryAway, RestaurantLessThan20 and Restaurant20To50 to integers.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12610 entries, 0 to 12609
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   destination           12610 non-null  object 
 1   passanger             12610 non-null  object 
 2   weather               12610 non-null  object 
 3   temperature           12610 non-null  float64
 4   time                  12610 non-null  object 
 5   coupon                12610 non-null  object 
 6   expiration            12610 non-null  int64  
 7   gender                12610 non-null  int64  
 8   age                   12610 non-null  object 
 9   maritalStatus         12610 non-null  object 
 10  has_children          12610 non-null  int64  
 11  education             12610 non-null  object 
 12  occupation            12610 non-null  object 
 13  income                12610 non-null  object 
 14  Bar                   12610 non-null  int64  
 15  CoffeeHouse        

In [237]:
# Preview the frame with the mapped frequency columns now shown as integer codes.
data.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,Y
0,No Urgent Place,Alone,Sunny,0.375,2PM,Restaurant(<20),0,0,21,Unmarried partner,...,$37500 - $49999,0,0,2,3,2,0,0,0,1
1,No Urgent Place,Friend(s),Sunny,0.625,10AM,Coffee House,1,0,21,Unmarried partner,...,$37500 - $49999,0,0,2,3,2,0,0,0,0
2,No Urgent Place,Friend(s),Sunny,0.625,10AM,Carry out & Take away,1,0,21,Unmarried partner,...,$37500 - $49999,0,0,2,3,2,1,0,0,1
3,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,1,0,21,Unmarried partner,...,$37500 - $49999,0,0,2,3,2,1,0,0,0
4,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,0,0,21,Unmarried partner,...,$37500 - $49999,0,0,2,3,2,1,0,0,0


#### Used LabelEncoder for encoding Ordinal Columns

In [238]:
# Replace each listed column with integer codes from its own LabelEncoder.
# NOTE(review): LabelEncoder numbers classes in sorted label order, not by rank,
# so the codes need not follow the natural ordering of education or income
# levels, and 'occupation' has no evident order at all. Consider an explicit
# ordered mapping if the model should see these as ranked.
ordinal_cols = ['education', 'occupation', 'income']
for col in ordinal_cols:
    le = LabelEncoder().fit(data[col])
    data[col] = le.transform(data[col])

In [239]:
# Preview the frame after label-encoding education, occupation and income.
data.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,Y
0,No Urgent Place,Alone,Sunny,0.375,2PM,Restaurant(<20),0,0,21,Unmarried partner,...,3,0,0,2,3,2,0,0,0,1
1,No Urgent Place,Friend(s),Sunny,0.625,10AM,Coffee House,1,0,21,Unmarried partner,...,3,0,0,2,3,2,0,0,0,0
2,No Urgent Place,Friend(s),Sunny,0.625,10AM,Carry out & Take away,1,0,21,Unmarried partner,...,3,0,0,2,3,2,1,0,0,1
3,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,1,0,21,Unmarried partner,...,3,0,0,2,3,2,1,0,0,0
4,No Urgent Place,Friend(s),Sunny,0.625,2PM,Coffee House,0,0,21,Unmarried partner,...,3,0,0,2,3,2,1,0,0,0


#### Used OneHotEncoder for encoding Nominal Columns

In [240]:
# One-hot encode the categorical columns listed below and replace them in
# `data` with the resulting indicator columns.
nominal_cols = ['destination', 'passanger', 'weather', 'time', 'coupon', 'age',  'maritalStatus']

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(data[nominal_cols])

# Carry over data's index: pd.concat(axis=1) pairs rows by index label, so a
# fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever data's
# index is not the default range, e.g. after filtering rows.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe.get_feature_names_out(nominal_cols),
    index=data.index,
)

data = data.drop(nominal_cols, axis=1)
data = pd.concat([data, encoded_df], axis=1)

In [241]:
# Preview the frame after the nominal columns were replaced by one-hot indicator columns.
data.head()

Unnamed: 0,temperature,expiration,gender,has_children,education,occupation,income,Bar,CoffeeHouse,CarryAway,...,age_36,age_41,age_46,age_50plus,age_below21,maritalStatus_Divorced,maritalStatus_Married partner,maritalStatus_Single,maritalStatus_Unmarried partner,maritalStatus_Widowed
0,0.375,0,0,1,5,24,3,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.625,1,0,1,5,24,3,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.625,1,0,1,5,24,3,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.625,1,0,1,5,24,3,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.625,0,0,1,5,24,3,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [242]:
# Inspect column count, dtypes and non-null counts of the encoded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12610 entries, 0 to 12609
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   temperature                      12610 non-null  float64
 1   expiration                       12610 non-null  int64  
 2   gender                           12610 non-null  int64  
 3   has_children                     12610 non-null  int64  
 4   education                        12610 non-null  int64  
 5   occupation                       12610 non-null  int64  
 6   income                           12610 non-null  int64  
 7   Bar                              12610 non-null  int64  
 8   CoffeeHouse                      12610 non-null  int64  
 9   CarryAway                        12610 non-null  int64  
 10  RestaurantLessThan20             12610 non-null  int64  
 11  Restaurant20To50                 12610 non-null  int64  
 12  toCoupon_GEQ15min 

#### The data is now cleaned and all the columns are useful for model training; the columns are encoded using LabelEncoder and OneHotEncoder

In [243]:
# Save the fully encoded frame. index=False keeps the row index out of the
# file; written by default, it is read back by pd.read_csv as a spurious
# 'Unnamed: 0' column that then has to be dropped by hand.
# NOTE(review): any downstream code that drops 'Unnamed: 0' from this file
# will no longer find that column.
data.to_csv('most-cleaned-in-vehicle-coupon-recommendation.csv', index=False)