In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
# Load the tips dataset from a CSV in the working directory.
# Ending the cell on the bare expression lets the notebook render the preview
# (first five rows) as a rich table.
df = pd.read_csv(filepath_or_buffer='tips.csv')
df.head(n=5)

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560330000000000.0,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478070000000000.0,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011810000000000.0,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676140000000000.0,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832730000000000.0,Sun2251


In [3]:
# Preview the raw categorical 'day' column before any encoding is applied.
day_preview = df['day'].head()
print("Original 'day' column: \n", day_preview)

Original 'day' column: 
 0    Sun
1    Sun
2    Sun
3    Sun
4    Sun
Name: day, dtype: object


In [4]:
# 1. One-Hot Encoding (OHE): one 0/1 indicator column per distinct category.
#
# drop=None            -> keep an indicator column for every category.
# sparse_output=False  -> return a dense NumPy array (zeros are stored
#                         explicitly) instead of a SciPy sparse matrix, which
#                         would store only the non-zero entries.
ohe = OneHotEncoder(sparse_output=False, drop=None)
# The encoder works on 2-D input, hence the double brackets: df[['day']] is a
# one-column DataFrame rather than a Series.
ohe_array = ohe.fit_transform(df[['day']])

In [5]:
# Ask the fitted encoder for the names of the indicator columns it produced.
ohe_cols = ohe.get_feature_names_out(['day'])

# Wrap the encoded array in a frame that reuses df's index so the join lines up
# row-for-row, then attach it to the original columns as a new DataFrame
# (df itself is left untouched).
ohe_frame = pd.DataFrame(ohe_array, index=df.index, columns=ohe_cols)
df_ohe = df.join(ohe_frame)

# Show the first and last rows of just the encoded columns.
ohe_view = df_ohe[ohe_cols]
print("\nOne-Hot Encoded 'day': \n", ohe_view.head())
print("\nOne-Hot Encoded 'day': \n", ohe_view.tail())


One-Hot Encoded 'day': 
    day_Fri  day_Sat  day_Sun  day_Thur
0      0.0      0.0      1.0       0.0
1      0.0      0.0      1.0       0.0
2      0.0      0.0      1.0       0.0
3      0.0      0.0      1.0       0.0
4      0.0      0.0      1.0       0.0

One-Hot Encoded 'day': 
      day_Fri  day_Sat  day_Sun  day_Thur
239      0.0      1.0      0.0       0.0
240      0.0      1.0      0.0       0.0
241      0.0      1.0      0.0       0.0
242      0.0      1.0      0.0       0.0
243      0.0      0.0      0.0       1.0


In [6]:
# Drop one redundant category.
# With drop='first' the encoder omits the indicator column of the first
# category; the remaining columns still identify it (a row of all zeros).
ohe_drop = OneHotEncoder(drop='first', sparse_output=False)
ohe_array2 = ohe_drop.fit_transform(df[['day']])
ohe_cols2 = ohe_drop.get_feature_names_out(['day'])

# Build the encoded frame on df's index and join it onto the original columns.
ohe_frame2 = pd.DataFrame(ohe_array2, index=df.index, columns=ohe_cols2)
df_ohe2 = df.join(ohe_frame2)

# Show the first and last rows of the reduced set of indicator columns.
ohe_view2 = df_ohe2[ohe_cols2]
print("\nOne-Hot Encoded 'day': \n", ohe_view2.head())
print("\nOne-Hot Encoded 'day': \n", ohe_view2.tail())


One-Hot Encoded 'day': 
    day_Sat  day_Sun  day_Thur
0      0.0      1.0       0.0
1      0.0      1.0       0.0
2      0.0      1.0       0.0
3      0.0      1.0       0.0
4      0.0      1.0       0.0

One-Hot Encoded 'day': 
      day_Sat  day_Sun  day_Thur
239      1.0      0.0       0.0
240      1.0      0.0       0.0
241      1.0      0.0       0.0
242      1.0      0.0       0.0
243      0.0      0.0       1.0


In [7]:
#2. Label Encoding

label_encoder = LabelEncoder()
df['day_label'] = label_encoder.fit_transform(df['day'])
print("\nLabel Encoded 'day':\n", df[['day', 'day_label']].head())
print("\nLabel Encoded 'day':\n", df[['day', 'day_label']].tail())


Label Encoded 'day':
    day  day_label
0  Sun          2
1  Sun          2
2  Sun          2
3  Sun          2
4  Sun          2

Label Encoded 'day':
       day  day_label
239   Sat          1
240   Sat          1
241   Sat          1
242   Sat          1
243  Thur          3


In [8]:
#3. Frequency Encoding
day_freq = df['day'].value_counts(normalize=False) # normalize = True means use % instead
df['day_freq'] = df['day'].map(day_freq)
print("\nFrequency Encoded 'day':\n", df[['day','day_freq']].head())
print("\nFrequency Encoded 'day':\n", df[['day','day_freq']].tail())


Frequency Encoded 'day':
    day  day_freq
0  Sun        76
1  Sun        76
2  Sun        76
3  Sun        76
4  Sun        76

Frequency Encoded 'day':
       day  day_freq
239   Sat        87
240   Sat        87
241   Sat        87
242   Sat        87
243  Thur        62
