In [1]:
import pandas as pd
import numpy as np

TRANSFORMING NOMINAL FEATURES

In [4]:
dataset = pd.read_csv("CO22339_netflix_users.csv")
dataset[['Name', 'Country', 'Subscription_Type',
         'Watch_Time_Hours', 'Favorite_Genre']].iloc[1:7]

Unnamed: 0,Name,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre
1,John Miller,USA,Premium,321.75,Sci-Fi
2,Emma Davis,UK,Basic,35.89,Comedy
3,Emma Miller,USA,Premium,261.56,Documentary
4,Jane Smith,USA,Standard,909.3,Drama
5,David Johnson,USA,Standard,615.93,Romance
6,John Hernandez,Canada,Standard,755.47,Romance


In [21]:
dataset.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre,Last_Login,GenreLabel,SubscriptionLabel
0,21493,Michael Davis,61,UK,Basic,65.31,Documentary,2025-02-15,2,0
1,9489,James Brown,33,Canada,Basic,323.63,Horror,2025-01-01,4,0
2,16934,Alex Johnson,57,Australia,Premium,343.03,Horror,2024-10-12,4,1
3,12605,Chris Johnson,30,Mexico,Premium,667.99,Romance,2024-12-25,5,1
4,8223,James Garcia,42,Canada,Standard,449.88,Romance,2024-03-15,5,2


In [5]:
genres = np.unique(dataset['Favorite_Genre'])
genres

array(['Action', 'Comedy', 'Documentary', 'Drama', 'Horror', 'Romance',
       'Sci-Fi'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder

gle = LabelEncoder()
genre_labels = gle.fit_transform(dataset['Favorite_Genre'])
genre_mappings = {index: label for index, label in enumerate(gle.classes_)}
genre_mappings

{0: 'Action',
 1: 'Comedy',
 2: 'Documentary',
 3: 'Drama',
 4: 'Horror',
 5: 'Romance',
 6: 'Sci-Fi'}

In [8]:
dataset['GenreLabel'] = genre_labels
dataset[['Name', 'Country', 'Subscription_Type',
         'Favorite_Genre', 'GenreLabel']].iloc[1:7]

Unnamed: 0,Name,Country,Subscription_Type,Favorite_Genre,GenreLabel
1,John Miller,USA,Premium,Sci-Fi,6
2,Emma Davis,UK,Basic,Comedy,1
3,Emma Miller,USA,Premium,Documentary,2
4,Jane Smith,USA,Standard,Drama,3
5,David Johnson,USA,Standard,Romance,5
6,John Hernandez,Canada,Standard,Romance,5


TRANSFORMING ORDINAL FEATURES

In [11]:
dataset = dataset.sample(random_state=1, frac=1).reset_index(drop=True)

np.unique(dataset['Subscription_Type'])

array(['Basic', 'Premium', 'Standard'], dtype=object)

In [12]:
gen = {'Basic': 1, 'Premium': 2, 'Standard': 3}

dataset['SubscriptionLabel'] = dataset['Subscription_Type'].map(gen)
dataset[['Name', 'Subscription_Type', 'SubscriptionLabel']].iloc[4:10]

Unnamed: 0,Name,Subscription_Type,SubscriptionLabel
4,James Garcia,Standard,3
5,Michael Martinez,Basic,1
6,Emma Hernandez,Basic,1
7,Michael Garcia,Premium,2
8,Michael Williams,Standard,3
9,Emma Miller,Premium,2


ENCODING CATEGORICAL FEATURES


One-Hot Encoding Scheme

In [14]:
dataset[['Name', 'Favorite_Genre', 'Subscription_Type']].iloc[4:10]

Unnamed: 0,Name,Favorite_Genre,Subscription_Type
4,James Garcia,Romance,Standard
5,Michael Martinez,Action,Basic
6,Emma Hernandez,Romance,Basic
7,Michael Garcia,Action,Premium
8,Michael Williams,Horror,Standard
9,Emma Miller,Horror,Premium


In [18]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(dataset['Favorite_Genre'])
dataset['GenreLabel'] = gen_labels

sub_le = LabelEncoder()
sub_labels = sub_le.fit_transform(dataset['Subscription_Type'])
dataset['SubscriptionLabel'] = sub_labels

dataset_sub = dataset[['Name', 'Favorite_Genre', 'GenreLabel', 'Subscription_Type', 'SubscriptionLabel']]
dataset_sub.iloc[7:13]

Unnamed: 0,Name,Favorite_Genre,GenreLabel,Subscription_Type,SubscriptionLabel
7,Michael Garcia,Action,0,Premium,1
8,Michael Williams,Horror,4,Standard,2
9,Emma Miller,Horror,4,Premium,1
10,Emma Smith,Drama,3,Standard,2
11,David Miller,Documentary,2,Standard,2
12,Jane Johnson,Horror,4,Basic,0


In [19]:
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(dataset[['GenreLabel']]).toarray()
gen_feature_labels = list(gen_le.classes_)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

sub_ohe = OneHotEncoder()
sub_feature_arr = sub_ohe.fit_transform(dataset[['SubscriptionLabel']]).toarray()
sub_feature_labels = ['Subscription_'+str(cls_label) for cls_label in sub_le.classes_]
sub_features = pd.DataFrame(sub_feature_arr, columns=sub_feature_labels)

In [20]:
dataset_ohe = pd.concat([dataset_sub, gen_features, sub_features], axis=1)
columns = sum([['Name', 'Favorite_Genre', 'GenreLabel'],gen_feature_labels,
              ['Subscription_Type', 'SubscriptionLabel'],sub_feature_labels], [])
dataset_ohe[columns].iloc[7:13]

Unnamed: 0,Name,Favorite_Genre,GenreLabel,Action,Comedy,Documentary,Drama,Horror,Romance,Sci-Fi,Subscription_Type,SubscriptionLabel,Subscription_Basic,Subscription_Premium,Subscription_Standard
7,Michael Garcia,Action,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Premium,1,0.0,1.0,0.0
8,Michael Williams,Horror,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Standard,2,0.0,0.0,1.0
9,Emma Miller,Horror,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Premium,1,0.0,1.0,0.0
10,Emma Smith,Drama,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Standard,2,0.0,0.0,1.0
11,David Miller,Documentary,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Standard,2,0.0,0.0,1.0
12,Jane Johnson,Horror,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Basic,0,1.0,0.0,0.0


In [33]:
new_dataset = pd.DataFrame([['Michael Davis', 'Horror', 'Basic'],
                           ['Chris Johnson', 'Romance', 'Premium'],
                           ['Jane Johnson', 'Action', 'Standard']],
                           columns=['Name', 'Favorite_Genre', 'Subscription_Type'])
new_dataset

Unnamed: 0,Name,Favorite_Genre,Subscription_Type
0,Michael Davis,Horror,Basic
1,Chris Johnson,Romance,Premium
2,Jane Johnson,Action,Standard


In [35]:
new_gen_labels = gen_le.transform(new_dataset['Favorite_Genre'])
new_dataset['GenreLabel'] = new_gen_labels

new_sub_labels = sub_le.transform(new_dataset['Subscription_Type'])
new_dataset['SubscriptionLabel'] = new_sub_labels

new_dataset[['Name', 'Favorite_Genre', 'GenreLabel', 'Subscription_Type', 'SubscriptionLabel']]

Unnamed: 0,Name,Favorite_Genre,GenreLabel,Subscription_Type,SubscriptionLabel
0,Michael Davis,Horror,4,Basic,0
1,Chris Johnson,Romance,5,Premium,1
2,Jane Johnson,Action,0,Standard,2


In [36]:
new_gen_feature_arr = gen_ohe.transform(new_dataset[['GenreLabel']]).toarray()
new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)

new_leg_feature_arr = sub_ohe.transform(new_dataset[['SubscriptionLabel']]).toarray()
new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=sub_feature_labels)

new_dataset_ohe = pd.concat([new_dataset, new_gen_features, new_leg_features], axis=1)
columns = sum([['Name', 'Favorite_Genre', 'GenreLabel'], gen_feature_labels,
               ['Subscription_Type', 'SubscriptionLabel'], sub_feature_labels], [])
new_dataset_ohe[columns]


Unnamed: 0,Name,Favorite_Genre,GenreLabel,Action,Comedy,Documentary,Drama,Horror,Romance,Sci-Fi,Subscription_Type,SubscriptionLabel,Subscription_Basic,Subscription_Premium,Subscription_Standard
0,Michael Davis,Horror,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Basic,0,1.0,0.0,0.0
1,Chris Johnson,Romance,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Premium,1,0.0,1.0,0.0
2,Jane Johnson,Action,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Standard,2,0.0,0.0,1.0


In [38]:
gen_onehot_features = pd.get_dummies(dataset['Favorite_Genre'])
pd.concat([dataset[['Name', 'Favorite_Genre']], gen_onehot_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Favorite_Genre,Action,Comedy,Documentary,Drama,Horror,Romance,Sci-Fi
4,James Garcia,Romance,False,False,False,False,False,True,False
5,Michael Martinez,Action,True,False,False,False,False,False,False
6,Emma Hernandez,Romance,False,False,False,False,False,True,False
7,Michael Garcia,Action,True,False,False,False,False,False,False
8,Michael Williams,Horror,False,False,False,False,True,False,False
9,Emma Miller,Horror,False,False,False,False,True,False,False


Dummy Coding Scheme

In [40]:
gen_dummy_features = pd.get_dummies(dataset['Favorite_Genre'], drop_first=True)
pd.concat([dataset[['Name', 'Favorite_Genre']], gen_dummy_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Favorite_Genre,Comedy,Documentary,Drama,Horror,Romance,Sci-Fi
4,James Garcia,Romance,False,False,False,False,True,False
5,Michael Martinez,Action,False,False,False,False,False,False
6,Emma Hernandez,Romance,False,False,False,False,True,False
7,Michael Garcia,Action,False,False,False,False,False,False
8,Michael Williams,Horror,False,False,False,True,False,False
9,Emma Miller,Horror,False,False,False,True,False,False


In [41]:
gen_onehot_features = pd.get_dummies(dataset['Favorite_Genre'])
gen_dummy_features = gen_onehot_features.iloc[:,:-1]
pd.concat([dataset[['Name', 'Favorite_Genre']], gen_dummy_features], axis=1).iloc[4:10]

Unnamed: 0,Name,Favorite_Genre,Action,Comedy,Documentary,Drama,Horror,Romance
4,James Garcia,Romance,False,False,False,False,False,True
5,Michael Martinez,Action,True,False,False,False,False,False
6,Emma Hernandez,Romance,False,False,False,False,False,True
7,Michael Garcia,Action,True,False,False,False,False,False
8,Michael Williams,Horror,False,False,False,False,True,False
9,Emma Miller,Horror,False,False,False,False,True,False


Effect Coding Scheme

In [42]:
gen_onehot_features = pd.get_dummies(dataset['Favorite_Genre'])
gen_effect_features = gen_onehot_features.iloc[:,:-1]
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([dataset[['Name', 'Favorite_Genre']], gen_effect_features], axis=1).iloc[4:10]

  gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
  gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
  gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
  gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
  gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
  gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.


Unnamed: 0,Name,Favorite_Genre,Action,Comedy,Documentary,Drama,Horror,Romance
4,James Garcia,Romance,False,False,False,False,False,True
5,Michael Martinez,Action,True,False,False,False,False,False
6,Emma Hernandez,Romance,False,False,False,False,False,True
7,Michael Garcia,Action,True,False,False,False,False,False
8,Michael Williams,Horror,False,False,False,False,True,False
9,Emma Miller,Horror,False,False,False,False,True,False


Feature Hashing Scheme

In [43]:
unique_genres = np.unique(dataset[['Favorite_Genre']])
print("Total genres:", len(unique_genres))
print(unique_genres)

Total genres: 7
['Action' 'Comedy' 'Documentary' 'Drama' 'Horror' 'Romance' 'Sci-Fi']


In [45]:
from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(dataset['Favorite_Genre'].astype(str).apply(lambda x: [x]))
hashed_features = hashed_features.toarray()
pd.concat([dataset[['Name', 'Favorite_Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]

Unnamed: 0,Name,Favorite_Genre,0,1,2,3,4,5
1,James Brown,Horror,0.0,0.0,1.0,0.0,0.0,0.0
2,Alex Johnson,Horror,0.0,0.0,1.0,0.0,0.0,0.0
3,Chris Johnson,Romance,1.0,0.0,0.0,0.0,0.0,0.0
4,James Garcia,Romance,1.0,0.0,0.0,0.0,0.0,0.0
5,Michael Martinez,Action,-1.0,0.0,0.0,0.0,0.0,0.0
6,Emma Hernandez,Romance,1.0,0.0,0.0,0.0,0.0,0.0


In [47]:
fh.get_params()

{'alternate_sign': True,
 'dtype': numpy.float64,
 'input_type': 'string',
 'n_features': 6}