Run this in a new terminal first (not the one currently running Jupyter Notebook): `pip install category_encoders`

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


import category_encoders as ce
pd.options.display.max_columns=200

In [3]:
# Load the video-game sales dataset from a path relative to this notebook.
data = pd.read_csv("../datasets/vgsales.csv")

In [195]:
data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [196]:
data.shape

(16598, 11)

In [197]:
# Work with a 7.5% random sample of the rows to keep the demo small.
# The seed is fixed so that Restart & Run All draws the same sample every time;
# the index is reset so the sampled rows are numbered 0..n-1.
df = data.sample(frac=.075, random_state=42).reset_index(drop=True)

In [198]:
df.shape

(1245, 11)

In [199]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,11707,Mind Quiz: Your Brain Coach,DS,2006.0,Misc,Ubisoft,0.0,0.0,0.08,0.0,0.08
1,3988,Dead or Alive Ultimate,XB,2004.0,Fighting,Tecmo Koei,0.3,0.1,0.08,0.02,0.5
2,13881,Monster High: New Ghoul in School,X360,2015.0,Action,Little Orbit,0.0,0.04,0.0,0.0,0.04
3,8677,SpongeBob's Boating Bash,DS,2010.0,Misc,THQ,0.14,0.01,0.0,0.01,0.16
4,5618,Sniper Elite 3,X360,2014.0,Shooter,505 Games,0.16,0.14,0.0,0.03,0.32


In [201]:
df.Publisher.nunique()

210

In [202]:
df.Genre.nunique()

12

In [203]:
# Features: every column except the first one (Rank, per the preview above),
# selected by position rather than by name.
X = df.iloc[:,1:]

In [204]:
X.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Mind Quiz: Your Brain Coach,DS,2006.0,Misc,Ubisoft,0.0,0.0,0.08,0.0,0.08
1,Dead or Alive Ultimate,XB,2004.0,Fighting,Tecmo Koei,0.3,0.1,0.08,0.02,0.5
2,Monster High: New Ghoul in School,X360,2015.0,Action,Little Orbit,0.0,0.04,0.0,0.0,0.04
3,SpongeBob's Boating Bash,DS,2010.0,Misc,THQ,0.14,0.01,0.0,0.01,0.16
4,Sniper Elite 3,X360,2014.0,Shooter,505 Games,0.16,0.14,0.0,0.03,0.32


In [205]:
# Target: the Rank column, passed alongside X when the encoders are fit.
y = df['Rank']

## One Hot Encoding
map each category to a vector consisting of 0s and 1s that denote the absence or presence of the feature

In [206]:
# One-hot encode only the Genre column; the remaining columns are carried
# through as-is (see the preview below).
ce_one_hot = ce.OneHotEncoder(cols = ['Genre'])
newX = ce_one_hot.fit_transform(X, y)

In [208]:
newX.head()

Unnamed: 0,Name,Platform,Year,Genre_1,Genre_2,Genre_3,Genre_4,Genre_5,Genre_6,Genre_7,Genre_8,Genre_9,Genre_10,Genre_11,Genre_12,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Mind Quiz: Your Brain Coach,DS,2006.0,1,0,0,0,0,0,0,0,0,0,0,0,Ubisoft,0.0,0.0,0.08,0.0,0.08
1,Dead or Alive Ultimate,XB,2004.0,0,1,0,0,0,0,0,0,0,0,0,0,Tecmo Koei,0.3,0.1,0.08,0.02,0.5
2,Monster High: New Ghoul in School,X360,2015.0,0,0,1,0,0,0,0,0,0,0,0,0,Little Orbit,0.0,0.04,0.0,0.0,0.04
3,SpongeBob's Boating Bash,DS,2010.0,1,0,0,0,0,0,0,0,0,0,0,0,THQ,0.14,0.01,0.0,0.01,0.16
4,Sniper Elite 3,X360,2014.0,0,0,0,1,0,0,0,0,0,0,0,0,505 Games,0.16,0.14,0.0,0.03,0.32


## Feature Hashing
- hashing allows you to use variable-size feature vectors with standard learning algorithms
- hash functions map data of arbitrary sizes to data of a fixed size
    - same input will always give the same output
    - may output the same value for different inputs (collision)
    - can’t perform a reverse lookup to determine what the input was
    - choice of function determines the range of possible outputs (range is always fixed)
- good hash functions map the expected input evenly over the output range -- every hash value has roughly the same probability of being observed after hashing a typical sample of the key space

In [209]:
# Hash the Publisher column (210 distinct values in this sample) into a fixed
# set of columns -- the preview below shows eight, col_0 .. col_7 -- instead of
# creating one column per publisher.
ce_hash = ce.HashingEncoder(cols = ['Publisher'])
newX = ce_hash.fit_transform(X, y)

In [210]:
newX.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,Name,Platform,Year,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,0,0,0,0,0,0,0,1,Mind Quiz: Your Brain Coach,DS,2006.0,Misc,0.0,0.0,0.08,0.0,0.08
1,0,0,0,0,0,0,0,1,Dead or Alive Ultimate,XB,2004.0,Fighting,0.3,0.1,0.08,0.02,0.5
2,0,0,0,0,1,0,0,0,Monster High: New Ghoul in School,X360,2015.0,Action,0.0,0.04,0.0,0.0,0.04
3,0,0,0,0,0,0,0,1,SpongeBob's Boating Bash,DS,2010.0,Misc,0.14,0.01,0.0,0.01,0.16
4,0,0,0,0,0,1,0,0,Sniper Elite 3,X360,2014.0,Shooter,0.16,0.14,0.0,0.03,0.32


# Dealing with Data Leakage
- data leakage: the accidental sharing of information between training and testing datasets
    - sharing of information will give the model a ‘heads-up’ about the testing dataset and generate seemingly optimal evaluation scores
    - since the model overfits the testing data, it cannot predict accurately on future unseen datasets, i.e. Kaggle test data or production live data.
- by splitting the data first, there are two major benefits:
    - reduce risk of data leakage.
    - future unseen data will be processed in exactly the same way as the testing data, which ensures consistent model performance.

In [211]:
# Hold out 20% of the rows for testing *before* any encoder is fit.
# A fixed random_state keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [248]:
# Chain the encoders so they can be fit once on the training split and then
# re-applied, unchanged, to any other split.
encoding_pipeline = Pipeline([
  ('encode_genre', ce.OneHotEncoder(cols=['Genre'], return_df=True)),
  ('encode_publisher', ce.HashingEncoder(n_components = 15, cols=['Publisher'], return_df=True))
])

# Get the encoded training set (the only place the pipeline is fit):
df_train_encoded = encoding_pipeline.fit_transform(X_train, y_train)

# Get the encoded test set. Note: no target is passed and nothing is re-fit.
# transform() reuses the mappings learned from the training split, so each
# Genre_k column means the same genre in both frames, both frames have the
# same columns, and no information from the test split reaches the encoders.
df_test_encoded = encoding_pipeline.transform(X_test)

In [249]:
# Build display names for the encoded columns: wherever "col" appears in a
# column name (the hashed col_0, col_1, ... columns) substitute "Publisher".
newColNames = [name.replace("col","Publisher") for name in df_train_encoded.columns]

In [250]:
# Relabel both frames in place. The list was built from the training frame's
# columns, so the test frame must have the same columns in the same order for
# these labels to be correct.
df_train_encoded.columns = newColNames
df_test_encoded.columns = newColNames

In [251]:
df_train_encoded.head(100)

Unnamed: 0,Publisher_0,Publisher_1,Publisher_2,Publisher_3,Publisher_4,Publisher_5,Publisher_6,Publisher_7,Publisher_8,Publisher_9,Publisher_10,Publisher_11,Publisher_12,Publisher_13,Publisher_14,Name,Platform,Year,Genre_1,Genre_2,Genre_3,Genre_4,Genre_5,Genre_6,Genre_7,Genre_8,Genre_9,Genre_10,Genre_11,Genre_12,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,Monster Jam: Path of Destruction,Wii,2010.0,1,0,0,0,0,0,0,0,0,0,0,0,0.50,0.02,0.00,0.03,0.56
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,Headhunter: Redemption,PS2,2004.0,0,1,0,0,0,0,0,0,0,0,0,0,0.02,0.01,0.00,0.00,0.04
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Dreamer Series: Shop Owner,DS,2009.0,0,0,1,0,0,0,0,0,0,0,0,0,0.07,0.00,0.00,0.01,0.08
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Disney's Kim Possible 3: Team Possible,GBA,2005.0,0,0,0,1,0,0,0,0,0,0,0,0,0.33,0.12,0.00,0.01,0.46
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,F.E.A.R. 2: Project Origin,X360,2009.0,0,0,0,0,1,0,0,0,0,0,0,0,0.32,0.15,0.00,0.05,0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,Batman: Gotham City Racer,PS,2001.0,1,0,0,0,0,0,0,0,0,0,0,0,0.03,0.02,0.00,0.00,0.05
96,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,My Spanish Coach,DS,2007.0,0,0,0,0,0,1,0,0,0,0,0,0,0.40,0.01,0.00,0.03,0.44
97,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,The Treasures of Montezuma,DS,2010.0,0,0,0,0,0,0,0,0,0,1,0,0,0.02,0.02,0.00,0.00,0.04
98,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,Genkai Tokki: Moero Crystal,PSV,2015.0,0,0,0,0,0,0,0,1,0,0,0,0,0.00,0.00,0.06,0.00,0.06
