In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data_path = "data/Jester/jester-v3-1Only.xls"
ratings = pd.read_excel(data_path, sep=',', header=None)

In [3]:
# remove the first column
# not informative
# tells the number of jokes rated
# can be implied by presence of "99" 
# (specifically column length - number of "99")
ratings.drop(ratings.columns[0], axis=1, inplace=True) 

In [4]:
# jokes with no ratings have been dropped
ratings[ratings == 99] = np.nan
ratings = ratings.dropna(axis=1, how='all')

# reset back other nan to 99
ratings = ratings.fillna(value=99.0) # shape: 50692 x 140 (10 jokes not rated)

ratings = ratings.reset_index(drop=True) # just in case

In [5]:
Xy = ratings.T.unstack().dropna().to_frame()
Xy.columns = ["ratings"]

X = np.vstack(Xy.index.values)
y = Xy.get_values()

In [6]:
frac = np.sum(y>0)/np.size(y) 

92% percent of the labels is in class 1 and 8% is in class 0

This is very risky, so I will be investigating the average precision/recall metric as well as the roc_auc metric (when binarizing the label)

In [15]:
# One hot encoded version using original X
# split as in the next cell
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
X_enc = one_hot_encoder.fit_transform(X)

# split train, validation and test (60,20,20)
X_trainVal_enc, X_test_enc, X_trainVal, X_test, y_trainVal, y_test = train_test_split(X_enc, X, y, test_size=0.2, random_state=10031996, shuffle=True)

X_train_enc, X_val_enc, X_train, X_val, y_train, y_val = train_test_split(X_trainVal_enc, X_trainVal, y_trainVal, test_size=0.25, random_state=10031996, shuffle=True)

del X_trainVal_enc, X_trainVal, y_trainVal

In [16]:
print(X_train.shape) # 4.3 million joke ratings
print(X_val.shape) # 1.4 million joke ratings
print(X_test.shape) # 1.4 million joke ratings

print(X_train_enc.shape) # 4.3 million joke ratings
print(X_val_enc.shape) # 1.4 million joke ratings
print(X_test_enc.shape) # 1.4 million joke ratings

(4258128, 2)
(1419376, 2)
(1419376, 2)
(4258128, 50832)
(1419376, 50832)
(1419376, 50832)


# Binary and non binary labels

In [17]:
y_train_bin = y_train>0 # binary
y_train_cont = y_train # continuous (-10,10)

y_val_bin = y_val>0
y_val_cont = y_val

y_test_bin = y_test>0
y_test_cont = y_test

# Save files

In [18]:
np.save("J3_Xtr", X_train)
np.save("J3_XtrEnc", X_train_enc)
np.save("J3_Xval", X_val)
np.save("J3_XvalEnc", X_val_enc)
np.save("J3_Xtest", X_test)
np.save("J3_XtestEnc", X_test_enc)

np.save("J3_ytrBin", y_train_bin)
np.save("J3_ytrCat", y_train_cont)
np.save("J3_yvalBin", y_val_bin)
np.save("J3_yvalCat", y_val_cont)
np.save("J3_ytestBin", y_test_bin)
np.save("J3_ytestCat", y_test_cont)
