In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the three Jester v1 rating spreadsheets. The sheets are read with
# header=None, so pandas assigns integer column labels instead of taking
# the first row as a header.
#
# Note: `pd.read_excel` has no `sep` parameter (that belongs to the CSV
# readers); passing it raises a TypeError on current pandas, so it is omitted.
data_path_1 = "data/Jester/jester-v1-1.xls"
data_path_2 = "data/Jester/jester-v1-2.xls"
data_path_3 = "data/Jester/jester-v1-3.xls"

raw_1, raw_2, raw_3 = (
    pd.read_excel(path, header=None)
    for path in (data_path_1, data_path_2, data_path_3)
)

In [3]:
# Stack the three raw sheets row-wise and discard the leading column, keeping
# only the rating columns needed for the (user id, joke id) entries.
# NOTE(review): the first column is presumably not a rating (e.g. a per-user
# count) -- confirm against the dataset description.
dataframes = [raw_1, raw_2, raw_3]
ratings = pd.concat(dataframes).pipe(
    lambda frame: frame.drop(frame.columns[0], axis=1)
)

In [4]:
# Treat the value 99 as "no rating" by turning it into NaN, drop every joke
# column that ends up with no ratings at all, and renumber the rows so the
# index serves as a contiguous user id.
ratings = (
    ratings
    .mask(ratings == 99)          # 99 -> NaN, all other values untouched
    .dropna(axis=1, how='all')    # remove jokes nobody rated
    .reset_index(drop=True)       # shape: 73,421 x 100
)

In [5]:
# Reshape the wide table into long format: one row per existing rating,
# indexed by the (row label, column label) pair it came from; missing
# ratings (NaN) are discarded.
long_ratings = ratings.T.unstack().dropna()
Xy = long_ratings.to_frame(name="ratings")

In [6]:
# Features: stack the index tuples of Xy into a 2-column array, one row per
# rating (the (user id, joke id) pair of that rating).
X = np.vstack(Xy.index.values)
# Targets: the ratings as a 2-D column array (n_ratings, 1).
# `DataFrame.get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# `.values` returns the same ndarray and works on every pandas version.
y = Xy.values

In [7]:
# Share of ratings that are strictly positive, i.e. the fraction that lands
# in class 1 once the labels are binarized with `> 0`.
frac = (y > 0).sum() / y.size

58.5% of the labels are in class 1 (rating > 0) and the remaining 41.5% are in class 0

Since the classes are not severely imbalanced, I will use the ROC AUC metric (`roc_auc`) for the binarized labels

In [8]:
# Build a one-hot encoded copy of the original X, then split the encoded
# features, the raw features and the labels together so all three stay
# row-aligned.
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder().fit(X)
X_enc = one_hot_encoder.transform(X)

# Same seed and shuffling for both splits.
split_options = dict(random_state=10031996, shuffle=True)

# Step 1: hold out 20% of all rows as the test set.
(X_trainVal_enc, X_test_enc,
 X_trainVal, X_test,
 y_trainVal, y_test) = train_test_split(X_enc, X, y, test_size=0.2, **split_options)

# Step 2: take 25% of the remaining 80% as validation -> 60/20/20 overall.
(X_train_enc, X_val_enc,
 X_train, X_val,
 y_train, y_val) = train_test_split(
    X_trainVal_enc, X_trainVal, y_trainVal, test_size=0.25, **split_options
)

# The combined train+validation intermediates are no longer needed.
del X_trainVal_enc, X_trainVal, y_trainVal

In [9]:
# Report the shape of every split: raw features first, then the one-hot
# encoded ones (train ~2.5 million ratings, validation and test ~0.8 million each).
for split in (X_train, X_val, X_test, X_train_enc, X_val_enc, X_test_enc):
    print(split.shape)

(2481816, 2)
(827272, 2)
(827272, 2)
(2481816, 73521)
(827272, 73521)
(827272, 73521)


# Binary and non binary labels

In [10]:
# Continuous targets: the ratings exactly as they are (range -10 to 10).
y_train_cont, y_val_cont, y_test_cont = y_train, y_val, y_test

# Binary targets: True for a strictly positive rating, False otherwise.
y_train_bin, y_val_bin, y_test_bin = (
    labels > 0 for labels in (y_train, y_val, y_test)
)

In [None]:
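# Unexecuted sketch: overall fraction of class-1 (positive) labels across all three splits.
# num_class_1 = np.sum(y_train_bin) + np.sum(y_val_bin) + np.sum(y_test_bin)
# fraction = num_class_1 / (y_train_bin.size + y_val_bin.size + y_test_bin.size)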

# Save files

In [11]:
# Persist every split with np.save (NumPy appends ".npy" to each name).
# The "*Cat" files hold the continuous labels, the "*Bin" files the binary ones.
# NOTE(review): the *_enc matrices are presumably SciPy sparse (OneHotEncoder's
# default output), in which case np.save pickles them and loading needs
# allow_pickle=True -- confirm.
arrays_to_save = [
    ("J1_Xtr", X_train),
    ("J1_XtrEnc", X_train_enc),
    ("J1_Xval", X_val),
    ("J1_XvalEnc", X_val_enc),
    ("J1_Xtest", X_test),
    ("J1_XtestEnc", X_test_enc),
    ("J1_ytrBin", y_train_bin),
    ("J1_ytrCat", y_train_cont),
    ("J1_yvalBin", y_val_bin),
    ("J1_yvalCat", y_val_cont),
    ("J1_ytestBin", y_test_bin),
    ("J1_ytestCat", y_test_cont),
]
for filename, array in arrays_to_save:
    np.save(filename, array)
