In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Location of the MovieLens 20M ratings file, relative to the notebook.
data_path = "data/MovieLens20M/ratings.csv"

# Load the complete ratings table into memory.
raw = pd.read_csv(data_path, sep=',')

# Features are every column except the last two; the label is the
# second-to-last column.
# NOTE(review): presumably the columns are userId, movieId, rating, timestamp
# (so X holds the two ids, y the rating, and the timestamp is dropped) --
# confirm against the CSV header.
# `get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# `to_numpy()` is the supported way to get the underlying NumPy arrays.
X = raw.iloc[:, :-2].to_numpy()
y = raw.iloc[:, -2].to_numpy()

In [None]:
# Fraction of labels strictly greater than 3, i.e. the share of the positive
# class once the label is binarized (the mean of a boolean mask).
frac = np.mean(y > 3)

About 61% of the labels fall in class 1 (rating > 3) and about 39% in class 0.

Since the two classes are not severely imbalanced, I will use the ROC AUC (`roc_auc`) metric for the binarized label.

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Build a one-hot encoded version of the original feature matrix X. The
# encoder is fitted on X before any splitting, so the row-wise splits below
# all share the same encoded columns.
one_hot_encoder = OneHotEncoder()
X_enc = one_hot_encoder.fit_transform(X)

# Stage 1: hold out 20% of the rows as the test set. The encoded matrix, the
# raw features and the labels go through the same call so their rows stay
# aligned.
(
    X_trainVal_enc, X_test_enc,
    X_trainVal, X_test,
    y_trainVal, y_test,
) = train_test_split(
    X_enc, X, y,
    test_size=0.2,
    random_state=10031996,
    shuffle=True,
)

# Stage 2: split the remaining 80% into train and validation. A quarter of
# 80% is 20% of the full data, giving a 60/20/20 train/validation/test split.
(
    X_train_enc, X_val_enc,
    X_train, X_val,
    y_train, y_val,
) = train_test_split(
    X_trainVal_enc, X_trainVal, y_trainVal,
    test_size=0.25,
    random_state=10031996,
    shuffle=True,
)

# Drop the intermediate train+validation names; they are no longer needed.
del X_trainVal_enc
del X_trainVal
del y_trainVal

In [6]:
# Shapes of the raw feature splits, one per line, in train / validation /
# test order (about 12M / 4M / 4M movie ratings).
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

# Shapes of the one-hot encoded splits, in the same order.
print(X_train_enc.shape, X_val_enc.shape, X_test_enc.shape, sep="\n")

(12000157, 2)
(4000053, 2)
(4000053, 2)
(12000157, 165237)
(4000053, 165237)
(4000053, 165237)


# Binary and non binary labels

In [7]:
# Eleven evenly spaced values from 0 to 5 inclusive (step 0.5); used as the
# bin edges for np.digitize when building the categorical labels.
categories = np.linspace(0.0, 5.0, 11)

In [8]:
# Binary targets: True where the label is strictly greater than 3.
y_train_bin = y_train > 3
y_val_bin = y_val > 3
y_test_bin = y_test > 3

# Categorical (non-binary) targets: np.digitize with its default right=False
# maps each label to the 1-based index of the first edge in `categories` that
# is strictly greater than it (len(categories) if there is none).
# NOTE(review): if the ratings are the half-star values 0.5 ... 5.0, the
# resulting class ids run from 2 to 11 rather than starting at 0 -- confirm
# that downstream code expects this.
y_train_cat = np.digitize(y_train, categories)
y_val_cat = np.digitize(y_val, categories)
y_test_cat = np.digitize(y_test, categories)

# Save files

In [14]:
# Write every split to the current working directory. np.save appends the
# ".npy" extension to each file name given here.
# NOTE(review): the *_enc matrices are presumably SciPy sparse matrices
# (OneHotEncoder's default output). np.save would then store them as pickled
# object arrays, so loading them needs np.load(..., allow_pickle=True).item()
# -- confirm against the code that reads these files.
for _stem, _array in (
    # raw and one-hot encoded features
    ("ML20_Xtr", X_train),
    ("ML20_XtrEnc", X_train_enc),
    ("ML20_Xval", X_val),
    ("ML20_XvalEnc", X_val_enc),
    ("ML20_Xtest", X_test),
    ("ML20_XtestEnc", X_test_enc),
    # binary and categorical labels
    ("ML20_ytrBin", y_train_bin),
    ("ML20_ytrCat", y_train_cat),
    ("ML20_yvalBin", y_val_bin),
    ("ML20_yvalCat", y_val_cat),
    ("ML20_ytestBin", y_test_bin),
    ("ML20_ytestCat", y_test_cat),
):
    np.save(_stem, _array)
