In [8]:
# import required libraries

import pandas as pd
import numpy as np
from heavy_tail_observations import BothSideWeibullNoise, BothSideParetoNoise, BothSideFrechetNoise
from heavy_tail_regressions import catoni_lin_reg, mom_lin_reg, pro, reg_lin_reg, trunc_lin_reg, trunc_reg

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

# Data Preprocessing

In [9]:
#import of review data
cols = ["user id","item id","rating","timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
df_data = pd.read_csv("ml-100k/u.data",sep="\t",names=cols,header=None,encoding="ISO-8859-1")
# df_data.head()

#import of moviedata
cols = ["movie id","movie title","release date","video release date","IMDb URL","unknown",
        "Action","Adventure","Animation","Childrens","Comedy","Crime","Documentary",
        "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
        "Thriller","War","Western"]

df_movie = pd.read_csv("ml-100k/u.item",sep="|",names=cols,header=None,encoding="ISO-8859-1")
# df_movie.head()

#import of user data
cols = ["user id","age","gender","occupation","zip code"]
df_user = pd.read_csv("ml-100k/u.user",sep="|",names=cols,header=None,encoding="ISO-8859-1")
# df_user.head()

#frequency binning the ages into age groups as it will be easier for future analysis
df_user['age_group'] = pd.qcut(df_user['age'],q=10,precision=0)

#join all three dataframes

df = pd.merge(pd.merge(df_data,
                  df_user[["user id",
                           "age_group",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie,
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')
# df.head()

#drop unneccessary features
df.drop(["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
        "unknown",
        "user id",
        "item id",
        "timestamp"],axis=1, inplace=True)
# df.head()

# check for null values
df.isnull().sum()

#categorize age_group, gender and occupation using 1-hot encoder
df['age_group'] = pd.Categorical(df['age_group'])
df['gender'] = pd.Categorical(df['gender'])
df['occupation'] = pd.Categorical(df['occupation'])

age_group_dummies = pd.get_dummies(df['age_group'])
gender_dummies = pd.get_dummies(df['gender'])
occupation_dummies = pd.get_dummies(df['occupation'])

df = pd.concat([df,
                age_group_dummies,
                gender_dummies,
                occupation_dummies], axis=1)

df.drop(['age_group',
        'gender',
        'occupation'], axis=1, inplace=True)
df.head()

Unnamed: 0,rating,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Splitting data

In [32]:
y = df.rating.to_numpy()[:2000]
x = df.drop('rating',axis=1).to_numpy()[:2000]

# split data into 70% (training set) and 30% (testing set)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [33]:
p = 1.5
optim_names = ['Catoni', 'MoM', 'PRo', 'Ridge', 'Trunc Lin', 'Trunc']
optim_params = [{'beta':1.},
                {'lam':1.,'k':10},
                {'lam':1.,'beta':1.},
                {'lam':1.},
                {'lam':1.,'beta':1.},
                {'lam':1.}]
optims = [catoni_lin_reg, mom_lin_reg, pro, reg_lin_reg, trunc_lin_reg, trunc_reg]

for opt_idx, (optim, optim_param, optim_name)  in enumerate(zip(optims,optim_params,optim_names)):
    print(opt_idx, optim_name, "optimization starts")
    y_est, w_hat = optim(*(x_train,y_train,x_test,p),**optim_param)
    error = np.mean(np.abs(y_est - y_test))
    print('{} RMS: {:.06f}'.format(optim_name, error))    

0 Catoni optimization starts
Catoni RMS: 0.906667
1 MoM optimization starts
MoM RMS: 1.145182
2 PRo optimization starts
PRo RMS: 0.934309
3 Ridge optimization starts
Ridge RMS: 0.927990
4 Trunc Lin optimization starts
Trunc Lin RMS: 0.971063
5 Trunc optimization starts
Trunc RMS: 0.927990
