# Προσπαθούμε να εφαρμόσουμε διάφορα μοντέλα Regression για να προβλέψουμε το rating

## Import Libraries

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

## Import Datasets

In [43]:
# Movies
movies_df = pd.read_csv('data/movies.csv', sep='\t')
movies_df = movies_df.drop(movies_df.columns[0], axis=1)

# Users
users_df = pd.read_csv('data/users.csv', sep='\t')
users_df = users_df.drop(users_df.columns[0], axis=1)

# Ratings
ratings_df = pd.read_csv('data/ratings.csv', sep=';')
ratings_df = ratings_df.drop(ratings_df.columns[0], axis=1)

## Θέλουμε να μετατρέψουμε όλα τα δεδομένα που θα χρησιμοποιήσουμε στο regression σε αριθμούς

## --- Movies ---

In [44]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Μπορούμε να δημιουργήσουμε one hot encodings από το column genres με τη μέθοδο get_dummies()
-> https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.get_dummies.html

In [45]:
one_hot_encoded_genres = movies_df.genres.str.get_dummies()
one_hot_encoded_genres.head()

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Μπορούμε να ενώσουμε τα δύο dataframes με την concat()
-> https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [46]:
new_movies_df = pd.concat([movies_df, one_hot_encoded_genres], axis=1).drop(['genres', 'title'], axis=1)
new_movies_df.head()

Unnamed: 0,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## --- Users ---

In [47]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


### Θα χρειαστούμε τα columns gender, occupation και age_desc
### Μπορούμε να μετατρέψουμε τις στήλες gender και age_desc σε categorical data μέσω του pandas, έτσι ώστε να μπορούμε να πάρουμε τα codes (η στήλη occupation είναι ήδη int)
-> https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html

-> https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.cat.codes.html

In [48]:
new_users_df = users_df.copy().drop(['age', 'zipcode', 'occ_desc'], axis=1)
new_users_df.gender = new_users_df.gender.astype('category').cat.codes
new_users_df.age_desc = new_users_df.age_desc.astype('category').cat.codes
new_users_df.head()

Unnamed: 0,user_id,gender,occupation,age_desc
0,1,0,10,6
1,2,1,16,5
2,3,1,15,1
3,4,1,7,3
4,5,1,20,1


## --- Ratings ---

In [49]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Τα ratings είναι ήδη αριθμοί οπότε το μόνο που θα κάνουμε είναι να αφαιρέσουμε το column timestamp εφόσον δεν θα το χρειαστούμε

In [50]:
new_ratings_df = ratings_df.copy().drop('timestamp', axis=1)
new_ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


## Εφόσον όλα τα δεδομένα μας έχουν πλέον αριθμητική αναπαράσταση, δημιουργούμε ένα dataframe που να τα περιέχει όλα μαζί

In [51]:
merged_df = new_ratings_df.merge(new_users_df, on='user_id', how='left') # merge left για να πάρουμε όλα τα ratings
merged_df.head()

Unnamed: 0,user_id,movie_id,rating,gender,occupation,age_desc
0,1,1193,5,0,10,6
1,1,661,3,0,10,6
2,1,914,3,0,10,6
3,1,3408,4,0,10,6
4,1,2355,5,0,10,6


In [52]:
merged_df = merged_df.merge(new_movies_df, on='movie_id', how='left') # πάλι left για να μην χάσουμε τα ratings
merged_df

Unnamed: 0,user_id,movie_id,rating,gender,occupation,age_desc,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,0,10,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,661,3,0,10,6,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
2,1,914,3,0,10,6,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,0,10,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,0,10,6,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,1,6,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000205,6040,1094,5,1,6,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1000206,6040,562,5,1,6,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000207,6040,1096,4,1,6,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Βλέπουμε αν υπάρχουν nan τιμές
merged_df.isna().sum()

user_id        0
movie_id       0
rating         0
gender         0
occupation     0
age_desc       0
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
dtype: int64

## Ετοιμάζουμε τα subsets

In [55]:
X = merged_df.drop(['user_id', 'movie_id', 'rating'], axis=1)
y = merged_df.rating

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [57]:
X_train

Unnamed: 0,gender,occupation,age_desc,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
276894,0,6,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
45426,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
628526,0,19,2,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
904724,1,7,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
933858,1,7,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906754,1,4,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
248231,1,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
807827,1,20,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
131198,1,0,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
X_test

Unnamed: 0,gender,occupation,age_desc,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
402891,1,8,4,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
954242,1,20,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
566748,1,7,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
838717,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
857802,1,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156424,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
195147,0,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
342242,1,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4162,1,7,5,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [59]:
y_train

276894    4
45426     4
628526    4
904724    5
933858    4
         ..
906754    2
248231    3
807827    5
131198    4
569541    4
Name: rating, Length: 800167, dtype: int64

In [60]:
y_test

402891    4
954242    4
566748    5
838717    3
857802    4
         ..
156424    1
195147    5
342242    3
4162      3
446497    1
Name: rating, Length: 200042, dtype: int64

## Εφαρμόζουμε διάφορα μοντέλα Regression

In [63]:
# Linear Regression
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

LinearRegression()

In [66]:
regr.score(X_test, y_test)

0.039936270108393845

In [71]:
# Logistic Regression
regr2 = linear_model.LogisticRegression()
regr2.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [72]:
regr2.score(X_test, y_test)

0.3496665700202957

In [69]:
# Random Forest
regr3 = RandomForestRegressor(n_estimators=100)
regr3.fit(X_train, y_train)

RandomForestRegressor()

In [70]:
regr3.score(X_test, y_test)

0.06942452373697006