In [1]:
# Import packages

# Standard library
import pickle  # Save trained models
import time
import warnings

# Data analysis libraries
import numpy as np
import pandas as pd

# Packages to create models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')

In [2]:
# Import datasets
ratings = pd.read_csv("data/train.csv")
imdb_data = pd.read_csv("data/imdb_data.csv")
movies = pd.read_csv("data/movies.csv")

# Show every column when displaying wide frames. The full option name is
# used because pandas resolves option keys by pattern match, so an
# abbreviated key can break if a similarly named option is ever added.
pd.set_option('display.max_columns', None)

In [69]:
# Preview the last five rows of the ratings table
ratings.tail()

Unnamed: 0,userId,movieId,rating
547709,162541,4084,1.0
547504,162541,4973,4.5
6854956,162541,4639,3.0
8880807,162541,32,5.0
3307313,162541,2124,2.5


In [4]:
# Restrict the ratings frame to the user, movie and rating columns
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]

In [5]:
# Preview the first five rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Count missing values in each column of movies
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [7]:
# Preview the first five rows of the IMDb metadata table
imdb_data.head()

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


In [7]:
# Count distinct values per column of imdb_data
imdb_data.nunique()

movieId          27278
title_cast       17143
director         11786
runtime            274
budget            1362
plot_keywords    16008
dtype: int64

We can merge `movies` with `imdb_data` on `movieId`.


In [3]:
# Attach IMDb metadata to every movie; the left join keeps movies without a match
movies_2 = pd.merge(movies, imdb_data, how='left', on='movieId')

In [4]:
# Count distinct values per column of the merged movie table
movies_2.nunique()

movieId          62423
title            62325
genres            1639
title_cast       15161
director         10493
runtime            257
budget            1277
plot_keywords    14246
dtype: int64

In [5]:
# Preview the first five rows of the merged movie table
movies_2.head()

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


Create a corpus for the text data in "genres", "title_cast", "director" and "plot_keywords"

Each movie has a distinct movieId, therefore we can drop the "title" column. We can drop the "budget" and "runtime" columns at this time as well.


In [6]:
# Discard the columns that are not used to build the text corpus
movies_3 = movies_2.drop(columns=['title', 'budget', 'runtime'])

In [7]:
# Count distinct values per remaining column
movies_3.nunique()

movieId          62423
genres            1639
title_cast       15161
director         10493
plot_keywords    14246
dtype: int64

In [8]:
# Preview the first five rows after dropping columns
movies_3.head()

Unnamed: 0,movieId,genres,title_cast,director,plot_keywords
0,1,Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,toy|rivalry|cowboy|cgi animation
1,2,Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,board game|adventurer|fight|game
2,3,Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,boat|lake|neighbor|rivalry
3,4,Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,black american|husband wife relationship|betra...
4,5,Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,fatherhood|doberman|dog|mansion


## Remove whitespace

Remove the white space inside the values of the "genres", "title_cast", "director" and "plot_keywords" columns. Each name or keyword phrase is a single unit, so it should be treated as a single word instead of multiple words.

In [9]:
# Collapse each multi-word entry (e.g. "Tom Hanks") into a single token by
# removing its internal spaces. regex=False makes the literal replacement
# explicit, so the result does not depend on the pandas version's default.
for text_col in ['genres', 'title_cast', 'plot_keywords', 'director']:
    movies_3[text_col] = movies_3[text_col].str.replace(' ', '', regex=False)

## Split data in Columns

Replace the `|` separator in the genres, title_cast, plot_keywords and director columns with a space, so that each entry becomes a separate word.

In [10]:
# Turn the pipe separator into a space. '|' is a regex metacharacter, so
# regex=False is passed to guarantee it is treated as a literal character.
movies_3['genres'] = movies_3['genres'].str.replace('|', ' ', regex=False)

In [11]:
# Turn the pipe separator into a space in the remaining text columns.
# '|' is a regex metacharacter, so regex=False is passed to guarantee it is
# treated as a literal character.
for text_col in ['title_cast', 'plot_keywords', 'director']:
    movies_3[text_col] = movies_3[text_col].str.replace('|', ' ', regex=False)

## Corpus of words
Create a corpus column containing all the words from genre, title_cast, plot_keywords and director, to create a vectorizer later

In [12]:
# Work on an independent copy so movies_3 is left untouched
movies_4 = movies_3.copy(deep=True)

In [13]:
# Preview the cleaned text columns
movies_4.head()

Unnamed: 0,movieId,genres,title_cast,director,plot_keywords
0,1,Adventure Animation Children Comedy Fantasy,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,toy rivalry cowboy cgianimation
1,2,Adventure Children Fantasy,RobinWilliams JonathanHyde KirstenDunst Bradle...,JonathanHensleigh,boardgame adventurer fight game
2,3,Comedy Romance,WalterMatthau JackLemmon SophiaLoren Ann-Margr...,MarkStevenJohnson,boat lake neighbor rivalry
3,4,Comedy Drama Romance,WhitneyHouston AngelaBassett LorettaDevine Lel...,TerryMcMillan,blackamerican husbandwiferelationship betrayal...
4,5,Comedy,SteveMartin DianeKeaton MartinShort KimberlyWi...,AlbertHackett,fatherhood doberman dog mansion


In [14]:
# Count missing values in each column
movies_4.isna().sum()

movieId              0
genres               0
title_cast       47222
director         47076
plot_keywords    48039
dtype: int64

In [15]:
# Replace missing values with empty strings to ensure the corpus column is created correctly
movies_4 = movies_4.fillna(value="")

In [16]:
# Columns whose tokens make up the corpus. The outputs recorded in this
# notebook (corpus preview, vectorizer vocabulary, final RMSE) were produced
# with genres + director, so that combination is used here.
# Add 'title_cast' and/or 'plot_keywords' to experiment with a larger corpus.
CORPUS_COLUMNS = ['genres', 'director']

# Join the selected columns with single spaces; missing values count as
# empty strings, and the strip removes the separator left behind by them.
movies_4['corpus'] = (
    movies_4[CORPUS_COLUMNS]
    .fillna('')
    .apply(' '.join, axis=1)
    .str.strip()
)

## Create corpus training dataset

In [17]:
# Keep only the movie identifier and its corpus text
movies_5 = movies_4.loc[:, ['movieId', 'corpus']]

In [18]:
# Preview the movieId/corpus pairs
movies_5.head()

Unnamed: 0,movieId,corpus
0,1,Adventure Animation Children Comedy Fantasy Jo...
1,2,Adventure Children Fantasy JonathanHensleigh
2,3,Comedy Romance MarkStevenJohnson
3,4,Comedy Drama Romance TerryMcMillan
4,5,Comedy AlbertHackett


In [19]:
# Save the corpus table to disk without the index column
movies_5.to_csv('corpus_df.csv', index= False)

## Create a vectorizer

In [2]:
# Load the corpus table written by the previous section
corpus_df = pd.read_csv('corpus_df.csv')

In [3]:
# Summarize column dtypes, non-null counts and memory usage
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   corpus   62423 non-null  object
dtypes: int64(1), object(1)
memory usage: 975.5+ KB


In [4]:
# Count missing values in each column of the reloaded corpus table
corpus_df.isna().sum()

movieId    0
corpus     0
dtype: int64

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Text documents to vectorize: the 'corpus' column as a pandas Series
corpus = corpus_df['corpus']

In [7]:
# Check the container type of the documents
type(corpus)

pandas.core.series.Series

In [9]:
# Build a bag-of-words vocabulary capped at 5000 features and encode every
# corpus entry against it in a single pass
vectorizer = CountVectorizer(max_features=5000)
vector = vectorizer.fit_transform(corpus)

# Report the shape of the encoded matrix
print(vector.shape)

(62423, 5000)


In [10]:
# Column labels for the count matrix. `get_feature_names_out` is the current
# scikit-learn API; fall back to the older `get_feature_names` on
# installations that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Densify the sparse counts into a DataFrame with one column per vocabulary term
vector_df = pd.DataFrame(vector.toarray(), columns=feature_names)
vector_df.head()

Unnamed: 0,11,aaronguzikowski,aaronj,aaronkatz,aaronmoorhead,aaronseltzer,aaronsorkin,abaire,abbaskiarostami,abbykohn,...,éricbesnard,éricrohmer,érictoledano,étiennechatiliez,étiennefaure,évagárdos,óscaraibar,ömerfaruksorak,önderçakar,özgüryildirim
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Put the vectorized features next to their movieId/corpus columns,
# keeping only the index labels present in both frames
frames = [corpus_df, vector_df]
merge_df = pd.concat(objs=frames, join='inner', axis=1)


In [12]:
# Summarize size, dtypes and memory usage of the combined table
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Columns: 5002 entries, movieId to özgüryildirim
dtypes: int64(5001), object(1)
memory usage: 2.3+ GB


In [33]:
# Save the combined movieId/corpus/feature table to disk without the index column
merge_df.to_csv("merge_vect_df.csv", index= False)

# Model


In [13]:
# Work on an independent copy of the in-memory feature table
merge_vect_df = merge_df.copy(deep=True)

In [14]:
# Reload the ratings from disk
ratings = pd.read_csv("data/train.csv")

In [None]:
# Import data

start = time.time()

# Reload the vectorized movie features saved earlier in the notebook
merge_vect_df = pd.read_csv("merge_vect_df.csv")

stop = time.time()
# This cell times the CSV load, not model training, so it is labelled as such
print(f"Loading time: {round(stop - start,2)}s")

In [15]:
# Order the ratings by user
ratings = ratings.sort_values(by='userId')

In [16]:
# Preview the first five rows of the sorted ratings
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
6308822,1,296,5.0,1147880044
3137042,1,27721,3.0,1147869115
2533005,1,665,5.0,1147878820
2524478,1,4308,3.0,1147868534
1946297,1,1250,4.0,1147868414


In [17]:
# Remove the timestamp column. errors='ignore' keeps this cell safe to
# re-run after the column has already been dropped.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

In [30]:
len(sample_df)

10000

In [19]:
sample_df = ratings[:10000]

In [22]:
# Preview the first five rows of the movie feature table
merge_vect_df.head()

Unnamed: 0,movieId,11,aaronguzikowski,aaronj,aaronkatz,aaronmoorhead,aaronseltzer,aaronsorkin,abaire,abbaskiarostami,...,éricbesnard,éricrohmer,érictoledano,étiennechatiliez,étiennefaure,évagárdos,óscaraibar,ömerfaruksorak,önderçakar,özgüryildirim
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Remove the raw text column so only movieId and the count features remain.
# errors='ignore' keeps this cell safe to re-run after the column is gone.
merge_vect_df = merge_vect_df.drop(columns=['corpus'], errors='ignore')

Merge the ratings sample with the vectorized movie features on `movieId`.


In [23]:
# Attach the movie features to each sampled rating; the left join keeps every rating row
train_df = pd.merge(sample_df, merge_vect_df, how="left", on='movieId')

In [24]:
# Preview the first five rows of the training table
train_df.head()

Unnamed: 0,userId,movieId,rating,11,aaronguzikowski,aaronj,aaronkatz,aaronmoorhead,aaronseltzer,aaronsorkin,...,éricbesnard,éricrohmer,érictoledano,étiennechatiliez,étiennefaure,évagárdos,óscaraibar,ömerfaruksorak,önderçakar,özgüryildirim
0,1,296,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,27721,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,665,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,4308,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1250,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Identify features and target

# Target: the rating column
y = train_df['rating']

# Features: every column except the identifiers and the target
X = train_df.drop(columns=['userId', 'movieId', 'rating'])

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
import time

In [29]:

# Initialize model
xgb_reg = XGBRegressor()

# Fit the model, timing only the training step so that the reported
# "Training time" does not also include prediction
start = time.perf_counter()
xgb_reg.fit(X_train, y_train)
stop = time.perf_counter()
print(f"Training time: {round(stop - start,2)}s")

# Create predictions for the held-out rows
y_pred_5 = xgb_reg.predict(X_test)

# Determine RMSE
RMSE_5 = np.sqrt(mean_squared_error(y_test, y_pred_5))

print('RMSE:', RMSE_5)

# Experiment log: RMSE per corpus and per set of dropped columns

# genre vector
# drop(['userId','movieId', 'rating'] RMSE: 1.02767
# ['movieId', 'rating']  RMSE: 0.93872
#  RMSE: 0.92090

# genre director vector
# ['movieId', 'rating'] RMSE: 
# drop(['userId','movieId', 'rating'] RMSE: 1.007610



Training time: 65.74s
RMSE: 1.0076105574082672


In [31]:
# Preview the movieId/corpus table
corpus_df.head()

Unnamed: 0,movieId,corpus
0,1,Adventure Animation Children Comedy Fantasy Jo...
1,2,Adventure Children Fantasy JonathanHensleigh
2,3,Comedy Romance MarkStevenJohnson
3,4,Comedy Drama Romance TerryMcMillan
4,5,Comedy AlbertHackett


In [32]:
# Preview the first five rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating
6308822,1,296,5.0
3137042,1,27721,3.0
2533005,1,665,5.0
2524478,1,4308,3.0
1946297,1,1250,4.0


In [40]:
# Attach each movie's corpus text to its ratings; the left join keeps every rating row
r_c_merge = pd.merge(ratings, corpus_df, how='left', on='movieId')

In [47]:
# Summarize size, dtypes and memory usage of the ratings/corpus table
r_c_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000038 entries, 0 to 10000037
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
 3   corpus   object 
dtypes: float64(1), int64(2), object(1)
memory usage: 381.5+ MB


In [58]:
# Take the first 10,000 rows as a working sample
r_c_merge_sample = r_c_merge.iloc[:10000]

In [59]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
# Two-step text pipeline: bag-of-words counts feeding an SGD-trained linear classifier.
# NOTE(review): `rating` is float64 in r_c_merge; confirm a classifier,
# rather than a regressor, is intended before fitting on it.
pipeline_sgd = Pipeline(
    steps=[("vect", CountVectorizer()), ("clf", SGDClassifier())]
)

In [65]:
# Identify features and target

# Target: the rating column
y = r_c_merge_sample['rating']

# Features: what remains after removing the identifiers and the target.
# NOTE(review): X is a DataFrame here, not a Series of strings; confirm
# that whatever consumes X expects a DataFrame.
X = r_c_merge_sample.drop(columns=['userId', 'movieId', 'rating'])

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [66]:
# Confirm the number of rows in the feature table
len(X)

10000