In [14]:
#data analysis libraries 
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#ignore warnings
import warnings

warnings.filterwarnings('ignore')

In [15]:
# https://aigamer28100.github.io/Movie-Lens-Dataset-Visualization-and-Prediction/

In [16]:
mldir="../data/ml-20m/"

#import train and test CSV files
movie = pd.read_csv(mldir+"movies.csv")
ratings = pd.read_csv(mldir+"ratings.csv")
links = pd.read_csv(mldir+"links.csv")
tags = pd.read_csv(mldir+"tags.csv")
#take a look at the training data
print(movie.shape)
print(ratings.shape)
print(links.shape)
print(tags.shape,end="\n\n")
#m=pd.read_csv("movies.csv")


(27278, 3)
(20000263, 4)
(27278, 3)
(465564, 4)



In [17]:

#get a list of the features within the dataset
print("Movie : ", movie.columns,end="\n\n")
print("Rating : ", ratings.columns,end="\n\n")
print("Links : ", links.columns,end="\n\n")
print("Tags : ", tags.columns,end="\n\n")

movie.info()
ratings.info()
tags.info()

Movie :  Index(['movieId', 'title', 'genres'], dtype='object')

Rating :  Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

Links :  Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

Tags :  Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 610.4 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entrie

In [18]:
movie.head(3)



Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


{'A', ')', 'r', 'R', 'N', '|', ' ', 'v', 'C', 'l', 'y', 'a', '(', 's', 'M', 'u', 'e', 'o', 't', 'D', 'i', 'I', 'c', 'd', 'W', 'S', '-', 'm', 'n', 'H', 'F', 'X', 'g', 'T', 'h'}


In [19]:
links.head(3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


In [20]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819


In [21]:
tags.head(3)


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079


In [22]:
# Droping the timestamp column from ratings and tags file
ratings.drop(columns='timestamp',inplace=True)
tags.drop(columns='timestamp',inplace=True)

In [23]:
all_categories = set(cat for sublist in movie['genres'].str.split("|") for cat in sublist)
print(all_categories)

{'Adventure', 'Fantasy', 'Horror', 'IMAX', 'Romance', 'Musical', 'Children', 'Documentary', 'Animation', 'Western', 'Crime', 'Mystery', 'Comedy', 'War', 'Sci-Fi', 'Thriller', 'Drama', '(no genres listed)', 'Action', 'Film-Noir'}


In [24]:
for category in all_categories:
    movie[category] = movie['genres'].apply(lambda x: 1 if category in x else 0)

# Drop the original 'Categories' column
movie = movie.drop('genres', axis=1)
    

In [25]:
#Add a Column `rating` in movie DF and assign them with the Mean Movie Rating for that Movie.
x=ratings.groupby('movieId').rating.mean()
movie = pd.merge(movie,x,how='outer',on='movieId')
movie['rating'].fillna('0',inplace=True)

In [26]:
movie.head()

Unnamed: 0,movieId,title,Adventure,Fantasy,Horror,IMAX,Romance,Musical,Children,Documentary,...,Mystery,Comedy,War,Sci-Fi,Thriller,Drama,(no genres listed),Action,Film-Noir,rating
0,1,Toy Story (1995),1,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,3.92124
1,2,Jumanji (1995),1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3.211977
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,3.15104
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,2.861393
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,3.064592


In [36]:
#importing necessary packages for model prediction and evaluation
import sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split

In [28]:
movie.columns

Index(['movieId', 'title', 'Adventure', 'Fantasy', 'Horror', 'IMAX', 'Romance',
       'Musical', 'Children', 'Documentary', 'Animation', 'Western', 'Crime',
       'Mystery', 'Comedy', 'War', 'Sci-Fi', 'Thriller', 'Drama',
       '(no genres listed)', 'Action', 'Film-Noir', 'rating'],
      dtype='object')

In [29]:
#split the data into features and results
X = movie[movie.columns[3:-1]]
y = movie[movie.columns[-1]]

In [30]:
X_1 , X_test , y_1 , y_test = train_test_split(X ,y ,test_size=0.20 ,random_state = 1 ,shuffle = True)

X_train , X_validation , y_train , y_validation = train_test_split(X_1 ,y_1 ,test_size=0.20 ,random_state = 1 ,shuffle = True)


In [31]:
X_train.shape

(17457, 19)

In [32]:
X_train.head()

Unnamed: 0,Fantasy,Horror,IMAX,Romance,Musical,Children,Documentary,Animation,Western,Crime,Mystery,Comedy,War,Sci-Fi,Thriller,Drama,(no genres listed),Action,Film-Noir
359,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
17425,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0
5307,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
5990,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
7787,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0


In [37]:
model = DecisionTreeRegressor(max_depth=10,random_state=42)
model.fit(X_train,y_train)
print(mean_squared_error(model.predict(X_train),y_train))

0.549296099310675


In [38]:
tree_params = {'max_depth':[3,5,10]}
DTCG = GridSearchCV(DecisionTreeRegressor(random_state=42), tree_params, cv=5)
DTCG.fit(X_1,y_1)
DTC=DTCG.best_estimator_
score=DTCG.best_score_
print(score)

0.06068867708598267


In [39]:
mean_squared_error(DTC.predict(X_test),y_test)

0.5717914257637114