In [1]:
import pandas as pd
from pandasgui import show
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix,\
    classification_report, plot_roc_curve, auc
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_actors = pd.read_csv('./data/movie_actors.csv')
df_countries = pd.read_csv('./data/movie_countries.csv')
df_directors = pd.read_csv('./data/movie_directors.csv')
df_genres = pd.read_csv('./data/movie_genres.csv')
df_imdb = pd.read_csv('./data/movie_imdb.csv')
df_locations = pd.read_csv('./data/movie_locations.csv')
df_tags = pd.read_csv('./data/movie_tags.csv')
df_rottem = pd.read_csv('./data/movies_rt.csv')
df_movies = pd.read_csv('./data/movies.csv')
df_ratings = pd.read_csv('./data/ratings_train.csv')
df_profile = pd.read_csv('./data/profile_by_user.csv')

df_test = pd.read_csv('./data/ratings_test.csv')


In [3]:
df_movies.head()

Unnamed: 0,id,title,year,imdbID,rtID
0,1,Toy story,1995,114709,toy_story
1,2,Jumanji,1995,113497,1068044-jumanji
2,3,Grumpy Old Men,1993,107050,grumpy_old_men
3,4,Waiting to Exhale,1995,114885,waiting_to_exhale
4,5,Father of the Bride Part II,1995,113041,father_of_the_bride_part_ii


In [4]:
list_of_genres = df_genres.genre.unique().tolist()
list_of_genres.sort()
df_genres['value'] = 1
df_genres = df_genres.pivot(index='movieID', columns='genre', values='value').fillna(0).rename_axis(columns = None).reset_index()

In [5]:
df_movies = df_movies.merge(df_genres, left_on='id', right_on='movieID')

In [6]:
df_movies

Unnamed: 0,id,title,year,imdbID,rtID,movieID,Action,Adventure,Animation,Children,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Short,Thriller,War,Western
0,1,Toy story,1995,114709,toy_story,1,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,113497,1068044-jumanji,2,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpy Old Men,1993,107050,grumpy_old_men,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,1995,114885,waiting_to_exhale,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,1995,113041,father_of_the_bride_part_ii,5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10192,65088,Bedtime Stories,2008,960731,bedtime_stories,65088,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10193,65091,Manhattan Melodrama,1934,25464,manhattan_melodrama,65091,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10194,65126,Choke,2008,1024715,choke,65126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10195,65130,Revolutionary Road,2008,959337,revolutionary_road,65130,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
list_of_user_movie_similarity = []
for user_id, group in tqdm(df_ratings.groupby('userID')):
    for index, row in group.iterrows():
        user_movie_similarity = dict()
        profile = df_profile.loc[df_profile.userID == user_id][list_of_genres].values[0]
        movie = df_movies.query("id == {}".format(row.movieID))[list_of_genres].values[0]
        similarity = np.dot(movie, profile) / (np.linalg.norm(movie) * np.linalg.norm(profile))
        user_movie_similarity['userID'] = user_id
        user_movie_similarity['movieID'] = row.movieID
        user_movie_similarity['similarity'] = similarity
        list_of_user_movie_similarity.append(user_movie_similarity)
df_user_movie_similarity = pd.DataFrame(list_of_user_movie_similarity)

100%|██████████| 2113/2113 [31:59<00:00,  1.10it/s]


In [8]:
df_ratings = df_ratings.merge(df_user_movie_similarity, left_on=['userID', 'movieID'], right_on=['userID', 'movieID'])
df_ratings.to_csv('./data/ratings_train_similarity.csv', index=False)
df_ratings.head()

Unnamed: 0,ID,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second,similarity
0,1,51198,608,5.0,17,9,1997,23,40,20,0.467004
1,2,51198,141,4.0,17,9,1997,23,41,58,0.22127
2,3,51198,1073,4.0,17,9,1997,23,41,6,0.45196
3,4,51198,356,4.0,17,9,1997,23,43,44,0.46008
4,5,51198,1198,5.0,17,9,1997,23,44,35,0.317708


In [9]:
list_of_user_movie_similarity = []
for user_id, group in tqdm(df_test.groupby('userID')):
    for index, row in group.iterrows():
        user_movie_similarity = dict()
        profile = df_profile.loc[df_profile.userID == user_id][list_of_genres].values[0]
        movie = df_movies.query("id == {}".format(row.movieID))[list_of_genres].values[0]
        similarity = np.dot(movie, profile) / (np.linalg.norm(movie) * np.linalg.norm(profile))
        user_movie_similarity['userID'] = user_id
        user_movie_similarity['movieID'] = row.movieID
        user_movie_similarity['similarity'] = similarity
        list_of_user_movie_similarity.append(user_movie_similarity)
df_user_movie_similarity = pd.DataFrame(list_of_user_movie_similarity)

100%|██████████| 2103/2103 [03:26<00:00, 10.20it/s]


In [10]:
df_user_movie_similarity.head()

Unnamed: 0,userID,movieID,similarity
0,75,920.0,0.41655
1,75,173.0,0.463611
2,75,32587.0,0.423715
3,75,45431.0,0.452658
4,75,1370.0,0.45302


In [11]:
df_test = df_test.merge(df_user_movie_similarity, left_on=['userID', 'movieID'], right_on=['userID', 'movieID'])
df_test.to_csv('./data/ratings_test_similarity.csv', index=False)
df_test.head()

Unnamed: 0,ID,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second,similarity
0,9,29202,1608,,18,9,1997,0,53,58,0.324408
1,18,32283,1485,,18,9,1997,15,45,18,0.217213
2,34,32283,30,,18,9,1997,16,36,38,0.324625
3,39,32283,1545,,18,9,1997,16,47,39,0.231302
4,50,51198,1258,,20,9,1997,0,11,21,0.309244


In [130]:
# Separo mi train & test
X_train, X_test, y_train, y_test =\
    train_test_split(df_ratings.drop(['rating', 'ID'], axis=1), df_ratings.rating, test_size=0.3, random_state=0)

# Entrenamiento regresion lineal
model = LinearRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
scores = model.score(X_test, y_test)
print("Linear regression: ", mean_squared_error(y_test, y_predicted))
print("Linear regression R2: ", r2_score(y_test, y_predicted))

Linear regression:  0.994783422093942
Linear regression R2:  0.013122951464400345


In [131]:
model.fit(df_ratings.drop(['rating', 'ID'], axis=1), df_ratings.rating)
prediction = model.predict(df_test.drop(['rating', 'ID', 'rating'], axis=1))
submission = pd.DataFrame(
        {'ID': df_test.ID, 'rating': np.around(prediction, 2)})
submission.to_csv('./submission_similarity.csv', index=False)
# kaggle competitions submit -c recomendacion-de-peliculas-fcen-2020 -f notebooks/submission_similarity.csv -m "Con similaridad"