In [102]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from collections import deque

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model
from sklearn.model_selection import train_test_split

# To create sparse matrices
from scipy.sparse import coo_matrix

# To stack sparse matrices
from scipy.sparse import vstack

In [103]:
# Load the inputs used throughout the notebook (paths are relative to the working directory).
# Drama metadata table
df_use = pd.read_csv('df_use.csv')
# Reviewer table; NOTE(review): not referenced again in the visible cells except where the name is rebound
users = pd.read_parquet('reviewed_users.parquet', engine='pyarrow')
# Review table (one row per rating)
reviews = pd.read_parquet('reviews.parq', engine='pyarrow')
# Drama titles, indexed by the drama id column 'sid' so they can be joined onto drama ids
df_titles = pd.read_csv("df_titles.csv", index_col='sid')

In [104]:
# Preview the title lookup table (index: sid, column: Name)
df_titles.head()

Unnamed: 0_level_0,Name
sid,Unnamed: 1_level_1
15673,tunnel
40541,twenty-five twenty-one
26981,dr. romantic season 2
29419,move to heaven
17517,the king’s avatar


## Starting the training and recommendations

In [105]:
# Shuffle the reviews before splitting into train and test.
# frac=1 keeps every row and only randomises the order; a fixed random_state makes the
# shuffle (and therefore every split and metric derived from it) reproducible after
# Restart Kernel -> Run All.
# To work on a fixed-size subset instead, use: reviews.sample(n=1000000, random_state=42)
reviews = reviews.sample(frac=1, random_state=42).reset_index(drop=True)

In [106]:
# Row count, column dtypes and memory footprint of the shuffled review table
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3595051 entries, 0 to 3595050
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   User    int64  
 1   Rating  float64
 2   Drama   int64  
dtypes: float64(1), int64(2)
memory usage: 82.3 MB


In [107]:
# For deep learning, consider training on a subset rather than the full dataset, because the DataFrame has over 3 million reviews.

In [108]:
# Split the shuffled reviews into a training set and a test set.
#
# Alternative kept for reference: hold out a fixed number of rows instead of a fraction.
# n = 100000
# df_train = reviews[:-n]
# df_test = reviews[-n:]

# Hold out 10% of the reviews for testing; random_state pins the split for a given row order
df_train, df_test = train_test_split(reviews, test_size=0.10, random_state=42)

In [109]:
# Confirm the size and dtypes of the training split
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3235545 entries, 1224085 to 2219110
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   User    int64  
 1   Rating  float64
 2   Drama   int64  
dtypes: float64(1), int64(2)
memory usage: 98.7 MB


In [110]:
# Build the user x drama rating matrix from the training reviews.
# Rows are users, columns are dramas; cells without a rating stay NaN.
df_p = pd.pivot_table(df_train, values='Rating', index='User', columns='Drama')

matrix_shape = df_p.shape
print('Shape User-Drama-Matrix:\t{}'.format(matrix_shape))
df_p

Shape User-Drama-Matrix:	(10873, 3346)


Drama,1,7,9,10,11,13,14,15,17,18,...,44437,44472,44543,44690,44721,44909,45026,45472,45613,45877
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
6,9.0,6.5,,,,,,,,8.0,...,,,,,,,,,,
10,,9.5,,,,,,,9.0,10.0,...,,,,,,,,,,
12,9.5,8.5,,,,,,,,,...,,,,,,,,,,
23,10.0,10.0,,,,,,,9.0,9.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84562,8.0,8.0,,,7.0,,,,,9.0,...,,,,,,,,,,
84581,,,,,,,,,,,...,,,,,,,,,,
84597,,7.5,,,,,,,,,...,,,,,,,,,,
84600,,,,,,,,,,,...,,,,,,,,,,


In [111]:
# Baseline 1: predict every rating of a drama with that drama's mean training rating.

# Number of top-ranked dramas to show in the chart
n = 10

# Mean rating per drama (column-wise over the user-drama matrix), best first
ratings_mean = df_p.mean(axis=0).sort_values(ascending=False).rename('Rating-Mean').to_frame()

# Number of ratings per drama
ratings_count = df_p.count(axis=0).rename('Rating-Count').to_frame()

# Top-n dramas by mean rating, with their rating count and title
ranking_mean_rating = ratings_mean.head(n).join(ratings_count).join(df_titles)


# Attach the per-drama mean (the prediction) to every test review (the label)
df_prediction = df_test.set_index('Drama').join(ratings_mean)[['Rating', 'Rating-Mean']]

# Report missing values before imputing: a NaN prediction means the drama
# never appeared in the training matrix
nan_in_y_true = df_prediction['Rating'].isna().sum()
nan_in_y_pred = df_prediction['Rating-Mean'].isna().sum()

print(f"NaNs in y_true: {nan_in_y_true}")
print(f"NaNs in y_pred: {nan_in_y_pred}")

# mean_squared_error rejects NaNs, so fill each column with its own mean.
# (Alternative: drop the affected rows with
#  df_prediction.dropna(subset=['Rating', 'Rating-Mean']).)
# The frame is re-assigned instead of calling fillna(inplace=True) on a selected column:
# the chained in-place form is deprecated and does nothing under pandas copy-on-write.
df_prediction = df_prediction.fillna(df_prediction.mean())

y_true = df_prediction['Rating']
y_pred = df_prediction['Rating-Mean']

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

NaNs in y_true: 0
NaNs in y_pred: 0


In [112]:
# Horizontal bar chart of the top-n dramas by mean rating.

# Bar label: "<title>: <number of ratings> Ratings"
bar_labels = (ranking_mean_rating['Name'].astype(str)
              + ': '
              + ranking_mean_rating['Rating-Count'].astype(str)
              + ' Ratings')

trace = go.Bar(
    x=ranking_mean_rating['Rating-Mean'],
    y=list(range(1, n + 1)),  # rank positions 1..n
    orientation='h',
    text=bar_labels,
    textposition='outside',
    textfont={'color': '#000000'},
    marker={'color': '#db0000'},
)

# The title also reports the RMSE of the mean-rating baseline
chart_title = 'Ranking Of Top {} Mean-Drama-Ratings: {:.4f} RMSE'.format(n, rmse)
layout = {
    'title': chart_title,
    'xaxis': {'title': 'Mean-Rating', 'range': (8, 10)},
    'yaxis': {'title': 'Drama'},
}

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [113]:
# Baseline 2: weighted rating. Each drama's mean is shrunk towards the global mean C;
# the fewer ratings a drama has (relative to m), the stronger the shrinkage.

# Most low-rated dramas have already been removed; m is the rating count at which
# a drama's own mean and the global mean get equal weight
m = 200

# Global mean over every observed rating in the training matrix
C = df_p.stack().mean()

# Per-drama mean rating
R = df_p.mean(axis=0).values

# Per-drama rating count
v = df_p.count().values


# Weighted formula: v/(v+m) * R + m/(v+m) * C
weighted_score = (v / (v + m) * R) + (m / (v + m) * C)
# Column positions ordered from best to worst weighted score
weighted_ranking = np.argsort(weighted_score)[::-1]
# Scores in ranking order (reuse the ordering instead of sorting a second time)
weighted_score = weighted_score[weighted_ranking]
# Drama ids (sid) in ranking order
weighted_drama_ids = df_p.columns[weighted_ranking]


# Join labels and predictions
weighted_predictions = pd.DataFrame(weighted_score, index=weighted_drama_ids, columns=['Prediction'])
df_prediction = df_test.set_index('Drama').join(weighted_predictions)[['Rating', 'Prediction']]

# A test drama that never occurs in the training matrix has no joined score.
# For v = 0 the weighted formula reduces to C, so C is the consistent fallback
# (and mean_squared_error would otherwise raise on the NaN).
df_prediction = df_prediction.fillna({'Prediction': C})

y_true = df_prediction['Rating']
y_pred = df_prediction['Prediction']

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))


In [114]:
# Horizontal bar chart of the top-n dramas by weighted rating.

# Top-n weighted scores indexed by drama id. Both slices use n (the original hard-coded 10
# for the ids), so scores and ids always have matching length.
df_plot = pd.DataFrame(weighted_score[:n], columns=['Rating'])
df_plot.index = weighted_drama_ids[:n]
# Add the rating count and the title of each drama
ranking_weighted_rating = df_plot.join(ratings_count).join(df_titles)
del df_plot


# Create trace; bar label is "<title>: <number of ratings> Ratings"
trace = go.Bar(x = ranking_weighted_rating['Rating'],
               text = ranking_weighted_rating['Name'].astype(str) +': '+ ranking_weighted_rating['Rating-Count'].astype(str) + ' Ratings',
               textposition = 'outside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n+1)),
               marker = dict(color = '#db0000'))
# Create layout; the title also reports the RMSE of the weighted-rating baseline
layout = dict(title = 'Ranking Of Top {} Weighted-Drama-Ratings: {:.4f} RMSE'.format(n, rmse),
              xaxis = dict(title = 'Weighted Rating',
                          range = (7, 10)),
              yaxis = dict(title = 'Drama'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [115]:
# The user-user similarity approach is expensive, so work on a fixed-size random subset
# of the reviews instead of the full table.
# random_state pins the subset: the next cell splits train/test by row position, so an
# unseeded sample would change both sets (and every reported RMSE) on each run.
reviews_small = reviews.sample(n=1000000, random_state=42)

# To use the full data instead:
# reviews_small = reviews

In [116]:
# Positional train/test split of the sampled reviews: the last n rows form the test set.
# NOTE: this rebinds n, which earlier cells use as the "top n" chart size.
n = 100000

df_test = reviews_small.iloc[-n:]
df_train = reviews_small.iloc[:-n]

In [117]:
# Rebuild the user x drama rating matrix from the smaller training split.
# Rows are users, columns are dramas; cells without a rating stay NaN.
df_p = pd.pivot_table(df_train, values='Rating', index='User', columns='Drama')

matrix_shape = df_p.shape
print('Shape User-Drama-Matrix:\t{}'.format(matrix_shape))
df_p

Shape User-Drama-Matrix:	(10873, 3346)


Drama,1,7,9,10,11,13,14,15,17,18,...,44437,44472,44543,44690,44721,44909,45026,45472,45613,45877
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
6,9.0,6.5,,,,,,,,,...,,,,,,,,,,
10,,9.5,,,,,,,,,...,,,,,,,,,,
12,,8.5,,,,,,,,,...,,,,,,,,,,
23,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84562,,,,,,,,,,,...,,,,,,,,,,
84581,,,,,,,,,,,...,,,,,,,,,,
84597,,,,,,,,,,,...,,,,,,,,,,
84600,,,,,,,,,,,...,,,,,,,,,,


In [118]:
# THIS CODE TAKES AGES! AT LEAST 10 MINUTES!!
# User-based collaborative filtering: cosine similarity between all pairs of users.

# Row position (in df_p) of the user to generate recommendations for
user_index = 0

# Number of most-similar users that contribute to a recommendation
n_recommendation = 100

# Number of recommendations to plot
n_plot = 10

print("Starting to impute missing values...")
# Replace each user's missing ratings with that user's own mean rating.
# fillna with a Series aligns on column labels, hence the transpose round-trip.
df_p_imputed = df_p.T.fillna(df_p.mean(axis=1)).T
print("Imputation completed.")

print("Computing cosine similarity between all users...")
# Dense (n_users x n_users) similarity matrix
similarity = cosine_similarity(df_p_imputed.values)
print("Cosine similarity computation completed.")

print("Processing similarity matrix...")
# Remove self-similarity by subtracting 1 from the diagonal. Done in place on the diagonal
# only, which avoids allocating a second dense n_users x n_users identity matrix.
np.fill_diagonal(similarity, similarity.diagonal() - 1)

print("Sorting similar users...")
# Row positions of all users, most similar to user_index first
similar_user_index = np.argsort(similarity[user_index])[::-1]
# Matching similarity scores in the same order (reuses the ordering instead of re-sorting)
similar_user_score = similarity[user_index][similar_user_index]
print("Sorting completed.")


Starting to impute missing values...
Imputation completed.
Computing cosine similarity between all users...
Cosine similarity computation completed.
Processing similarity matrix...
Sorting similar users...
Sorting completed.


In [119]:
# Recommend dramas to the selected user from the ratings of the most similar users.

# Dramas the selected user has not rated in the training matrix
user_ratings = df_p.iloc[user_index]
unrated_dramas = user_ratings[user_ratings.isna()].index

# Ratings and similarity scores of the n_recommendation most similar users
neighbour_ratings = df_p_imputed.iloc[similar_user_index[:n_recommendation]]
neighbour_scores = similar_user_score[:n_recommendation]

# Similarity-weighted average rating per drama. Dividing by the sum of the weights (not by
# the number of neighbours) keeps the result on the rating scale and matches the formula
# used when the approach is evaluated on the test set. The ranking itself is unaffected.
mean_drama_recommendations = neighbour_ratings.mul(neighbour_scores, axis=0).sum(axis=0) / neighbour_scores.sum()

# Keep only unrated dramas, best first, and attach the titles
best_drama_recommendations = mean_drama_recommendations[unrated_dramas].sort_values(ascending=False).to_frame().join(df_titles)




In [120]:
# Evaluate user-based collaborative filtering on the test set.
# NOTE: `similarity` must still be the user-user matrix here; later cells rebind that name
# to a drama-drama matrix, so do not re-run this cell after them.

# Map user id -> row position in df_p_imputed / the similarity matrix
user_id_mapping = {id: i for i, id in enumerate(df_p_imputed.index)}

# Fallback prediction for users or dramas that do not occur in the training matrix
# (the original code raised KeyError for them)
global_mean = df_train['Rating'].mean()

prediction = []
# One group per test user: avoids re-filtering the whole test frame for every user
for user_id, user_rows in df_test.groupby('User', sort=False):
    # unique(): a repeated (User, Drama) pair must yield one prediction row, otherwise
    # the join below would multiply rows
    drama_ids = user_rows['Drama'].unique()

    user_pos = user_id_mapping.get(user_id)
    if user_pos is None:
        # User never seen in training: no neighbours to ask
        prediction.extend([user_id, drama_id, global_mean] for drama_id in drama_ids)
        continue

    # The n_recommendation most similar users and their scores. Local names are used so the
    # ranking computed for `user_index` in the earlier cell is not overwritten.
    neighbour_index = np.argsort(similarity[user_pos])[::-1][:n_recommendation]
    neighbour_score = similarity[user_pos][neighbour_index]
    neighbour_ratings = df_p_imputed.iloc[neighbour_index]

    # Split the user's test dramas into those with a training column and those without
    is_known = np.isin(drama_ids, df_p_imputed.columns)
    known_dramas = drama_ids[is_known]

    if len(known_dramas):
        # Similarity-weighted average rating of the neighbours, for all dramas at once
        scores = neighbour_score @ neighbour_ratings[known_dramas].to_numpy() / neighbour_score.sum()
        prediction.extend([user_id, drama_id, score] for drama_id, score in zip(known_dramas, scores))

    # Dramas never seen in training
    prediction.extend([user_id, drama_id, global_mean] for drama_id in drama_ids[~is_known])


# Create prediction DataFrame and align it with the test labels
df_pred = pd.DataFrame(prediction, columns=['User', 'Drama', 'Prediction']).set_index(['User', 'Drama'])
df_pred = df_test.set_index(['User', 'Drama']).join(df_pred)


# Get labels and predictions
y_true = df_pred['Rating'].values
y_pred = df_pred['Prediction'].values

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))




In [121]:
# Horizontal bar chart of the top n_plot recommendations for the selected user.

# Restrict scores AND titles to the same top n_plot rows (the original passed every
# title as `text` while plotting only n_plot bars)
top_recommendations = best_drama_recommendations.iloc[:n_plot]

# Create trace; column 0 holds the recommendation score
trace = go.Bar(x = top_recommendations.iloc[:, 0],
               text = top_recommendations['Name'],
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout; the title also reports the test RMSE of the similarity-based predictions
layout = dict(title = 'Ranking Of Top {} Recommended Dramas For A User Based On Similarity: {:.4f} RMSE'.format(n_plot, rmse),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (8, 10)),
              yaxis = dict(title = 'Drama'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [122]:
## Content-based idea: TF-IDF similarity built from each drama's tags?
# Preview the drama metadata table first
df_use.head()

Unnamed: 0.1,Unnamed: 0,Name,category,country,num_episodes,aired,watchers,rating,num_raters,cast_names,genre_names,tag_names,synopsis,url,sid,num_reviews
0,0,tunnel,Drama,South Korea,16.0,2017,32651,8.7,14787,"Choi Jin Hyuk,Yoon Hyun Min,Lee Yoo Young,Jo H...","Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1,15673.0,4130.0
1,1,twenty-five twenty-one,Drama,South Korea,16.0,2022,16043,8.8,2119,"Kim Tae Ri,Nam Joo Hyuk,Bona,Choi Hyun Wook,Le...","Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1,40541.0,3731.0
2,2,dr. romantic season 2,Drama,South Korea,16.0,2020,31855,8.7,15880,"Han Seok Kyu,Ahn Hyo Seop,Lee Sung Kyung,Kim J...","Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1,26981.0,4129.0
3,3,move to heaven,Drama,South Korea,10.0,2021,40962,9.2,20399,"Lee Je Hoon,Tang Jun Sang,Hong Seung Hee,Jung ...","Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1,29419.0,3911.0
4,4,the king’s avatar,Drama,China,40.0,2019,24236,8.6,8734,"Yang Yang,Jiang Shu Ying,Lai Yu Meng,Daisy Li,...","Action,Friendship,Youth,Sports","Online Gaming,Strong Friendship,Smart Male Le...","In the multiplayer online game Glory, Ye Xiu i...",https://i.mydramalist.com/2O0xEc.jpg?v=1,17517.0,2029.0


In [123]:
# Maybe try combining the genre and tag columns? Start with tag_names alone, then see whether combining helps.

In [124]:
# Tag text per drama, indexed by drama name.
# set_index returns a new frame here; calling it with inplace=True on a column selection
# of df_use can trigger pandas' chained-assignment warning.
df_tags = df_use[['Name', 'tag_names']].set_index('Name')
df_tags.head()

Unnamed: 0_level_0,tag_names
Name,Unnamed: 1_level_1
tunnel,"Time Travel,Murder,Criminal Profiler,Serial K..."
twenty-five twenty-one,"Athlete,Fencing,1990s,Coming Of Age,Bold Fema..."
dr. romantic season 2,"Hospital,Smart Male Lead,Character Developmen..."
move to heaven,"Autism,Uncle-Nephew Relationship,Death,Savant..."
the king’s avatar,"Online Gaming,Strong Friendship,Smart Male Le..."


In [125]:
# Check that the query drama used below exists and inspect its tags
df_tags[df_tags.index=="semantic error"]

Unnamed: 0_level_0,tag_names
Name,Unnamed: 1_level_1
semantic error,"LGBTQ+,Student,Opposites Attract,University,G..."


In [126]:
# Content-based similarity: TF-IDF over each drama's tags, compared with cosine similarity.

# Drop dramas without tags ONCE and use this same Series for every positional lookup below.
# The original vectorised the dropna()'d column but looked up positions in the full df_tags,
# so any missing tag row shifted the matrix rows relative to the titles.
tags_clean = df_tags['tag_names'].dropna()

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tags_clean)


# Compute cosine similarity between the tag texts of all dramas
# NOTE: this rebinds `similarity`, which previously held the user-user matrix
similarity = cosine_similarity(tfidf_matrix)
# Remove self-similarity from matrix
similarity -= np.eye(similarity.shape[0])


# Drama to find similar dramas for
movie = 'semantic error'
n_plot = 10
# Row position of the query drama in the similarity matrix (first match by name;
# raises IndexError if the name is absent or has no tags)
index = np.flatnonzero(tags_clean.index == movie)[0]

# Positions and scores of the n_plot most similar dramas, best first
similar_drama_index = np.argsort(similarity[index])[::-1][:n_plot]
similar_drama_score = similarity[index][similar_drama_index]

# Titles of the similar dramas, taken from the same Series the matrix was built from
similar_drama_titles = tags_clean.index[similar_drama_index]


# Create trace
trace = go.Bar(x = similar_drama_score,
               text = similar_drama_titles,
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Most Similar Drama Descriptions For "{}"'.format(n_plot, movie),
              xaxis = dict(title = 'Cosine TFIDF Description Similarity',
                           #range = (0, 0.4)
                          ),
              yaxis = dict(title = 'Drama'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [127]:
# Combine 'genre_names' and 'tag_names' into one text column per drama.
# Missing values become empty strings: str(NaN) would otherwise put the literal word "nan"
# into the text, which TF-IDF would treat as a token shared by unrelated dramas.
# Vectorised string concatenation replaces the row-wise apply.
df_use['combined_genres_tags'] = (
    df_use['genre_names'].fillna('').astype(str)
    + " "
    + df_use['tag_names'].fillna('').astype(str)
)
# Combined text indexed by drama name (new frame; no in-place mutation of a selection)
df_gtags = df_use[['Name', 'combined_genres_tags']].set_index('Name')
df_gtags.head()

Unnamed: 0_level_0,combined_genres_tags
Name,Unnamed: 1_level_1
tunnel,"Thriller,Mystery,Sci-Fi,Fantasy Time Travel,..."
twenty-five twenty-one,"Romance,Life,Youth,Drama Athlete,Fencing,199..."
dr. romantic season 2,"Romance,Drama,Medical,Melodrama Hospital,Sma..."
move to heaven,"Life,Drama,Family Autism,Uncle-Nephew Relati..."
the king’s avatar,"Action,Friendship,Youth,Sports Online Gaming..."


In [128]:
# Content-based similarity on the combined genre + tag text.

# Use one cleaned Series for both vectorising and positional lookups so matrix rows and
# titles stay aligned
combined_clean = df_gtags['combined_genres_tags'].dropna()

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(combined_clean)


# Compute cosine similarity between the combined texts of all dramas
similarity = cosine_similarity(tfidf_matrix)
# Remove self-similarity from matrix
similarity -= np.eye(similarity.shape[0])


# Drama to find similar dramas for
movie = 'semantic error'
n_plot = 10
# Row position of the query drama in the similarity matrix (first match by name)
index = np.flatnonzero(combined_clean.index == movie)[0]

# Positions and scores of the n_plot most similar dramas, best first
similar_drama_index = np.argsort(similarity[index])[::-1][:n_plot]
similar_drama_score = similarity[index][similar_drama_index]

# Titles of the similar dramas. The original stored these in `similar_movie_titles` but
# plotted `similar_drama_titles`, i.e. the stale titles from the tags-only cell next to
# the scores computed here.
similar_drama_titles = combined_clean.index[similar_drama_index]
# Old name kept for any later cell that still refers to it
similar_movie_titles = similar_drama_titles


# Create trace
trace = go.Bar(x = similar_drama_score,
               text = similar_drama_titles,
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Most Similar Drama Descriptions For "{}"'.format(n_plot, movie),
              xaxis = dict(title = 'Cosine TFIDF Description Similarity',
                           #range = (0, 0.4)
                          ),
              yaxis = dict(title = 'Drama'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [129]:
## Matrix Factorisation With Keras And Gradient Descent

In [130]:
# Matrix factorisation: each user and each drama gets a learned embedding vector, and the
# predicted rating is the dot product of the two vectors.
# (Note carried over from the content-based cells: genres are too general, so the tags,
# which go deeper into the content of the story, are preferred there.)

# Create user- & movie-id mapping: raw id -> contiguous 0-based index for the embeddings.
# Built from `reviews`; presumably df_train/df_test are subsets of it, so every id maps.
user_id_mapping = {id:i for i, id in enumerate(reviews['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(reviews['Drama'].unique())}


# Create correctly mapped train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Drama'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Drama'].map(movie_id_mapping)


# Get input variable-sizes (number of distinct users / dramas = embedding table sizes)
# NOTE(review): this rebinds `users`, which held the DataFrame read from
# 'reviewed_users.parquet' at the top of the notebook; that frame is no longer reachable
# after this cell. Consider a distinct name once no later cell depends on `users`.
users = len(user_id_mapping)
movies = len(movie_id_mapping)
# Length of each embedding vector
embedding_size = 10


##### Create model
# Set input layers: one integer index per sample for the user and for the drama
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='drama')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers to a flat vector of length embedding_size per sample
user_vector = Reshape([embedding_size])(user_embedding)
movie_vector = Reshape([embedding_size])(movie_embedding)

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, movie_vector])

# Setup model: mean squared error loss, Adam optimiser
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model: a single epoch, with 10% of the training data held out for validation
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

# Compute RMSE on the test set
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))




Testing Result With Keras Matrix-Factorization: 3.8413 RMSE
