In [1]:
import pandas as pd

In [2]:
movies_df = pd.read_csv('movies_final.csv')
ratings_df = pd.read_csv('ratings_final.csv')
quantiled_movies_df = pd.read_csv('my_quantiled_movies.csv')

In [3]:
content_df = movies_df.copy()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tf_idf.fit_transform(content_df['plot'])

In [5]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
users_ratings = ratings_df.groupby(['user']).count()
selected = users_ratings['rating'] > 200
selected_users = users_ratings.loc[selected]
random_selected = selected_users.sample() # sample() returns a random row from the dataframe. The returned object is a dataframe with one row. If you pass a number as argument you specify to select more than one row.
select_column_df = random_selected.reset_index()['user'] # reset_index() create a new index, and the user became a column. Then, we can filter using the column name
selected_user = select_column_df.iloc[0] # iloc select by index, since our dataframe only has one row we read it from the index 0
print("Selected user: " + str(selected_user))

Selected user: 339


In [7]:
selected_user_ratings = ratings_df.loc[ratings_df['user'] == selected_user]
selected_user_ratings = selected_user_ratings.sort_values(by='item', ascending=True)
print("Rated movies: " + str(selected_user_ratings.shape[0]))
display(selected_user_ratings.head(10))

Rated movies: 223


Unnamed: 0,user,item,rating,timestamp
33481,339,1,4.0,1460183470
33482,339,6,4.0,1460345729
33483,339,16,4.5,1460794139
33484,339,111,4.0,1460518498
33485,339,150,4.0,1460182140
33486,339,296,2.5,1460868473
33487,339,319,5.0,1460518598
33488,339,356,4.0,1460182081
33489,339,480,4.0,1460186748
33490,339,527,5.0,1460519029


In [8]:
rated_movies_df = movies_df.loc[movies_df['item'].isin(list(selected_user_ratings['item']))]
rated_movies_df = rated_movies_df[['title', 'plot']]
print("Rated movies: " + str(rated_movies_df.shape[0]))
display(rated_movies_df.head(10))

Rated movies: 223


Unnamed: 0,title,plot
0,toy story,In a world where toys are living things who pr...
5,heat,"Neil McCauley, a career criminal, hires Waingr..."
14,casino,"In 1973, sports handicapper and Mafia associat..."
69,taxi driver,"Travis Bickle, a 26-year-old honorably dischar..."
78,apollo 13,"In July 1969, astronaut Jim Lovell hosts a hou..."
160,pulp fiction,Hitmen Jules Winnfield and Vincent Vega arrive...
166,shallow grave,"Chartered accountant David Stephens, physician..."
186,forrest gump,"In 1981, Forrest Gump recounts his life story ..."
252,jurassic park,Industrialist John Hammond and his bioengineer...
281,schindler's list,"In Kraków during World War II, the Germans hav..."


In [9]:
diff = set(movies_df.index) - set(rated_movies_df.index)
unrated_movies_df = movies_df.loc[diff]
unrated_movies_df = unrated_movies_df[['title', 'plot']]
print("Unrated movies: " + str(unrated_movies_df.shape[0]))
display(unrated_movies_df.head(10))

Unrated movies: 4410


Unnamed: 0,title,plot
1,jumanji,"In 1869, near Brantford, New Hampshire, two br..."
2,grumpier old men,The feud between Max (Walter Matthau) and John...
3,waiting to exhale,"""Friends are the People who let you be yoursel..."
4,father of the bride part ii,The film begins five years after the events of...
6,sabrina,Sabrina Fairchild is the young daughter of the...
7,tom and huck,The movie opens with Injun Joe (Eric Schweig) ...
8,sudden death,Darren McCord (Jean-Claude Van Damme) is a Fre...
9,goldeneye,"In 1986, at Arkhangelsk, MI6 agents James Bond..."
10,dracula: dead and loving it,Solicitor Thomas Renfield travels all the way ...
11,balto,"In New York City, an elderly woman, her grandd..."


In [10]:
indices = pd.Series(movies_df.index, index=movies_df['title']).drop_duplicates()
def get_similar_movies(title, cosine_sim):
    idx = indices[title]
    sim_scores1 = list(enumerate(cosine_sim[idx]))
    sim_scores1 = sorted(sim_scores1, key=lambda x: x[1], reverse=True)
    sim_scores1 = sim_scores1[1:4783]
    sim_scores1 = [i for i in sim_scores1 if i[0] in rated_movies_df.index]
    return sim_scores1

In [11]:
def calculate_sim_scores(unrated_movie, cosine_sim):
    sim_scores = get_similar_movies(unrated_movie, cosine_sim)
    movie_indices = [i[0] for i in sim_scores] 
    most_similar_rated = movies_df.loc[movies_df['title'].isin(rated_movies_df.loc[movie_indices].head(5)['title'])]['item']
    temp = ratings_df[(ratings_df['user'] == selected_user)]
    most_similar_ratings = temp.loc[ratings_df['item'].isin(most_similar_rated)]['rating']
    cosine_scores =[n for _, n in sim_scores[:len(most_similar_ratings)]]
    sum_cosine = sum(cosine_scores)
    final_similarity_score = sum(most_similar_ratings * cosine_scores)/sum_cosine
    return final_similarity_score

In [12]:
unrated_movies_df['sim_score']= unrated_movies_df.apply(lambda row: calculate_sim_scores(row['title'],cosine_sim),axis=1)

In [13]:
unrated_movies_df.to_csv('content_based_cf.csv')

In [14]:
unrated_movies_df

Unnamed: 0,title,plot,sim_score
1,jumanji,"In 1869, near Brantford, New Hampshire, two br...",3.965459
2,grumpier old men,The feud between Max (Walter Matthau) and John...,4.090392
3,waiting to exhale,"""Friends are the People who let you be yoursel...",3.960971
4,father of the bride part ii,The film begins five years after the events of...,3.992067
6,sabrina,Sabrina Fairchild is the young daughter of the...,4.671090
...,...,...,...
4628,phantom thread,"In 1954 London, renowned fashion designer Reyn...",4.823701
4629,bright,"In an alternate present, humans live in uneasy...",3.795012
4630,gintama,Yorozuya receives two similar and ultimately c...,3.003607
4631,love live! the school idol movie,The movie begins with a scene from the second ...,4.586757


In [15]:
rated_movies_df

Unnamed: 0,title,plot
0,toy story,In a world where toys are living things who pr...
5,heat,"Neil McCauley, a career criminal, hires Waingr..."
14,casino,"In 1973, sports handicapper and Mafia associat..."
69,taxi driver,"Travis Bickle, a 26-year-old honorably dischar..."
78,apollo 13,"In July 1969, astronaut Jim Lovell hosts a hou..."
...,...,...
4497,snowden,"In 2013, Edward Snowden arranges a clandestine..."
4547,the founder,Ray Kroc is a traveling salesman selling Princ...
4588,dunkirk,"In 1940, during the Battle of France, hundreds..."
4598,blade runner 2049,"In 2049, replicants (described as ""bioengineer..."


In [20]:
selected_ratings = ratings_df.loc[ratings_df['user'] == 339]

In [17]:
item_title_df = movies_df[['item','title']]

In [21]:
test_movies = pd.merge(selected_ratings, item_title_df,  how='inner', left_on=['item'], right_on = ['item'])

In [25]:
test_movies['predicted_score'] = test_movies.apply(lambda row: calculate_sim_scores(row['title'],cosine_sim),axis=1)

In [26]:
test_movies

Unnamed: 0,user,item,rating,timestamp,title,predicted_score
0,339,1,4.0,1460183470,toy story,4.649411
1,339,6,4.0,1460345729,heat,4.134812
2,339,16,4.5,1460794139,casino,3.952436
3,339,111,4.0,1460518498,taxi driver,4.325714
4,339,150,4.0,1460182140,apollo 13,4.461871
...,...,...,...,...,...,...
218,339,157699,3.5,1481945488,snowden,4.311126
219,339,166946,4.5,1504908952,the founder,4.659385
220,339,174055,5.0,1507760155,dunkirk,4.540203
221,339,176371,4.5,1507502471,blade runner 2049,4.906428


In [24]:
movies_df.loc[movies_df['title'] == 'american made']

Unnamed: 0,item,title,year,genres,plot
4601,176751,american made,2017,Crime|Thriller,"In the late 1970s, Barry Seal, a pilot for com..."


In [30]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error
mean_squared_error(test_movies['rating'], test_movies['predicted_score'])

0.6784536691289346