In [1]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error

In [2]:
movies_df = pd.read_csv('movies_final.csv')
ratings_df = pd.read_csv('ratings_final.csv')
quantiled_movies_df = pd.read_csv('my_quantiled_movies.csv')

In [3]:
content_df = movies_df.copy()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tf_idf.fit_transform(content_df['plot'])

In [5]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
users_ratings = ratings_df.groupby(['user']).count()
selected = users_ratings['rating'] > 200
selected_users = users_ratings.loc[selected]
random_selected = selected_users.sample() # sample() returns a random row from the dataframe. The returned object is a dataframe with one row. If you pass a number as argument you specify to select more than one row.
select_column_df = random_selected.reset_index()['user'] # reset_index() create a new index, and the user became a column. Then, we can filter using the column name
selected_user = select_column_df.iloc[0] # iloc select by index, since our dataframe only has one row we read it from the index 0
print("Selected user: " + str(selected_user))

Selected user: 132


In [7]:
selected_user_ratings = ratings_df.loc[ratings_df['user'] == selected_user]
selected_user_ratings = selected_user_ratings.sort_values(by='item', ascending=True)
print("Rated movies: " + str(selected_user_ratings.shape[0]))
display(selected_user_ratings.head(10))

Rated movies: 216


Unnamed: 0,user,item,rating,timestamp
12710,132,1,2.0,1157921785
12711,132,17,3.0,1157922698
12712,132,34,1.5,1157921395
12713,132,39,3.0,1157921453
12714,132,45,2.5,1157923125
12715,132,48,3.0,1157919960
12716,132,70,2.5,1157923053
12717,132,89,2.5,1157997580
12718,132,158,2.0,1157922952
12719,132,163,2.0,1157919918


In [8]:
rated_movies_df = movies_df.loc[movies_df['item'].isin(list(selected_user_ratings['item']))]
rated_movies_df = rated_movies_df[['title', 'plot']]
print("Rated movies: " + str(rated_movies_df.shape[0]))
display(rated_movies_df.head(10))

Rated movies: 216


Unnamed: 0,title,plot
0,toy story,In a world where toys are living things who pr...
15,sense and sensibility,"On his deathbed, Mr. Dashwood tells his son fr..."
28,babe,"Babe, an orphaned piglet, is chosen for a ""gue..."
31,clueless,"Cheryl ""Cher"" Horowitz is a well-intentioned b..."
37,to die for,Suzanne Stone dreams of being a world-famous n...
38,pocahontas,"In 1607, the Susan Constant sails to the New W..."
47,from dusk till dawn,Fugitive bank robbers Seth and Richie Gecko ho...
57,nick of time,"The film opens with Gene Watson (Johnny Depp),..."
84,casper,"In the town of Friendship, Maine, two young bo..."
88,desperado,"At the Tarasco bar in Mexico, an American man ..."


In [9]:
diff = set(movies_df.index) - set(rated_movies_df.index)
unrated_movies_df = movies_df.loc[diff]
unrated_movies_df = unrated_movies_df[['title', 'plot']]
print("Unrated movies: " + str(unrated_movies_df.shape[0]))
display(unrated_movies_df.head(10))

Unrated movies: 4417


Unnamed: 0,title,plot
1,jumanji,"In 1869, near Brantford, New Hampshire, two br..."
2,grumpier old men,The feud between Max (Walter Matthau) and John...
3,waiting to exhale,"""Friends are the People who let you be yoursel..."
4,father of the bride part ii,The film begins five years after the events of...
5,heat,"Neil McCauley, a career criminal, hires Waingr..."
6,sabrina,Sabrina Fairchild is the young daughter of the...
7,tom and huck,The movie opens with Injun Joe (Eric Schweig) ...
8,sudden death,Darren McCord (Jean-Claude Van Damme) is a Fre...
9,goldeneye,"In 1986, at Arkhangelsk, MI6 agents James Bond..."
10,dracula: dead and loving it,Solicitor Thomas Renfield travels all the way ...


In [10]:
indices = pd.Series(movies_df.index, index=movies_df['title']).drop_duplicates()
def get_similar_movies(title, cosine_sim):
    idx = indices[title]
    sim_scores1 = list(enumerate(cosine_sim[idx]))
    sim_scores1 = sorted(sim_scores1, key=lambda x: x[1], reverse=True)
    sim_scores1 = sim_scores1[1:4783]
    sim_scores1 = [i for i in sim_scores1 if i[0] in rated_movies_df.index]
    return sim_scores1

In [11]:
def calculate_sim_scores(unrated_movie, cosine_sim):
    sim_scores = get_similar_movies(unrated_movie, cosine_sim)
    movie_indices = [i[0] for i in sim_scores] 
    most_similar_rated = movies_df.loc[movies_df['title'].isin(rated_movies_df.loc[movie_indices].head(5)['title'])]['item']
    temp = ratings_df[(ratings_df['user'] == selected_user)]
    most_similar_ratings = temp.loc[ratings_df['item'].isin(most_similar_rated)]['rating']
    cosine_scores =[n for _, n in sim_scores[:len(most_similar_ratings)]]
    sum_cosine = sum(cosine_scores)
    final_similarity_score = sum(most_similar_ratings * cosine_scores)/sum_cosine
    
    return final_similarity_score

In [12]:
from sklearn.model_selection import KFold

In [13]:
test_movies = pd.merge(rated_movies_df, movies_df,  how='inner', left_on=['title'], right_on = ['title'])
test_movies = pd.merge(test_movies, selected_user_ratings,  how='inner', left_on=['item'], right_on = ['item'])
kf_test_movies_df = test_movies[['item','title','rating','plot_x']].copy()

In [14]:
kf_test_movies_df

Unnamed: 0,item,title,rating,plot_x
0,1,toy story,2.0,In a world where toys are living things who pr...
1,17,sense and sensibility,3.0,"On his deathbed, Mr. Dashwood tells his son fr..."
2,34,babe,1.5,"Babe, an orphaned piglet, is chosen for a ""gue..."
3,39,clueless,3.0,"Cheryl ""Cher"" Horowitz is a well-intentioned b..."
4,45,to die for,2.5,Suzanne Stone dreams of being a world-famous n...
...,...,...,...,...
211,56782,there will be blood,2.5,"In 1898, Daniel Plainview, a prospector in New..."
212,63082,slumdog millionaire,3.5,"18-year-old Jamal Malik, an Indian Muslim from..."
213,68157,inglourious basterds,3.5,"In 1941, SS colonel Hans Landa interrogates Fr..."
214,68954,up,3.5,"In 1940, nine-year-old Carl Fredricksen idoliz..."


In [15]:
cosine_sim

array([[1.        , 0.01000176, 0.0103191 , ..., 0.00325355, 0.01340812,
        0.00531974],
       [0.01000176, 1.        , 0.01331374, ..., 0.00118613, 0.01516237,
        0.00775083],
       [0.0103191 , 0.01331374, 1.        , ..., 0.        , 0.00942662,
        0.00361881],
       ...,
       [0.00325355, 0.00118613, 0.        , ..., 1.        , 0.00552235,
        0.03364775],
       [0.01340812, 0.01516237, 0.00942662, ..., 0.00552235, 1.        ,
        0.00308234],
       [0.00531974, 0.00775083, 0.00361881, ..., 0.03364775, 0.00308234,
        1.        ]])

In [16]:
kf_test_movies_df

Unnamed: 0,item,title,rating,plot_x
0,1,toy story,2.0,In a world where toys are living things who pr...
1,17,sense and sensibility,3.0,"On his deathbed, Mr. Dashwood tells his son fr..."
2,34,babe,1.5,"Babe, an orphaned piglet, is chosen for a ""gue..."
3,39,clueless,3.0,"Cheryl ""Cher"" Horowitz is a well-intentioned b..."
4,45,to die for,2.5,Suzanne Stone dreams of being a world-famous n...
...,...,...,...,...
211,56782,there will be blood,2.5,"In 1898, Daniel Plainview, a prospector in New..."
212,63082,slumdog millionaire,3.5,"18-year-old Jamal Malik, an Indian Muslim from..."
213,68157,inglourious basterds,3.5,"In 1941, SS colonel Hans Landa interrogates Fr..."
214,68954,up,3.5,"In 1940, nine-year-old Carl Fredricksen idoliz..."


In [17]:
kf = KFold(n_splits=10) 
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(kf_test_movies_df['plot_x']) # Train our TF-IDF model and computes the features
cosine_sim = linear_kernel(tf_idf, tf_idf)
RMSE =  []
for train_index, test_index in kf.split(kf_test_movies_df):
    test_ratings = kf_test_movies_df.iloc[test_index]['rating']
    test_predicted_ratings = []
    for idx in test_index:
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores_in_train_indices = [x for x in sim_scores if x[0] in train_index][:5]
        indices = [x[0] for x in sim_scores_in_train_indices ]
        cosine_scores = [x[1] for x in sim_scores_in_train_indices]
        most_similar_train_ratings = kf_test_movies_df.iloc[indices]['rating']
        sum_cosine = sum(cosine_scores)
        predicted_ratings = sum(most_similar_train_ratings * cosine_scores)/sum_cosine
        test_predicted_ratings.append(predicted_ratings)
    RMSE.append(mean_squared_error(test_predicted_ratings, test_ratings))
        


In [18]:
RMSE

[0.7832982903375764,
 0.7015144572179092,
 0.36380607218365824,
 0.5835246960355005,
 0.696821007086755,
 0.9598093024415916,
 0.9392229194005243,
 0.6510282426410421,
 0.8162373593624537,
 0.41701338770383245]

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error
mean_squared_error(test_movies['rating'], test_movies['predicted_score'])

249 [0.47203350019187545,
 0.5503929841069063,
 0.4424303052164278,
 0.4954592178875273,
 0.39726317577126385,
 0.9007992300814592,
 0.4063328872400191,
 0.3388956326139274,
 0.5546611269613024,
 0.5519805967221907]