In [2]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Load the demo ratings table; index_col=0 turns the first CSV column
# (the user labels) into the row index, leaving one column per movie.
ratings = pd.read_csv(filepath_or_buffer="data/dummy_demo.csv", index_col=0)
ratings

Unnamed: 0,action 1,action 2,action 3,romantic 1,romantic 2,romantic 3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [3]:
# Replace every missing rating (NaN) with 0.
# NOTE(review): after this step an unrated movie is indistinguishable from a
# movie rated 0, and those zeros feed into the mean/min/max used by the
# standardization in the next cell.
ratings = ratings.fillna(value=0)
ratings

Unnamed: 0,action 1,action 2,action 3,romantic 1,romantic 2,romantic 3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


In [4]:
# standardize or normalize data
def standardize(row):
    """Mean-center a rating vector and scale it by its value range.

    Computes ``(row - mean) / (max - min)``. Despite the parameter name, when
    this is used through ``DataFrame.apply`` with the default axis, each call
    receives one *column* of the frame.

    Parameters
    ----------
    row : pandas.Series (or any object with ``mean``/``max``/``min`` and
        element-wise arithmetic)
        The ratings to standardize.

    Returns
    -------
    Same type as ``row``
        The centered, range-scaled values. A constant vector (max == min)
        yields all zeros instead of the NaN values a 0/0 division would give.
    """
    value_range = row.max() - row.min()
    if value_range == 0:
        # Every entry is identical, so there is no spread to scale by.
        # ``row - row`` is exactly zero for each finite entry; ``+ 0.0`` keeps
        # the result floating point like the regular branch.
        return (row - row) + 0.0
    return (row - row.mean()) / value_range

# axis=0 (the default) hands each movie column to standardize(), so every
# movie is centered and scaled across users.
ratings_std = ratings.apply(standardize, axis=0)
ratings_std


Unnamed: 0,action 1,action 2,action 3,romantic 1,romantic 2,romantic 3
user 1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user 2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user 3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user 4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user 5,-0.24,-0.4,0.066667,0.1,0.12,0.4


# Item-based Collaborative Filtering

In [5]:
# Transpose so that each movie becomes a row; cosine_similarity then compares
# every movie with every other movie.
item_similarity = cosine_similarity(ratings_std.transpose())

# Label both axes of the square similarity matrix with the movie names.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
item_similarity_df

Unnamed: 0,action 1,action 2,action 3,romantic 1,romantic 2,romantic 3
action 1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action 2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action 3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic 1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic 2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic 3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [6]:
def get_similar_item(name, rating, neutral_rating=2.5):
    """Score every movie against one movie the user has rated.

    Each movie's similarity to ``name`` (read from the notebook-level
    ``item_similarity_df``) is multiplied by ``rating - neutral_rating``, so a
    rating above the neutral point gives similar movies a positive score and a
    rating below it gives them a negative score.

    Parameters
    ----------
    name : str
        Column label of the rated movie in ``item_similarity_df``.
    rating : float
        The rating the user gave to ``name``.
    neutral_rating : float, default 2.5
        Rating treated as "no preference"; it is subtracted from ``rating``
        before weighting. The default reproduces the previously hard-coded
        constant.

    Returns
    -------
    pandas.Series
        Scores indexed by movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``name`` is not a column of ``item_similarity_df``.
    """
    preference_weight = rating - neutral_rating
    weighted_scores = item_similarity_df[name] * preference_weight
    return weighted_scores.sort_values(ascending=False)

# The movie the user rated; bound once so the scoring call and the filter
# below cannot drift apart.
rated_movie = "action 1"
result = get_similar_item(rated_movie, 5)

# Turn the score Series into a two-column frame (movie, predicted score)
# without looping over its items by hand.
temp_result = result.rename_axis("movie").reset_index(name="predicted score")

# Remove all the rated movies from user - beautify
temp_result[temp_result["movie"] != rated_movie]

Unnamed: 0,movie,predicted score
1,action 3,2.034204
2,action 2,1.766722
3,romantic 2,-0.06348
4,romantic 1,-1.998527
5,romantic 3,-2.285265


In [7]:
user_ratings = [("action 1",5), ("romantic 1",1), ("romantic 3",1)]
# Alternative rating profile to try instead:
# user_ratings = [("action 1",2), ("romantic 1",5), ("romantic 3",3)]

# One score Series per rated movie. DataFrame.append was removed in pandas 2.0
# (and copied the whole frame on every call), so collect the rows in a list
# and build the frame once.
score_rows = [get_similar_item(movie, rating) for movie, rating in user_ratings]

# Rows are aligned on movie name; summing down each column adds up the
# contributions from every rated movie.
similar_movies = pd.DataFrame(score_rows).sum().sort_values(ascending=False)

# Series -> two-column frame (movie, predicted score).
temp_df = similar_movies.rename_axis("movie").reset_index(name="predicted score")

# Remove all the rated movies from user - beautify
rated_movies = [movie for movie, _ in user_ratings]
temp_df[~temp_df["movie"].isin(rated_movies)]

  similar_movies = similar_movies.append(get_similar_item(movie,rating), ignore_index=True)


Unnamed: 0,movie,predicted score
1,action 3,4.509394
2,action 2,4.299514
3,romantic 2,-0.876447
