In [1]:
# import important libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, pairwise


In [2]:
# Load the item table.  The file is pipe-separated and decoded as latin-1;
# column names are supplied here rather than read from the file.
# NOTE(review): "imbd_url" looks like a misspelling of "imdb_url"; it is kept
# as-is because other cells may refer to the column by this name.
item_info_columns = ["movie_id", "movie_title", "release_date", "video_release_date",
                     "imbd_url"]
item_flag_columns = ["unknown", "action", "adventure", "animation",
                     "childrens", "comedy", "crime", "documentary", "drama", "fantasy",
                     "film_noir", "horror", "musical", "mystery", "romance",
                     "sci-fi", "thriller", "war", "western"]
item = pd.read_csv(
    'ml-100k/u.item',
    sep="|",
    encoding="latin-1",
    names=item_info_columns + item_flag_columns,
)

# Load the ratings table (tab-separated, one row per user/item rating).
rating_columns = ["user_id", "item_id", "rating", "timestamp"]
rating = pd.read_csv('ml-100k/u.data', sep="\t", names=rating_columns)

In [3]:
# Keep only the id and title of every movie.  The two columns are selected by
# name so the result does not depend on where they sit in `item`, and .copy()
# detaches `movies` from `item` so later edits cannot touch the source frame.
movies = item[["movie_id", "movie_title"]].copy()
movies

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [4]:
# Show the title column on its own (bracket access works for any column name).
movies["movie_title"]

0                                Toy Story (1995)
1                                GoldenEye (1995)
2                               Four Rooms (1995)
3                               Get Shorty (1995)
4                                  Copycat (1995)
                          ...                    
1677                            Mat' i syn (1997)
1678                             B. Monkey (1998)
1679                         Sliding Doors (1998)
1680                          You So Crazy (1994)
1681    Scream of Stone (Schrei aus Stein) (1991)
Name: movie_title, Length: 1682, dtype: object

In [5]:
# Preview the first five rows of the ratings table.
rating.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# Drop the timestamp column.  The frame is rebound instead of mutated in place
# and errors="ignore" is passed, so re-running this cell after the column is
# already gone is a no-op rather than a KeyError.
rating = rating.drop(columns="timestamp", errors="ignore")
rating.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [7]:
# Building a memory based recommendation system
# Pivot the ratings into a matrix with one row per user_id and one column per
# item_id; user/item pairs without a rating are filled with 0.
user_rating = (
    rating
    .pivot_table(values="rating", index="user_id", columns="item_id")
    .fillna(0)
)
# Rows are users and columns are items, so the shape gives both counts.
n_users, n_items = user_rating.shape

print(f"Users: {n_users}\n Items: {n_items}")
user_rating.head()

Users: 943
 Items: 1682


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
def train_test_split(data: np.ndarray, n_users: int, n_items: int,
                     n_test: int = 5, seed=None):
    """Hold out a few ratings per user to build a train/test pair of matrices.

    For each of the first ``n_users`` rows of ``data``, ``n_test`` of the
    non-zero entries are drawn at random without replacement.  Those entries
    are copied into ``test`` and set to 0 in ``train``; ``data`` itself is not
    modified.

    Note: this is a per-user hold-out for rating matrices and is unrelated to
    ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    data : np.ndarray
        2-D rating matrix (rows are users, columns are items) in which 0 marks
        a missing rating.
    n_users : int
        Number of rows of ``data`` to process.
    n_items : int
        Number of columns of the returned ``test`` matrix.
    n_test : int, default 5
        Number of ratings to hold out per user.
    seed : int or None, default None
        When given, sampling uses a dedicated ``np.random.default_rng(seed)``
        so the split is reproducible.  When ``None``, NumPy's global random
        state is used (so ``np.random.seed`` still controls the result).

    Returns
    -------
    (train, test) : tuple of np.ndarray
        ``train`` is a copy of ``data`` with the held-out entries zeroed;
        ``test`` is an ``(n_users, n_items)`` array that is zero everywhere
        except at the held-out entries.
    """
    # Start from an all-zero test matrix and a private copy of the input.
    test = np.zeros((n_users, n_items))
    train = data.copy()

    # Pick the sampler once: global state by default, seeded generator on request.
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice

    for user in range(n_users):
        rated = data[user, :].nonzero()[0]
        if rated.size < n_test:
            # Not enough ratings to sample without replacement: leave this
            # user's ratings entirely in the training matrix instead of failing.
            continue
        held_out = choose(rated, size=n_test, replace=False)

        train[user, held_out] = 0
        test[user, held_out] = data[user, held_out]
    return train, test
 

# The split samples from NumPy's global random state when no seed is passed,
# so seed it here to get the same train/test matrices on every run.
np.random.seed(42)
train, test = train_test_split(data=user_rating.to_numpy(), n_users=n_users, n_items=n_items)

In [58]:
# find similar users and items
#
# Cosine similarity is taken on the raw training matrix.  No constant is added
# to the ratings: a large offset such as 1e9 swamps the 0-5 rating values and
# makes every pair of rows look almost identical (similarity ~ 1).
# NOTE(review): scikit-learn's cosine_similarity is expected to return 0 for
# all-zero rows instead of dividing by zero -- confirm against its docs.
user_similarity = pairwise.cosine_similarity(train)
items_similarity = pairwise.cosine_similarity(train.T)

print(f"{user_similarity.shape} and {items_similarity.shape}")

(943, 943) and (1682, 1682)


In [63]:
# Inspect the training matrix with its two axes swapped.
train.transpose()

array([[5., 4., 0., ..., 5., 0., 0.],
       [3., 0., 0., ..., 0., 0., 5.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])