In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations 
import csv

In [None]:
# Convert the '::'-delimited ratings file into a regular CSV, then load it.
# Both files are opened as latin-1 so the bytes written match what read_csv
# decodes below (instead of depending on the platform default encoding), and
# the output uses newline='' as the csv module requires for writer targets.
with open('Data/ratings.dat', encoding='latin-1') as dat_file, \
        open('Data/ratings.csv', 'w', encoding='latin-1', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for line in dat_file:
        row = [field.strip() for field in line.split('::')]
        csv_writer.writerow(row)

# The converted file has no header row: header=None keeps the first record as
# data (the default would consume it as column names), names= labels the columns.
ratings = pd.read_csv(
    'Data/ratings.csv',
    sep=',',
    encoding='latin-1',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
# Call .sample() (the bare attribute is just the bound method) and leave it as
# the cell's last expression so the notebook renders it as a table.
ratings.sample(5, random_state=0)

In [None]:
# Convert the '::'-delimited movies file into a regular CSV, then load it.
# Both files are opened as latin-1 so the bytes written match what read_csv
# decodes below (instead of depending on the platform default encoding), and
# the output uses newline='' as the csv module requires for writer targets.
with open('Data/movies.dat', encoding='latin-1') as dat_file, \
        open('Data/movies.csv', 'w', encoding='latin-1', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for line in dat_file:
        row = [field.strip() for field in line.split('::')]
        csv_writer.writerow(row)

# The converted file has no header row: header=None keeps the first record as
# data (the default would consume it as column names), names= labels the columns.
movies = pd.read_csv(
    'Data/movies.csv',
    sep=',',
    encoding='latin-1',
    header=None,
    names=['movie_id', 'title', 'genre'],
)
# Call .sample() (the bare attribute is just the bound method) and leave it as
# the cell's last expression so the notebook renders it as a table.
movies.sample(5, random_state=0)

In [None]:
# Convert the '::'-delimited users file into a regular CSV, then load it.
# Both files are opened as latin-1 so the bytes written match what read_csv
# decodes below (instead of depending on the platform default encoding), and
# the output uses newline='' as the csv module requires for writer targets.
with open('Data/users.dat', encoding='latin-1') as dat_file, \
        open('Data/users.csv', 'w', encoding='latin-1', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for line in dat_file:
        row = [field.strip() for field in line.split('::')]
        csv_writer.writerow(row)

# The converted file has no header row: header=None keeps the first record as
# data (the default would consume it as column names), names= labels the columns.
users = pd.read_csv(
    'Data/users.csv',
    sep=',',
    encoding='latin-1',
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
)
# Call .sample() (the bare attribute is just the bound method) and leave it as
# the cell's last expression so the notebook renders it as a table.
users.sample(5, random_state=0)

In [None]:
# Split each movie's '|'-separated genre string into a list, flatten to one
# genre per row, and count how many movies carry each genre.
genres_per_movie = movies['genre'].str.split('|')
genre_counts = genres_per_movie.explode().value_counts()
genre_popularity = genre_counts.sort_values(ascending=False)

# Ten most frequent genres.
genre_popularity.head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def _genre_combinations(genre_string):
    """Yield every 1-, 2- and 3-genre combination (as tuples) of a '|'-separated genre string."""
    for size in range(1, 4):
        yield from combinations(genre_string.split('|'), r=size)


# Use the genre combinations as the "tokens" of each movie's genre string.
tf = TfidfVectorizer(analyzer=_genre_combinations)
tfidf_matrix = tf.fit_transform(movies['genre'])
tfidf_matrix.shape

In [None]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when present and fall back on older installations.
_get_feature_names = getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names
# Preview a random 10-movie x 5-feature corner of the TF-IDF matrix.
# .toarray() yields a plain ndarray (instead of the np.matrix from .todense()).
pd.DataFrame(tfidf_matrix.toarray(), columns=list(_get_feature_names()), index=movies.title).sample(5, axis=1).sample(10, axis=0)

In [None]:
# Imported in this cell, directly above its only visible use.
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of the TF-IDF matrix; called with
# a single argument, so every row is compared against every row of the same matrix.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Label both axes of the similarity matrix with movie titles so it can be
# looked up by title.
movie_titles = movies['title']
cosine_sim_df = pd.DataFrame(data=cosine_sim, index=movie_titles, columns=movie_titles)
print(cosine_sim_df.shape)

# Peek at five randomly chosen columns, rounded for readability.
cosine_sim_df.sample(n=5, axis=1).round(2)

In [None]:
def genre_recommendations(i, M, items, k=10):
    """Return up to ``k`` items most similar to ``i`` according to ``M``.

    Parameters
    ----------
    i : label
        Column label of ``M`` identifying the query item.
    M : pandas.DataFrame
        Similarity matrix. Column ``i`` holds the similarity of every item to
        ``i``; row positions are assumed to line up with ``M.columns`` because
        the selected row positions are used to index ``M.columns``.
    items : pandas.DataFrame
        Item metadata, joined onto the selected labels with pandas' default
        merge (inner join on the columns the two frames have in common).
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        At most ``k`` rows, most similar first, with ``i`` itself removed.
    """
    scores = M.loc[:, i].to_numpy()
    # Request one extra candidate: `i` itself normally ranks among the top
    # scores and is dropped below. Clamp so a large k cannot run out of bounds.
    n_candidates = min(max(k, 0) + 1, scores.size)
    if n_candidates == 0:
        closest = M.columns[:0]
    else:
        # argpartition only guarantees sorted order at the positions listed in
        # `kth`, so every position that is read afterwards must be listed;
        # otherwise the trailing candidates are arbitrary smaller-scoring items.
        top_positions = list(range(-1, -n_candidates - 1, -1))
        order = scores.argpartition(top_positions)
        closest = M.columns[order[-1:-n_candidates - 1:-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

In [15]:
# Recommendations for one example title, using the function's default k.
genre_recommendations(
    i='2001: A Space Odyssey (1968)',
    M=cosine_sim_df,
    items=movies[['title', 'genre']],
)

Unnamed: 0,title,genre
0,"X-Files: Fight the Future, The (1998)",Mystery|Sci-Fi|Thriller
1,"Client, The (1994)",Drama|Mystery|Thriller
2,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
3,Communion (1989),Drama|Sci-Fi|Thriller
4,Gattaca (1997),Drama|Sci-Fi|Thriller
5,"Thirteenth Floor, The (1999)",Drama|Sci-Fi|Thriller
6,Event Horizon (1997),Action|Mystery|Sci-Fi|Thriller
7,2010 (1984),Mystery|Sci-Fi
8,Stalker (1979),Mystery|Sci-Fi
9,Deep Impact (1998),Action|Drama|Sci-Fi|Thriller
