In [None]:
'''
Exercise in Collaborative Filtering using data from MovieLens + IMDB/RottenTomatoes

Process:
    1. Import and clean data 
    2. Create matrix for movie ratings for each user/movie pair
    3. Find K most similar users to a selected user (aka User)
        a. Use KNN algorithm
        b. Use Hamming distance to compare vectors (# mismatched dimension values / # of total dimensions)
    4. Using K most similar users, calculate mean rating of each movie
    5. Select top N movies with greatest mean that have not been rated by User
       (These are the top N movies to recommend to User)
    6. Get details for each of the N movies
    7. Analyze movie details to look for user preferences


'''

In [131]:
import datetime
import numpy as np
import pandas as pd
from scipy.spatial.distance import hamming

# 1) Import and clean data
def file_to_df(path, usecols, delimiter=','):
    '''
    Import text file data to a pandas dataframe object
    
    param path: path of text file containing data to import
    param usecols: list of columns to take
    param delimiter: character used to separate data elements in a line
    
    return: DataFrame object of data frm text file
    '''
    # Read file line-by-line into a list
    with open(path, errors='replace') as file:
        data = file.readlines()
    
    # Clean data
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '') # Remove newline chars
    
    # Convert array of strings into 2D list of strings
    data = [line.split(delimiter) for line in data]
    
    # Convert 2D list to a DataFrame object
    df = pd.DataFrame(data)
    
    # Set column names to values in first row
    df.columns = list(df.iloc[0])

    # Take specified columns only
    df = df[usecols]
    
    # Delete first row since it contains column names and not values
    df = df.drop(0)
    
    # Reset to 0-based index
    df.index = range(len(df.index))
    
    return df

# Set data file paths
movies_path = 'data/movielens_2k/movies.dat'
movie_actors_path = 'data/movielens_2k/movie_actors.dat'
movie_directors_path = 'data/movielens_2k/movie_directors.dat'
movie_genres_path = 'data/movielens_2k/movie_genres.dat'
user_rated_movies_path = 'data/movielens_2k/user_ratedmovies.dat'

# Import data, clean it, and output to dataframes
df_movies = file_to_df(movies_path, delimiter='\t', usecols=['id','title','imdbID','rtID','rtAllCriticsRating','rtAllCriticsNumReviews','rtAllCriticsScore','rtTopCriticsRating','rtTopCriticsNumReviews','rtTopCriticsNumFresh','rtTopCriticsNumRotten','rtTopCriticsScore','rtAudienceRating','rtAudienceNumRatings','rtAudienceScore'])
df_movie_actors = file_to_df(movie_actors_path, delimiter='\t', usecols=['movieID','actorID','actorName','ranking'])
df_movie_directors = file_to_df(movie_directors_path, delimiter='\t', usecols=['movieID','directorID','directorName'])
df_movie_genres = file_to_df(movie_genres_path, delimiter='\t', usecols=['movieID','genre'])
df_user_rated_movies = file_to_df(user_rated_movies_path, delimiter='\t', usecols=['userID','movieID','rating']) 

# Create User/Movie ratings matrix
df_user_movie_ratings = df_user_rated_movies.pivot(index='userID', columns='movieID', values='rating')

# Replace null values with a numerical equivalent, then cast all data types in table to float 
df_user_movie_ratings.fillna(value=np.nan,inplace=True)
df_user_movie_ratings = pd.DataFrame(df_user_movie_ratings, dtype=np.float32)


In [59]:
def hamming_distance(userID_1, userID_2):
    '''
    Finds hamming distance between two users.
    Hamming distance is the number of features that differ between the two users.
    
    '''
    distance = hamming(df_user_movie_ratings.loc[userID_1], df_user_movie_ratings.loc[userID_2])
    return distance

def find_knn(userID, k=3):
    '''
    Finds k nearest neighbors of userID based on euclidean distance
    
    param userID: userID to get neighbors for
    param k: number of neighbors
    '''
    # Get data for all other users except specified user
    df_other_user_movie_ratings = df_user_movie_ratings.loc[df_user_movie_ratings.index != userID]
    
    # Calculate distance between specified user and all other users
    df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings.apply(lambda row: hamming_distance(userID,row.name), axis=1)
    
    # Sort users by distance (closest first)
    df_other_user_movie_ratings.sort_values(['distance'],axis=0,inplace=True)
    
    # Take top k users (k nearest users)
    df_other_user_movie_ratings = df_other_user_movie_ratings.iloc[0:k]
    
    return df_other_user_movie_ratings

###
print(datetime.datetime.now())
userID = '10058'
#df_user_movie_ratings = df_user_movie_ratings.iloc[0:11]
df_knn = find_knn(userID,k=10)
print(datetime.datetime.now())

2017-11-30 13:34:40.638733
2017-11-30 13:34:42.139809


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [60]:
def get_unrated_movies_for_user(userID):
    '''
    Gets unrated movies from user/movie ratings matrix by finding row of userID, 
    then finding any column (movieID) with a null value
    
    param userID: userID to get unrated movies for
    
    return: list of movieID's that the specified user has not rated yet
    '''
    
    # n = single user's movie ratings
    n = df_user_movie_ratings.loc[df_user_movie_ratings.index == userID]
    cols = list(n.columns)
    unrated_movies = [cols[i] for i in range(len(cols)) if pd.isnull(n[str(cols[i])][0])]
    
    return unrated_movies

###
print(datetime.datetime.now())
# Get mean of movie ratings for nearest neighbors
# avg = df_knn.fillna(value=0).agg(['mean'], numeric_only=True)
avg = df_knn.agg(['mean'], numeric_only=True)

# Place avg movie ratings into a column, then remove any movies rated by the user
unrated_movies = get_unrated_movies_for_user(userID)
avg_filtered = avg.transpose().loc[unrated_movies]

# Sort movies based on mean average (descending)
avg_sorted = avg_filtered.sort_values(['mean'],axis=0,ascending=False)
print(datetime.datetime.now())

2017-11-30 13:35:08.308815
2017-11-30 13:35:13.251323


In [61]:
top_recommendations = avg_sorted[0:10]
top_recommendations

Unnamed: 0,mean
8684,5.0
921,5.0
3811,5.0
3814,5.0
3822,5.0
6609,5.0
8256,5.0
8197,5.0
7938,5.0
4658,5.0


In [100]:
def get_movie_details(movie_id):
    movie_details = {}
    movie_details['title'] = [df_movies.loc[df_movies.id == movie_id].iloc[0].title]
    movie_details['actors'] = list(df_movie_actors.loc[df_movie_actors.movieID == movie_id].actorName)
    movie_details['director'] = [df_movie_directors.loc[df_movie_directors.movieID == movie_id].iloc[0].directorName]
    movie_details['genres'] = list(df_movie_genres.loc[df_movie_genres.movieID == movie_id].genre)
    
    return movie_details

def movie_details_to_df(movie_details):
    data = [movie_details['title'],
            movie_details['director'],
            movie_details['actors'],
            movie_details['genres']] 
    
    df = pd.DataFrame(data).transpose().fillna(value='')
    df.columns=['title','director','actors','genres']
    
    return df

# Get list of all movie details after converting each set of movie details to a dataframe
movie_details_df_list = [movie_details_to_df(get_movie_details(movieID)) for movieID in list(top_recommendations.index)]

# Concatenate all movies into a single dataframe for futher analysis
df_movie_details = pd.concat(movie_details_df_list)

# Create an ID column
df_movie_details['id'] = range(df_movie_details.shape[0])

In [130]:
# Look for favorite genres
print(df_movie_details.groupby('genres')['title'].count().drop('').sort_values(ascending=False))

# look for favorite actors
print(df_movie_details.groupby('actors')['id'].count().sort_values(ascending=False).iloc[0:10])

# look for favorite directors
print(df_movie_details.groupby('director')['id'].count().drop('').sort_values(ascending=False))

genres
Drama        8
Romance      3
War          2
Comedy       2
Thriller     1
Mystery      1
Horror       1
Adventure    1
Name: title, dtype: int64
actors
Jessica Harper        2
Fred Kohler Jr.       1
Frank Wilson          1
Frank Adu             1
Francesco Leonetti    1
Franca Cupane         1
Fox Harris            1
Florian               1
Ferruccio Nuzzo       1
Ferdinand Munier      1
Name: id, dtype: int64
director
Woody Allen             1
Rouben Mamoulian        1
Robert Bresson          1
Richard Benjamin        1
Pier Paolo Pasolini     1
Patrice Leconte         1
Ingmar Bergman          1
Bruce Beresford         1
Alejandro Jodorowsky    1
Alain Resnais           1
Name: id, dtype: int64
