In [None]:
'''
Exercise in Collaborative Filtering using data from MovieLens + IMDB/RottenTomatoes

Process:
    1. Import and clean data 
    2. Define necessary functions
        - Get 


'''

In [50]:
import datetime
import timeit
import numpy as np
import pandas as pd
from scipy.spatial.distance import hamming

# 1) Import and clean data
def file_to_df(path, usecols, delimiter=','):
    '''
    Import text file data to a pandas dataframe object
    
    param path: path of text file containing data to import
    param usecols: list of columns to take
    param delimiter: character used to separate data elements in a line
    
    return: DataFrame object of data frm text file
    '''
    # Read file line-by-line into a list
    with open(path, errors='replace') as file:
        data = file.readlines()
    
    # Clean data
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '') # Remove newline chars
    
    # Convert array of strings into 2D list of strings
    data = [line.split(delimiter) for line in data]
    
    # Convert 2D list to a DataFrame object
    df = pd.DataFrame(data)
    
    # Set column names to values in first row
    df.columns = list(df.iloc[0])

    # Take specified columns only
    df = df[usecols]
    
    # Delete first row since it contains column names and not values
    df = df.drop(0)
    
    # Reset to 0-based index
    df.index = range(len(df.index))
    
    return df

# Set data file paths
movies_path = 'data/movielens_2k/movies.dat'
movie_actors_path = 'data/movielens_2k/movie_actors.dat'
movie_directors_path = 'data/movielens_2k/movie_directors.dat'
movie_genres_path = 'data/movielens_2k/movie_genres.dat'
user_rated_movies_path = 'data/movielens_2k/user_ratedmovies.dat'

# Import data, clean it, and output to dataframes
df_movies = file_to_df(movies_path, delimiter='\t', usecols=['id','title','imdbID','rtID','rtAllCriticsRating','rtAllCriticsNumReviews','rtAllCriticsScore','rtTopCriticsRating','rtTopCriticsNumReviews','rtTopCriticsNumFresh','rtTopCriticsNumRotten','rtTopCriticsScore','rtAudienceRating','rtAudienceNumRatings','rtAudienceScore'])
df_movie_actors = file_to_df(movie_actors_path, delimiter='\t', usecols=['movieID','actorID','actorName','ranking'])
df_movie_directors = file_to_df(movie_directors_path, delimiter='\t', usecols=['movieID','directorID','directorName'])
df_movie_genres = file_to_df(movie_genres_path, delimiter='\t', usecols=['movieID','genre'])
df_user_rated_movies = file_to_df(user_rated_movies_path, delimiter='\t', usecols=['userID','movieID','rating']) 

# Create User/Movie ratings matrix
df_user_movie_ratings = df_user_rated_movies.pivot(index='userID', columns='movieID', values='rating')
df_user_movie_ratings.fillna(value=np.nan,inplace=True)
df_user_movie_ratings = pd.DataFrame(df_user_movie_ratings, dtype=np.float32)


In [57]:
def hamming_distance(userID_1, userID_2):
    '''
    Finds hamming distance between two users.
    Hamming distance is the number of features that differ between the two users.
    
    '''
    distance = hamming(df_user_movie_ratings.loc[userID_1], df_user_movie_ratings.loc[userID_2])
    return distance

def find_knn(userID, k=3):
    '''
    Finds k nearest neighbors of userID based on euclidean distance
    
    param userID: userID to get neighbors for
    param k: number of neighbors
    '''
    # Get data for all other users except specified user
    df_other_user_movie_ratings = df_user_movie_ratings.loc[df_user_movie_ratings.index != userID]
    
    # Calculate distance between specified user and all other users
    df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings.apply(lambda row: hamming_distance(userID,row.name), axis=1)
    
    # Sort users by distance (closest first)
    df_other_user_movie_ratings.sort_values(['distance'],axis=0,inplace=True)
    
    # Take top k users (k nearest users)
    df_other_user_movie_ratings = df_other_user_movie_ratings.iloc[0:k]
    
    return df_other_user_movie_ratings

print(datetime.datetime.now())
userID = '10058'
df_user_movie_ratings = df_user_movie_ratings.iloc[0:11]
df_knn = find_knn(userID,k=10)
print(datetime.datetime.now())

2017-11-29 22:20:52.436750
2017-11-29 22:20:52.632680


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [82]:
def get_unrated_movies_for_user(userID):
    '''
    Gets unrated movies from user/movie ratings matrix by finding row of userID, 
    then finding any column (movieID) with a null value
    
    param userID: userID to get unrated movies for
    
    return: list of movieID's that the specified user has not rated yet
    '''
    
    # n = single user's movie ratings
    n = df_user_movie_ratings.loc[df_user_movie_ratings.index == userID]
    cols = list(n.columns)
    unrated_movies = [cols[i] for i in range(len(cols)) if pd.isnull(n[str(cols[i])][0])]
    
    return unrated_movies

# Get mean of movie ratings for nearest neighbors
# avg = df_knn.fillna(value=0).agg(['mean'], numeric_only=True)
avg = df_knn.agg(['mean'], numeric_only=True)

# Place avg movie ratings into a column, then remove any movies rated by the user
unrated_movies = get_unrated_movies_for_user(userID)
avg_filtered = avg.transpose().loc[unrated_movies]

# Sort movies based on mean average (descending)
avg_sorted = avg_filtered.sort_values(['mean'],axis=0,ascending=False)


In [83]:
avg_sorted

Unnamed: 0,mean
5907,5.0
48043,5.0
60069,5.0
2600,5.0
55442,5.0
47999,5.0
6283,5.0
1263,5.0
56145,5.0
306,5.0


In [27]:
def get_movie_details(movie_id):
    movie_details = {}
    return movie_details

'''
movie
    title
    director
    actors
    genres
'''

# Take top X movies, then get details for them

# get movies review counts per user. take users with more than X reviews

movieID,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,991,992,993,994,996,997,998,999,userID,distance
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10025,,,,,,,,,,,...,,,,,,,,,10025,0.032938
10058,,,,,,,,,,,...,,,,,,,,,10058,0.01543
10064,,,,,,,,,,,...,,,,,,,,,10064,0.007715
10084,4.5,,,,,,,,,,...,,,,,,,,,10084,0.012364
10094,2.0,,,,,,,,,,...,,,,,,,,,10094,0.01632
10125,,,,,,,,,,,...,,,,,,,,,10125,0.010386
10132,,3.0,,,,,,2.5,,,...,,,,,3.0,,,,10132,0.090504
10154,,,,,,,,,,,...,,,,,,,,,10154,0.011869
1017,,,,,,,,,,,...,,,,,,,,,1017,0.007616
10181,,4.0,,,,,,,,,...,,,,,,,,,10181,0.019387
