In [None]:
'''
Exercise in Collaborative Filtering using data from MovieLens + IMDB/RottenTomatoes

Process:
    1. Import and clean data 
    2. Define necessary functions
        - Get 


'''

In [2]:
import pandas as pd
from scipy.spatial.distance import hamming

# 1) Import and clean data
def file_to_df(path, usecols, delimiter=','):
    '''
    Import text file data to a pandas dataframe object
    
    param path: path of text file containing data to import
    param usecols: list of columns to take
    param delimiter: character used to separate data elements in a line
    
    return: DataFrame object of data frm text file
    '''
    # Read file line-by-line into a list
    with open(path, errors='replace') as file:
        data = file.readlines()
    
    # Clean data
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '') # Remove newline chars
    
    # Convert array of strings into 2D list of strings
    data = [line.split(delimiter) for line in data]
    
    # Convert 2D list to a DataFrame object
    df = pd.DataFrame(data)
    
    # Set column names to values in first row
    df.columns = list(df.iloc[0])

    # Take specified columns only
    df = df[usecols]
    
    # Delete first row since it contains column names and not values
    df = df.drop(0)
    
    # Reset to 0-based index
    df.index = range(len(df.index))
    
    return df

# Set data file paths
movies_path = 'data/movielens_2k/movies.dat'
movie_actors_path = 'data/movielens_2k/movie_actors.dat'
movie_directors_path = 'data/movielens_2k/movie_directors.dat'
movie_genres_path = 'data/movielens_2k/movie_genres.dat'
user_rated_movies_path = 'data/movielens_2k/user_ratedmovies.dat'

# Import data, clean it, and output to dataframes
df_movies = file_to_df(movies_path, delimiter='\t', usecols=['id','title','imdbID','rtID','rtAllCriticsRating','rtAllCriticsNumReviews','rtAllCriticsScore','rtTopCriticsRating','rtTopCriticsNumReviews','rtTopCriticsNumFresh','rtTopCriticsNumRotten','rtTopCriticsScore','rtAudienceRating','rtAudienceNumRatings','rtAudienceScore'])
df_movie_actors = file_to_df(movie_actors_path, delimiter='\t', usecols=['movieID','actorID','actorName','ranking'])
df_movie_directors = file_to_df(movie_directors_path, delimiter='\t', usecols=['movieID','directorID','directorName'])
df_movie_genres = file_to_df(movie_genres_path, delimiter='\t', usecols=['movieID','genre'])
df_user_rated_movies = file_to_df(user_rated_movies_path, delimiter='\t', usecols=['userID','movieID','rating']) 

# Create User/Movie ratings matrix
df_user_movie_ratings = df_user_rated_movies.pivot(index='userID', columns='movieID', values='rating')
df_user_movie_ratings['userID'] = df_user_movie_ratings.index

In [20]:
def hamming_distance(userID_1, userID_2):
    '''
    Finds hamming distance between two users.
    Hamming distance is the number of features that differ between the two users.
    
    '''
    user_1 = df_user_movie_ratings.transpose()[userID_1]
    user_2 = df_user_movie_ratings.transpose()[userID_2]
    distance = hamming(user_1, user_2)
    return distance

def find_knn(userID, k=3):
    '''
    Finds k nearest neighbors of userID based on euclidean distance
    
    param userID: userID to get neighbors for
    param k: number of neighbors
    '''
    # Get data for all other users except specified user
    df_other_user_movie_ratings = df_user_movie_ratings[df_user_movie_ratings.index != userID]
    
    # Calculate distance between specified user and all other users
    df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings['userID'].apply(lambda other_userID: hamming_distance(userID,other_userID))
    
    # Sort users by distance (closest first)
    df_other_user_movie_ratings.sort_values(['distance'],axis=0,inplace=True)
    
    # Take top k users (k nearest users)
    df_other_user_movie_ratings = df_other_user_movie_ratings.iloc[0:k]
    
    return df_other_user_movie_ratings

print(datetime.datetime.now())
df_other_user_movie_ratings = df_user_movie_ratings[df_user_movie_ratings.index != '75']
df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings['userID'].apply(lambda other_userID: hamming_distance('75',other_userID))
print(datetime.datetime.now())

2017-11-22 09:57:32.049334
2017-11-22 10:58:59.256117


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


movieID,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,991,992,993,994,996,997,998,999,userID,distance
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10025,,,,,,,,,,,...,,,,,,,,,10025,0.032938
10058,,,,,,,,,,,...,,,,,,,,,10058,0.01543
10064,,,,,,,,,,,...,,,,,,,,,10064,0.007715
10084,4.5,,,,,,,,,,...,,,,,,,,,10084,0.012364
10094,2.0,,,,,,,,,,...,,,,,,,,,10094,0.01632
10125,,,,,,,,,,,...,,,,,,,,,10125,0.010386
10132,,3.0,,,,,,2.5,,,...,,,,,3.0,,,,10132,0.090504
10154,,,,,,,,,,,...,,,,,,,,,10154,0.011869
1017,,,,,,,,,,,...,,,,,,,,,1017,0.007616
10181,,4.0,,,,,,,,,...,,,,,,,,,10181,0.019387


In [117]:
def get_unrated_movies_for_user(userID):
    '''
    Gets unrated movies from user/movie ratings matrix by finding row of userID, 
    then finding any column (movieID) with a null value
    
    param userID: userID to get unrated movies for
    
    return: list of movieID's that the specified user has not rated yet
    '''
    
    # n = single user's movie ratings
    n = df_user_movie_ratings[df_user_movie_ratings.index == userID]
    cols = list(n.columns)
    df_users_unrated_movies = [cols[i] for i in range(len(cols)) if n[str(cols[i])][0] == None]
    
    return df_users_unrated_movies

#
#*** Not needed ***
#
#def get_unrated_movies_for_all():
#    '''
#    Get lists of all unrated movies for each user
#    
#    return: dictionary where userID's are the keys and the lists of their unrated movies are the values
#    '''
#    # Get unique set of users
#    unique_users = list(df_user_movie_ratings.index)
#
#    # For each user, get movies that have not yet been rated
#    user_unrated_movies = {}
#    for userID in unique_users:
#        unrated_movies = get_unrated_movies_for_user(userID)
#        user_unrated_movies[userID] = unrated_movies
#
#    return user_unrated_movies

user_unrated_movies = get_unrated_movies_for_user('75')

userID
75    None
dtype: object

In [19]:
user_1 = df_user_movie_ratings.transpose()['75']
user_2 = df_user_movie_ratings.transpose()['10025']
hamming(user_1,user_2)

0.032937685459940656