In [None]:
'''
Exercise in Collaborative Filtering using data from MovieLens + IMDB/RottenTomatoes

Process:
    1. Import and clean data 
    2. Define necessary functions
        - Get 


'''

In [26]:
import pandas as pd
from scipy.spatial.distance import hamming

# 1) Import and clean data
def file_to_df(path, usecols, delimiter=','):
    '''
    Import text file data to a pandas dataframe object
    
    param path: path of text file containing data to import
    param usecols: list of columns to take
    param delimiter: character used to separate data elements in a line
    
    return: DataFrame object of data frm text file
    '''
    # Read file line-by-line into a list
    with open(path, errors='replace') as file:
        data = file.readlines()
    
    # Clean data
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '') # Remove newline chars
    
    # Convert array of strings into 2D list of strings
    data = [line.split(delimiter) for line in data]
    
    # Convert 2D list to a DataFrame object
    df = pd.DataFrame(data)
    
    # Set column names to values in first row
    df.columns = list(df.iloc[0])

    # Take specified columns only
    df = df[usecols]
    
    # Delete first row since it contains column names and not values
    df = df.drop(0)
    
    # Reset to 0-based index
    df.index = range(len(df.index))
    
    return df

# Set data file paths
movies_path = 'data/movielens_2k/movies.dat'
movie_actors_path = 'data/movielens_2k/movie_actors.dat'
movie_directors_path = 'data/movielens_2k/movie_directors.dat'
movie_genres_path = 'data/movielens_2k/movie_genres.dat'
user_rated_movies_path = 'data/movielens_2k/user_ratedmovies.dat'

# Import data, clean it, and output to dataframes
df_movies = file_to_df(movies_path, delimiter='\t', usecols=['id','title','imdbID','rtID','rtAllCriticsRating','rtAllCriticsNumReviews','rtAllCriticsScore','rtTopCriticsRating','rtTopCriticsNumReviews','rtTopCriticsNumFresh','rtTopCriticsNumRotten','rtTopCriticsScore','rtAudienceRating','rtAudienceNumRatings','rtAudienceScore'])
df_movie_actors = file_to_df(movie_actors_path, delimiter='\t', usecols=['movieID','actorID','actorName','ranking'])
df_movie_directors = file_to_df(movie_directors_path, delimiter='\t', usecols=['movieID','directorID','directorName'])
df_movie_genres = file_to_df(movie_genres_path, delimiter='\t', usecols=['movieID','genre'])
df_user_rated_movies = file_to_df(user_rated_movies_path, delimiter='\t', usecols=['userID','movieID','rating']) 

# Create User/Movie ratings matrix
df_user_movie_ratings = df_user_rated_movies.pivot(index='userID', columns='movieID', values='rating')
df_user_movie_ratings['userID'] = df_user_movie_ratings.index

In [29]:
def hamming_distance(userID_1, userID_2):
    '''
    Finds hamming distance between two users.
    Hamming distance is the number of features that differ between the two users.
    
    '''
    user_1 = df_user_movie_ratings.transpose()[userID_1]
    user_2 = df_user_movie_ratings.transpose()[userID_2]
    distance = hamming(userID_1, userID_2)
    return distance

def find_knn(userID, k=3):
    '''
    Finds k nearest neighbors of userID based on euclidean distance
    
    param userID: userID to get neighbors for
    param k: number of neighbors
    '''
    df_other_user_movie_ratings = df_user_movie_ratings[df_user_movie_ratings.index != userID]
    df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings['userID'].apply(lambda other_userID: hamming_distance(userID,other_userID))
    # sort by distance
    # get top K neighbors
    # return top K neighbors

df_other_user_movie_ratings = df_user_movie_ratings[df_user_movie_ratings.index != '75']
df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings['userID'].apply(lambda other_userID: hamming_distance('75',other_userID))

In [30]:
df_other_user_movie_ratings = df_user_movie_ratings[df_user_movie_ratings.index != '75']
df_other_user_movie_ratings['distance'] = df_other_user_movie_ratings['userID'].apply(lambda other_userID: hamming_distance('75',other_userID))

KeyboardInterrupt: 

In [117]:
def get_unrated_movies_for_user(userID):
    '''
    Gets unrated movies from user/movie ratings matrix by finding row of userID, 
    then finding any column (movieID) with a null value
    
    param userID: userID to get unrated movies for
    
    return: list of movieID's that the specified user has not rated yet
    '''
    
    # n = single user's movie ratings
    n = df_user_movie_ratings[df_user_movie_ratings.index == userID]
    cols = list(n.columns)
    df_users_unrated_movies = [cols[i] for i in range(len(cols)) if n[str(cols[i])][0] == None]
    
    return df_users_unrated_movies

#
#*** Not needed ***
#
#def get_unrated_movies_for_all():
#    '''
#    Get lists of all unrated movies for each user
#    
#    return: dictionary where userID's are the keys and the lists of their unrated movies are the values
#    '''
#    # Get unique set of users
#    unique_users = list(df_user_movie_ratings.index)
#
#    # For each user, get movies that have not yet been rated
#    user_unrated_movies = {}
#    for userID in unique_users:
#        unrated_movies = get_unrated_movies_for_user(userID)
#        user_unrated_movies[userID] = unrated_movies
#
#    return user_unrated_movies

user_unrated_movies = get_unrated_movies_for_user('75')

userID
75    None
dtype: object

In [24]:
user_1 = df_user_movie_ratings.transpose()['75']
user_2 = df_user_movie_ratings.transpose()['10025']
distance = hamming('75', '10025')

In [25]:
distance

1.0

In [13]:
df_user_movie_ratings['userID']

userID
10025    10025
10058    10058
10064    10064
10084    10084
10094    10094
10125    10125
10132    10132
10154    10154
1017      1017
10181    10181
10191    10191
10200    10200
10231    10231
10272    10272
10335    10335
1035      1035
10420    10420
1047      1047
10476    10476
10479    10479
10516    10516
10521    10521
10563    10563
10639    10639
10652    10652
10674    10674
10739    10739
10783    10783
10791    10791
10844    10844
         ...  
8892      8892
8893      8893
8905      8905
8930      8930
894        894
8970      8970
8996      8996
9029      9029
9034      9034
9060      9060
9085      9085
9117      9117
9127      9127
9134      9134
9139      9139
9177      9177
922        922
9245      9245
9277      9277
937        937
9370      9370
9411      9411
9415      9415
9470      9470
9475      9475
9522      9522
9593      9593
9595      9595
9807      9807
9889      9889
Name: userID, Length: 2113, dtype: object