In [None]:
'''
Exercise in Collaborative Filtering using data from MovieLens + IMDB/RottenTomatoes

Process:
    1. Import and clean data 
    2. Define necessary functions
        - Get 


'''

In [2]:
import pandas as pd
from scipy.spatial.distance import hamming

# 1) Import and clean data
def file_to_df(path, usecols, delimiter=','):
    '''
    Import text file data to a pandas dataframe object
    
    param path: path of text file containing data to import
    param usecols: list of columns to take
    param delimiter: character used to separate data elements in a line
    
    return: DataFrame object of data frm text file
    '''
    # Read file line-by-line into a list
    with open(path, errors='replace') as file:
        data = file.readlines()
    
    # Clean data
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '') # Remove newline chars
    
    # Convert array of strings into 2D list of strings
    data = [line.split(delimiter) for line in data]
    
    # Convert 2D list to a DataFrame object
    df = pd.DataFrame(data)
    
    # Set column names to values in first row
    df.columns = list(df.iloc[0])

    # Take specified columns only
    df = df[usecols]
    
    # Delete first row since it contains column names and not values
    df = df.drop(0)
    
    # Reset to 0-based index
    df.index = range(len(df.index))
    
    return df

# Set data file paths
movies_path = 'data/movielens_2k/movies.dat'
movie_actors_path = 'data/movielens_2k/movie_actors.dat'
movie_directors_path = 'data/movielens_2k/movie_directors.dat'
movie_genres_path = 'data/movielens_2k/movie_genres.dat'
user_rated_movies_path = 'data/movielens_2k/user_ratedmovies.dat'

# Import data, clean it, and output to dataframes
df_movies = file_to_df(movies_path, delimiter='\t', usecols=['id','title','imdbID','rtID','rtAllCriticsRating','rtAllCriticsNumReviews','rtAllCriticsScore','rtTopCriticsRating','rtTopCriticsNumReviews','rtTopCriticsNumFresh','rtTopCriticsNumRotten','rtTopCriticsScore','rtAudienceRating','rtAudienceNumRatings','rtAudienceScore'])
df_movie_actors = file_to_df(movie_actors_path, delimiter='\t', usecols=['movieID','actorID','actorName','ranking'])
df_movie_directors = file_to_df(movie_directors_path, delimiter='\t', usecols=['movieID','directorID','directorName'])
df_movie_genres = file_to_df(movie_genres_path, delimiter='\t', usecols=['movieID','genre'])
df_user_rated_movies = file_to_df(user_rated_movies_path, delimiter='\t', usecols=['userID','movieID','rating']) 

# Create User/Movie ratings matrix
df_user_movie_ratings = df_user_rated_movies.pivot(index='userID', columns='movieID', values='rating')


In [40]:
def get_unrated_movies_for_user(userID):
    '''
    Gets unrated movies from user/movie ratings matrix by finding row of userID, 
    then finding any column (movieID) with a null value
    
    param userID: userID to get unrated movies for
    
    return: list of movieID's that the specified user has not rated yet
    '''
    
    # n = single user's movie ratings
    n = df_user_movie_ratings[df_user_movie_ratings.index == userID]
    cols = list(n.columns)
    df_users_unrated_movies = [cols[i] for i in range(len(cols)) if n[str(cols[i])][0] == None]
    
    return df_users_unrated_movies

def hamming_distance(userID_1, userID_2):
    user_1 = df_user_movie_ratings.transpose()[userID_1]
    user_2 = df_user_movie_ratings.transpose()[userID_2]
    distance = hamming(userID_1, userID_2)
    return distance

def find_knn(userID, k=3):
    '''
    Finds k nearest neighbors of userID based on euclidean distance
    
    param userID: userID to get neighbors for
    param k: number of neighbors
    '''
    # Get all data less the current user
    # apply hamming_distance function
    # sort by distance
    # get top K neighbors
    # return top K neighbors

#
#*** Not needed ***
#
#def get_unrated_movies_for_all():
#    '''
#    Get lists of all unrated movies for each user
#    
#    return: dictionary where userID's are the keys and the lists of their unrated movies are the values
#    '''
#    # Get unique set of users
#    unique_users = list(df_user_movie_ratings.index)
#
#    # For each user, get movies that have not yet been rated
#    user_unrated_movies = {}
#    for userID in unique_users:
#        unrated_movies = get_unrated_movies_for_user(userID)
#        user_unrated_movies[userID] = unrated_movies
#
#    return user_unrated_movies

user_unrated_movies = get_unrated_movies_for_user('75')

In [33]:
#n = df_user_movie_ratings[df_user_movie_ratings.index == userID]
#df_users_unrated_movies = [n.columns[i] for i in range(len(n.columns)) if n[n.columns[i]][0] == None]
    
len(list(m.columns)[100])
#m = df_user_movie_ratings[df_user_movie_ratings.index == '75']
#m['996'][0]

'1002'

In [117]:
# Test this by getting a series data sample from large df, 
# then see how list comprehension works using a series object
def get_cols_with_nulls(row):
    #print(row.index)
    [row.index[i] for i in range(len(row.index)) if row[row.index[i] == None]]
m.apply(get_cols_with_nulls, axis=1)

userID
75    None
dtype: object

userID,75
movieID,Unnamed: 1_level_1
1,
10,
100,
1000,
1001,
1002,
1003,
1004,
1005,
1006,
