## Import Libraries

In [57]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity, cosine_distances
import functools, operator
import pdb
from sklearn.feature_extraction.text import TfidfVectorizer
import math

## Create Dataframes of Inputs

In [58]:
def create_input_dataframe(input_file):
  return pd.read_table(input_file, sep='\s+',usecols=open(input_file,'rb').readlines()[0].strip().split().reverse())

def create_input_dataframe_movie_actors_a(input_file):
  retlist = []
  with open(input_file, 'rb') as f:
    lines = f.readlines()
    for line in lines:
      line = line.decode(errors='replace').strip()
      line = line.split("\t")
      retlist.append(line)
    header = retlist[0]
    retlist = retlist[1:]
    return pd.DataFrame(retlist, columns=header)

def create_input_dataframe_movie_actors(input_file):
  retlist = []
  with open(input_file, 'rb') as f:
    lines = f.readlines()
    for line in lines:
      line = line.decode(errors='replace')
      line = line.strip()
      newline = [line.split('\t')[i] for i in range(len(line.split('\t'))) if i in [0, 1, len(line.split('\t'))-1 ]]
      retlist.append(newline)
  header = retlist[0]
  retlist = retlist[1:]
  return pd.DataFrame(retlist, columns=header)

def create_input_dataframe_movie_director(input_file):
  retlist = []
  with open(input_file, 'rb') as f:
    lines = f.readlines()
    for line in lines:
      line = line.decode(errors='replace')
      line = line.strip()
      newline = [line.split('\t')[i] for i in range(len(line.split('\t'))) if i in [0, 1]]
      retlist.append(newline)
  header = retlist[0]
  retlist = retlist[1:]
  # return pd.DataFrame(retlist, columns=header)

def create_input_dataframe_tags(input_file):
  retlist = []
  with open(input_file, 'rb') as f:
    lines = f.readlines()
    for line in lines:
      line = line.decode(errors='replace')
      line = line.strip()
      newline = line.split('\t')
      retlist.append(newline)
  header = retlist[0]
  retlist = retlist[1:]
  return pd.DataFrame(retlist, columns=header)

In [59]:
train_df = create_input_dataframe('HW4_data/train.dat')
test_df = create_input_dataframe('HW4_data/test.dat')
movie_actors_df = create_input_dataframe_movie_actors_a('HW4_data/movie_actors.dat')
movie_directors_df = create_input_dataframe_movie_actors_a('HW4_data/movie_directors.dat')
movie_genres_df = create_input_dataframe('HW4_data/movie_genres.dat')
movie_tags_df = create_input_dataframe('HW4_data/movie_tags.dat')
user_taggedmovies_df = create_input_dataframe('HW4_data/user_taggedmovies.dat')
tags_df = create_input_dataframe_movie_actors_a('HW4_data/tags.dat')

## User Based Collaborative Filering
### Find 100 other users with the same values as the current user
### Amongst the 100 get 10 closest to the current user and then get average for the 

1. Create the User - Item matrix <br>
rows - users
columns - movies the user has seen

2. In the test data get the k nearest training samples which are closest to the test user and get an average of the rating.

Another way
get the test user
get the test movie 
know how many movies users has seen and then create a word matrix 


In [60]:
def create_user_item_matrix_with_train(source, train_df):
  # source is list of items 
  # we can fill in the future 
  if source != None:
    return
  matrix = dict()
  keys_val = matrix.keys()
  train_df_val = train_df.values.tolist()
  for record in train_df_val:
    if record[0] not in keys_val:
      user_id = int(record[0])
      matrix[user_id] = ([],[])
    matrix[user_id][0].append(record[1])
    matrix[user_id][1].append(record[2])
  return matrix

In [61]:
def get_user_item_matrix(train_df):
  matrix = []
  movies = [movie for movie in set(train_df['movieID'])]
  movies.sort()
  users = [user for user in set(train_df['userID'])]
  users.sort()
  count = 0
  user_id = train_df.values.tolist()[0][0]
  user_movies = [0]*len(movies)
  
  for record in train_df.values.tolist():
    if user_id != record[0]:
      matrix.append(user_movies)
      user_id = record[0]
      user_movies = [0]*len(movies)
    user_movies[movies.index(int(record[1]))] = record[2]
  
  indices = {i:users[i] for i in range(len(users))}
  return pd.DataFrame(matrix, columns=movies).rename(index=indices)

In [62]:
user_movie_matrix = get_user_item_matrix(train_df)

In [63]:
# def Normalize_Ratings(user_movie_matrix):
#   lst = []
#   for user in  user_movie_matrix.values:
#     tot = user.sum(axis=0)/user.astype(bool).sum(axis=0)
#     user[user>0] -= tot
#     lst.append(user)
#   matrix = pd.DataFrame(lst, index = user_movie_matrix.index)
#   matrix.columns = user_movie_matrix.columns
#   return matrix
# user_movie_matrix = Normalize_Ratings(user_movie_matrix)

In [64]:
def user_based_collaborative_filter(matrix, test_movie, user_info):
  if test_movie is None: raise Exception("test_movie is None")
  matrix_of_test_movie = matrix[matrix[test_movie]>0.0]
  movie_col_index = list(matrix.columns).index(test_movie)
  max_similarity = []
  max_distance = -1
  count = 0
  for user in matrix_of_test_movie.values:
    count += 1
    # similarity_measure = euclidean_distances(np.array([user]), np.array([user_info]))
    # similarity_measure = cosine_similarity(np.array([user]), np.array([user_info]))
    # similarity_measure = cosine_distances(np.array([user]), np.array([user_info]))
    similarity_measure = pearsonr(np.array(user), np.array(user_info))
    #print(similarity_measure)
    similarity_measure = similarity_measure[0]
    if len(max_similarity) < 11:
      if max_distance < similarity_measure:
        max_distance = similarity_measure
      max_similarity.append((user, user[movie_col_index], similarity_measure))
    
    else:
      if similarity_measure < max_distance:
        user_record = (user, user[movie_col_index], similarity_measure)
        max_similarity = filter(lambda x : x[2] != max_distance, max_similarity)
        max_similarity = list(max_similarity)
        max_similarity.append(user_record)
        max_distance = functools.reduce(lambda a,b: a if a[2] > b[2] else b, max_similarity)
        max_distance = max_distance[2]
  # pdb.set_trace()
    # print('Count: ', count)
  total = 0
  for user in max_similarity:
    total += user[1]
  if len(max_similarity)>0:  
    total = total/len(max_similarity)
  return total

In [65]:
def create_actor_profile_matrix(movie_actors_df):
    all_actors = set(movie_actors_df['actorID'])
    all_actors = [actor for actor in all_actors]
    all_movies = [int(movie) for movie in set(movie_actors_df['movieID'])]
    all_movies.sort()
    movie_id = 0
    movie_actor = [0]*len(all_actors)
    retlist=[]
    for movies in movie_actors_df.values:
        new_movie_id = movies[0]
        if new_movie_id != movie_id:
            print('Movie ID: ',movie_id)
            retlist.append(np.array(movie_actor))
            movie_actor = [0]*len(all_actors)
            movie_id = new_movie_id
        actor_index = all_actors.index(movies[1])
        movie_actor[actor_index]=1
    retlist = pd.DataFrame(retlist, columns=all_actors)
    # movie_index = pd.DataFrame(all_movies)
    # movie_index.columns = ['Movie_ID']
    # retlist = pd.concat([retlist, movie_index], axis =1)
    # retlist.set_index('Movie_ID', drop= True, inplace= True) 
    return retlist

In [66]:
def Movie_Genre(movie_genres_df):
  #pdb.set_trace()
  movies = [movie for movie in set(movie_genres_df['movieID'])]
  movies.sort()
  genres = [genre for genre in set(movie_genres_df['genre'])]
  movie_id = 0
  genre_cols = [0]* len(genres)
  matrix = []
  for movie in movie_genres_df.values:
    new_movie_id = movie[0]
    if new_movie_id != movie_id:
      matrix.append(np.array(genre_cols))
      movie_id = new_movie_id
      genre_cols = [0]* len(genres)
    genre_index=genres.index(movie[1])
    genre_cols[genre_index] =1 
  matrix = pd.DataFrame(matrix, columns=genres)
  movie_index = pd.DataFrame(movies)
  movie_index.columns = ['Movie_ID']
  matrix = pd.concat([matrix, movie_index], axis =1)
  matrix.set_index('Movie_ID', drop= True, inplace= True)
  return matrix 

In [67]:
movie_genre_matrix = Movie_Genre(movie_genres_df)
movie_genre_matrix

Unnamed: 0_level_0,Western,Comedy,Action,Thriller,Drama,Film-Noir,Children,War,Adventure,Musical,Crime,Short,Fantasy,Romance,Animation,Mystery,Sci-Fi,Documentary,Horror,IMAX
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65088,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
65091,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
65126,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
65130,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [68]:
def user_similarity(movie_genre_matrix, movie_id, movie_genres_df):
  train_vector =list(movie_genre_matrix.columns)  
  movie_vector =[0]*len(train_vector) 
  genres = (movie_genres_df[movie_genres_df['movieID'] == movie_id]['genre'].values)
  for genre in genres:
    genre_index =  train_vector.index(genre)
    movie_vector[genre_index] =1
  cos_similarity = []
  for i in movie_genre_matrix.values:
    cos_similarity.append(cosine_similarity([movie_vector], [i])[0][0])
  
  print(np.argsort(cos_similarity)[:-9:-1])  

In [69]:
# create_actor_profile_matrix(movie_actors_df)

In [70]:
def cold_start_movie(user_id, movie_id, movie_genres_df, user_movie_matrix):
  genres = movie_genres_df[movie_genres_df['movieID']==movie_id]['genre'].values
  genre_count = 0
  genre_sum = 0
  for genre in genres:
    genre_count+=1
    genre_movies=movie_genres_df.loc[(movie_genres_df['genre']== genre) & (movie_genres_df['movieID']!=movie_id)]
    print(genre_movies)
    count =0
    sum =0
    for movie in genre_movies['movieID'].values:
      if movie in user_movie_matrix:
        if user_movie_matrix.loc[user_id].loc[movie]!=0:
          count+=1
          sum += user_movie_matrix.loc[user_id].loc[movie]   
    if count!=0:
      genre_sum+=sum/count
  print(genre_sum/genre_count)   

In [71]:
'''
 we took the movie genre and checked all the genres of the movies that the user has rated. 
 Converted them into vectors and then computed their cosine distance 
 and then selected the 9 nearest genres average the rating of all the movies in those genres and return it.
'''
def get_user_profile(train_df, user_id, movie_genres_df, cold_movie_genres):
  # pdb.set_trace()
  user_movies = train_df[train_df['userID']==user_id]
  count = 0
  corpus = []
  p = []
  for movie in user_movies.values:
    count += 1
    movie_id = int(movie[1])
    p.append(movie[2])
    a = ' '.join(movie_genres_df[movie_genres_df['movieID']==movie_id]['genre'].values)
    corpus.append(a)
  vectorizer = TfidfVectorizer()
  train_tfdif = vectorizer.fit_transform(corpus)
  test_tfdif = vectorizer.transform(cold_movie_genres)

  cos_similarity = cosine_similarity(test_tfdif[0], train_tfdif).flatten()
  neighbor_indices = cos_similarity.argsort()[:-9:-1]
  rating = 0
  for indices in neighbor_indices:
     rating += p[indices]
  return rating/len(neighbor_indices)

In [72]:
'''
we took the movie genre and converted it into a vector using TFIDF Vectorizer on the movie genres training movies and test set (all the genres of that particular movie). 
After vectorizing the genres we calculated the cosine distance for each genre of the movie to the list of all the genres. Select the nearest 9 values. 
We compute the average of each genre and then again compute the average of the average 9 values.

'''
def cold_start_user(movie_id, movie_genres_df, user_movie_matrix, cold_movie_genres):
  corpus = []
  current_movie_id =0
  for movie in movie_genres_df.values:
    if current_movie_id != movie[0]:
      current_movie_id = movie[0]
      a = ' '.join(movie_genres_df[movie_genres_df['movieID']==movie[0]]['genre'].values)
      corpus.append(a)
  #genres = movie_genres_df[movie_genres_df['movieID']==movie_id]['genre'].values
  vectorizer = TfidfVectorizer()
  train_tfdif = vectorizer.fit_transform(corpus)
  test_tfdif = vectorizer.transform(cold_movie_genres)
  val = []
  for index, test in enumerate(test_tfdif):
    cos_similarity = cosine_similarity(test_tfdif[index], train_tfdif).flatten()
    print(cos_similarity)
    neighbor_indices = cos_similarity.argsort()[:-9:-1]
    print(neighbor_indices)
    tot = 0
    for indices in neighbor_indices:
      if indices in user_movie_matrix.columns:
        total = (user_movie_matrix[indices].sum())/(user_movie_matrix[indices].astype(bool).sum(axis=0)) 
        tot+=total
    val.append(total)
  return sum(val)/ len(val)

In [74]:
# final_pred = []
# for test_user in test_df.values:
  
#   if (test_user[1] in user_movie_matrix.columns and test_user[0] in user_movie_matrix.index):
#     user_movies_rated = user_movie_matrix.loc[test_user[0]] 
#     p = user_based_collaborative_filter(user_movie_matrix, test_user[1], user_movies_rated)
#   elif (test_user[0] not in user_movie_matrix.index) and (test_user[1] in user_movie_matrix.columns):
#     print("The User is a cold stat User")
#     cold_movie_genres =list(movie_genres_df[movie_genres_df['movieID']==test_user[1]]['genre'].values)
#     p = cold_start_user(test_user[1], movie_genres_df, user_movie_matrix, cold_movie_genres)
#   elif (test_user[1] not in user_movie_matrix.columns) and (test_user[0] in user_movie_matrix.index):
#     print("The movie is a cold stat movie", test_user[1])
#     cold_movie_genres = list(movie_genres_df[movie_genres_df['movieID']==test_user[1]]['genre'])
#     p = get_user_profile(train_df, test_user[0], movie_genres_df, cold_movie_genres)
#   print('Done Test User: ', test_user[0], 'with movie: ', test_user[1])
#   final_pred.append(p)

1        0.0
2        0.0
3        1.0
4        0.0
5        0.0
        ... 
65037    0.0
65088    0.0
65126    0.0
65130    0.0
65133    0.0
Name: 75, Length: 9936, dtype: float64
75
653


In [19]:
# a_file = open("output.txt", "w")
# for label in final_pred:
#     label*=5
#     if label ==0:
#       label = 0.5
#     a_file.write(str((label))+"\n")
# a_file.close()

In [23]:
# genre_list_1 = []
# for i in train_df[(train_df['userID']==78) & (train_df['rating'] < 2)].values:
#   a = movie_directors_df[movie_directors_df['movieID']==str(int(i[1]))].values[0]
#   genre_list_1 += movie_genres_df[movie_genres_df['movieID']==int(a[0])]['genre'].values.tolist()
# print(set(genre_list_1))


In [24]:
# genre_list_2 = []
# for i in train_df[(train_df['userID']==78) & (train_df['rating'] > 2)].values:
#   # pdb.set_trace()
#   print(i)
#   a = movie_directors_df[movie_directors_df['movieID']==str(int(i[1]))].values[0]
#   # genre_list_2 += movie_genres_df[movie_genres_df['movieID']==int(a[0])]['genre'].values.tolist()
# print(set(genre_list_2))
  

In [25]:
def user_genre(train_df, movie_genres_df, user_id):
  genre_avg_rating = dict()
  user_movies = train_df[train_df['userID']==user_id]['movieID'].values.tolist()
  movie_rate = train_df[train_df['userID']==user_id]['rating'].values.tolist()
  count = 0
  for movie, rating in zip(user_movies, movie_rate):
    movie_g = movie_genres_df[movie_genres_df['movieID'] == movie]['genre'].values.tolist()
    for g in movie_g:
      all_keys = genre_avg_rating.keys()
      if g not in all_keys:
        genre_avg_rating[g] = [rating, 1]
      else:
        genre_avg_rating[g][0] += rating 
        genre_avg_rating[g][1] += 1
    count += 1
  for ranting in genre_avg_rating:
    sum_val = genre_avg_rating[ranting][0]
    val_num = genre_avg_rating[ranting][1]
    genre_avg_rating[ranting] = sum_val/val_num
  return genre_avg_rating

In [34]:
def prediction(train_df, test_df, movie_genres_df):
  pred = []
  count = 0
  user_genre_bias = None
  user_id_current = test_df.values.tolist()[0]
  users_in_mem  = [i for i in set(train_df['userID'])]
  movies_in_mem = [i for i in set(train_df['movieID'])]
  for test_user in test_df.values.tolist():
    print(count)
    user = test_user[0]
    movie = test_user[1]
    if movie not in movies_in_mem or user not in users_in_mem: 
      pred.append(2.5)
    else:
      if user_genre_bias == None:
        user_genre_bias = user_genre(train_df, movie_genres_df, user)
      if user_genre_bias != None:
        if user_id_current != test_user[0]:
          user_genre_bias = user_genre(train_df, movie_genres_df, user)
          user_id_current = test_user[0]
      movies_genres = movie_genres_df[movie_genres_df['movieID']==movie]['genre'].values.tolist()
      rating = 0
      for genres in movies_genres:
        if genres not in user_genre_bias.keys():
          rating += 1.0
        else:
          rating += user_genre_bias[genres]
      rating /= len(movies_genres)
      pred.append(rating)
    count+=1
  return pred

In [None]:
pred = prediction(train_df, test_df, movie_genres_df)

In [36]:
len(pred)

71299

In [37]:
with open('HW4_data/output.txt','w') as f:
  for i in pred:
    f.write(str(i)+"\n")

## Validation

In [48]:
# validation_df = train_df.drop_duplicates(subset=['userID'])
# train_df = train_df[~train_df.isin(validation_df)].dropna()
# X_test = validation_df.iloc[:,:-1]
# y_actual = validation_df.iloc[:,-1]

In [None]:
# y_predicted = prediction(train_df, X_test, movie_genres_df)

In [56]:
# from sklearn.metrics import mean_squared_error
# from math import sqrt

# rms = sqrt(mean_squared_error(y_actual, y_predicted))
# rms


0.930978086973226