In [76]:
import datetime
import os
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import io


In [77]:
# Download the raw data from: https://www.kaggle.com/netflix-inc/netflix-prize-data#README
# Thanks to https://github.com/matthewkparker/Netflix_Recommender/blob/master/Code/01_Preprocessing.ipynb for help with the data processing.

# Resources for enriching the movie data:
# - Most promising genre data: https://www.igvita.com/2007/01/27/correlating-netflix-and-imdb-datasets/
# - Genre data: http://cns.bu.edu/~gsc/MovieGenre.html
# - Alternatively, data can be downloaded from: https://github.com/hadley/data-movies

def formatting(path):
    """Flatten one raw Netflix Prize ratings file into a tidy ratings table.

    The raw file interleaves movie header rows such as ``1:`` with
    ``user_id,rating,date`` rows that belong to the most recent header.
    A row is treated as a header when its rating is missing.

    Parameters
    ----------
    path : str or path-like
        Location of the raw text file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per rating with the columns ``user_id``, ``rating``, ``date``
        and ``movie`` (the integer id parsed from the governing header).
        Header rows are dropped, as are rating rows that appear before the
        first header. Row labels of the raw file are preserved. When the file
        contains no header row at all, an empty frame with the same four
        columns is returned instead of raising.

    Raises
    ------
    ValueError
        If a header label (minus its trailing character) is not an integer.

    Notes
    -----
    ``user_id`` is returned exactly as pandas parsed it. Header text shares
    that column, so the ids may come back as strings.
    NOTE(review): confirm the dtype before numeric comparisons or joins.
    """
    #Step 1: read the three comma-separated fields; header rows only fill the first
    df_raw = pd.read_csv(path, header=None, names=['user_id', 'rating', 'date'], usecols=[0, 1, 2])
    #Step 2: header rows are the ones without a rating; strip the trailing ':' to get the id
    is_header = df_raw['rating'].isna()
    header_ids = pd.Series(
        [int(label[:-1]) for label in df_raw.loc[is_header, 'user_id']],
        index=df_raw.index[is_header],
        dtype='int64',
    )
    #Step 3: carry each header's id down to the rating rows beneath it
    movie_per_row = header_ids.reindex(df_raw.index).ffill()
    #Step 4: keep rating rows that sit under some header (rows before the first header stay NaN)
    is_rating = ~is_header & movie_per_row.notna()
    df = df_raw.loc[is_rating].copy()
    #Step 5: reindex/ffill produced floats; store the ids back as integers
    df['movie'] = movie_per_row[is_rating].astype('int64')
    print('done formatting')
    return df

In [78]:
# Directory that holds the four raw Netflix Prize files. Set the
# NETFLIX_DATA_DIR environment variable to point somewhere else; the default
# is the original location, so the resulting paths are unchanged when it is unset.
DATA_DIR = os.environ.get(
    'NETFLIX_DATA_DIR',
    r'C:/Users/fxi/Documents/Decision_Making_Under_Uncertainty/final_project',
)

# One path per raw part file (combined_data_1.txt ... combined_data_4.txt).
path2file1, path2file2, path2file3, path2file4 = (
    DATA_DIR + '/combined_data_%d.txt' % part for part in range(1, 5)
)

# Parse the parts in order; each call prints 'done formatting' when it finishes.
df1, df2, df3, df4 = (
    formatting(raw_path)
    for raw_path in (path2file1, path2file2, path2file3, path2file4)
)

done formatting
done formatting
done formatting
done formatting


In [79]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the four parts in the same order (df1, df2, df3, df4), keeps each
# part's row labels, and copies the data once rather than once per nested append.
all_data = pd.concat([df1, df2, df3, df4])
# Total number of rating rows across all parts.
all_data.shape[0]

In [81]:
def remove_dupes(data):
    """Drop every user who has two or more rows on the same date.

    Only dates (not datetimes) are available, so the order of movies rated on
    the same day cannot be recovered. Any user with at least one such day is
    removed entirely, not just the ambiguous rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``user_id`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` belonging to users with at most one row per
        date, in their original order and with their original row labels.
    """
    # Number of rows for each (user, date) pair.
    per_user_day = data.groupby(['user_id', 'date']).size().reset_index(name='count')
    # Users that appear at least twice on some date.
    has_repeat = per_user_day['count'] >= 2
    flagged_users = per_user_day.loc[has_repeat, 'user_id'].unique()
    keep_mask = ~data['user_id'].isin(list(flagged_users))
    return data[keep_mask]

# Remove users whose viewing order is ambiguous, then show how many rating rows remain.
all_data_deduped = remove_dupes(all_data)
len(all_data_deduped.index)

In [84]:
# Checkpoint the deduplicated ratings; the row labels are written as the first, unnamed column.
all_data_deduped.to_csv('all_data_deduped.csv', index=True)

In [83]:
# Number of distinct users left after deduplication (a missing id, if any, counts as one value).
all_data_deduped['user_id'].nunique(dropna=False)

4689

In [85]:
# Attach title and release year to every rating. The merge is an inner join,
# so ratings whose movie id is missing from movie_titles.csv are dropped.
user_df = all_data_deduped
movie_data = pd.read_csv("movie_titles.csv", encoding="ISO-8859-1", header=None)
# The titles file has no header row; name its three columns explicitly.
movie_data.columns = ['movie_id', 'release_year', 'movie_name']
netflix_data_cleaned = user_df.merge(
    movie_data,
    how='inner',
    left_on='movie',
    right_on='movie_id',
)

In [87]:
import scipy.io
import numpy as np

# MATLAB file with the movie-to-genre benchmark.
mat_path = r'C:/Users/fxi/Downloads/movieGenreBenchmark.mat'
# (an alternative file name noted by the author: movieGenreData.mat)
genre_data = scipy.io.loadmat(mat_path)

# The [0][0] indexing unwraps the outer array loadmat returns for the
# 'dataStructMovie' variable (presumably a 1x1 MATLAB struct - TODO confirm).
movie_struct = genre_data['dataStructMovie'][0][0]

# Field 1, first column -> genre id per movie; field 6, first row -> movie id.
# NOTE(review): the field positions are taken as-is from the original cell;
# verify them against the .mat file's documentation.
movie_genre_mapping = pd.DataFrame({
    'genre_id': movie_struct[1][:, 0].tolist(),
    'movie_id': movie_struct[6][0].tolist(),
})

# Field 8 holds one entry per genre; the first element of each entry is used as its name.
k = [entry[0] for entry in movie_struct[8]]

# Genre ids are assumed to be the 1-based positions in that list - TODO confirm.
i = [position for position, _ in enumerate(k, start=1)]

genre_names = pd.DataFrame({'genre_id': i, 'genre_name': k})

# Both merges are inner joins: movies without a genre entry drop out of `data`.
genre_mapping_cleaned = movie_genre_mapping.merge(genre_names, how='inner', on='genre_id')
data = netflix_data_cleaned.merge(genre_mapping_cleaned, how='inner', on='movie_id')

In [89]:
# Checkpoint the genre-enriched ratings; the row labels are written as the first, unnamed column.
data.to_csv('netflix_data_with_genre.csv', index=True)

In [91]:
# Order each user's ratings chronologically (both keys ascending) so consecutive rows can be paired.
sorted_data = data.sort_values(by=['user_id', 'date'], ascending=True)

In [100]:
# Pair every rating with the row that follows it in the (user_id, date) ordering.
# NOTE: the shift runs over the whole frame rather than per user, so the last
# row of each user picks up the first row of the following user. Compare
# user_id with next_user_id before treating a pair as a real transition.
for next_col, source_col in [
    ('next_user_id', 'user_id'),
    ('next_date', 'date'),
    ('next_movie', 'movie_name'),
    ('next_movie_id', 'movie_id'),
    ('next_rating', 'rating'),
    ('next_genre_id', 'genre_id'),
    ('next_genre', 'genre_name'),
]:
    sorted_data[next_col] = sorted_data[source_col].shift(-1)

In [102]:
# Preview the frame with a flag that is True when the following row belongs to the same user.
# The result is only displayed; sorted_data itself is not modified.
sorted_data.assign(same_user=sorted_data['user_id'].eq(sorted_data['next_user_id']))

Unnamed: 0,user_id,rating,date,movie,movie_id,release_year,movie_name,genre_id,genre_name,next_user_id,next_date,next_movie,next_movie_id,next_rating,next_genre_id,next_genre,same_user
11469,1000189,4.0,2005-04-05,14999,14999,2003.0,Monster,7,[Drama],1000189,2005-06-24,Armageddon,6972.0,3.0,1.0,[Action & Adventure],True
5599,1000189,3.0,2005-06-24,6972,6972,1998.0,Armageddon,1,[Action & Adventure],1000357,2003-09-10,Head of State,11376.0,3.0,5.0,[Comedy],False
8539,1000357,3.0,2003-09-10,11376,11376,2003.0,Head of State,5,[Comedy],1000357,2005-10-08,Zoolander,15233.0,1.0,5.0,[Comedy],True
11648,1000357,1.0,2005-10-08,15233,15233,2001.0,Zoolander,5,[Comedy],1000650,2004-01-06,Anger Management,16882.0,5.0,5.0,[Comedy],False
12950,1000650,5.0,2004-01-06,16882,16882,2003.0,Anger Management,5,[Comedy],1000828,2005-10-19,The Longest Yard,5239.0,3.0,5.0,[Comedy],False
4002,1000828,3.0,2005-10-19,5239,5239,2005.0,The Longest Yard,5,[Comedy],1000828,2005-11-13,High Tension,14135.0,1.0,9.0,[Horror],True
10814,1000828,1.0,2005-11-13,14135,14135,2005.0,High Tension,9,[Horror],1001071,2005-08-27,Spanglish,6475.0,3.0,5.0,[Comedy],False
5158,1001071,3.0,2005-08-27,6475,6475,2004.0,Spanglish,5,[Comedy],1001071,2005-09-01,Hitch,17324.0,4.0,11.0,[Romance],True
13480,1001071,4.0,2005-09-01,17324,17324,2005.0,Hitch,11,[Romance],1001071,2005-10-14,Hostage,406.0,3.0,14.0,[Thrillers],True
405,1001071,3.0,2005-10-14,406,406,2005.0,Hostage,14,[Thrillers],1001071,2005-10-25,The Red Violin,7523.0,2.0,7.0,[Drama],True


In [101]:
# Peek at the first five rows, including the next_* columns.
sorted_data.iloc[:5]

Unnamed: 0,user_id,rating,date,movie,movie_id,release_year,movie_name,genre_id,genre_name,next_user_id,next_date,next_movie,next_movie_id,next_rating,next_genre_id,next_genre
11469,1000189,4.0,2005-04-05,14999,14999,2003.0,Monster,7,[Drama],1000189,2005-06-24,Armageddon,6972.0,3.0,1.0,[Action & Adventure]
5599,1000189,3.0,2005-06-24,6972,6972,1998.0,Armageddon,1,[Action & Adventure],1000357,2003-09-10,Head of State,11376.0,3.0,5.0,[Comedy]
8539,1000357,3.0,2003-09-10,11376,11376,2003.0,Head of State,5,[Comedy],1000357,2005-10-08,Zoolander,15233.0,1.0,5.0,[Comedy]
11648,1000357,1.0,2005-10-08,15233,15233,2001.0,Zoolander,5,[Comedy],1000650,2004-01-06,Anger Management,16882.0,5.0,5.0,[Comedy]
12950,1000650,5.0,2004-01-06,16882,16882,2003.0,Anger Management,5,[Comedy],1000828,2005-10-19,The Longest Yard,5239.0,3.0,5.0,[Comedy]


In [97]:
# NOTE(review): no cell visible in this notebook defines `shifted_data`, and this
# cell's execution count (97) is lower than that of the cells above it. Confirm
# the name still exists after a fresh Restart & Run All, or remove this cell.
shifted_data.iloc[:5]

Unnamed: 0,user_id,rating,date,movie,movie_id,release_year,movie_name,genre_id,genre_name
11469,1000189,3.0,2005-06-24,6972.0,6972.0,1998.0,Armageddon,1.0,[Action & Adventure]
5599,1000357,3.0,2003-09-10,11376.0,11376.0,2003.0,Head of State,5.0,[Comedy]
8539,1000357,1.0,2005-10-08,15233.0,15233.0,2001.0,Zoolander,5.0,[Comedy]
11648,1000650,5.0,2004-01-06,16882.0,16882.0,2003.0,Anger Management,5.0,[Comedy]
12950,1000828,3.0,2005-10-19,5239.0,5239.0,2005.0,The Longest Yard,5.0,[Comedy]
