In [27]:
import pandas as pd
import re

In [28]:
# Load the MovieLens (ml-latest-small) movie list and the user ratings.
movies_df = pd.read_csv("../raw_data/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("../raw_data/ml-latest-small/ratings.csv")

# Load the Netflix titles.
netflix_df = pd.read_csv("../raw_data/netflix_titles.csv")
# df = pd.read_csv("./titles_sample.csv")
# Remove every run of non-ASCII characters from the descriptions (from jessica's code).
# The vectorised .str accessor leaves a missing description as NaN, whereas calling
# re.sub on each value raises a TypeError as soon as one value is not a string.
netflix_df['description'] = netflix_df['description'].str.replace(
    r"[^\x00-\x7F]+", "", regex=True
)

In [29]:
# slice years off of titles, remove any quotations
def modify_title(str, articles=("The",)):
    """Normalise a title of the form ``"Name, The (1999)"``.

    Steps, in order:
      1. remove every ``" (digits)"`` group (the release year),
      2. remove all double-quote characters,
      3. strip leading/trailing whitespace,
      4. move a trailing ``", <article>"`` to the front of the title.

    Parameters
    ----------
    str : str
        The raw title. The name shadows the builtin; it is kept so existing
        callers that pass it by keyword continue to work.
    articles : iterable of str, optional
        Trailing articles that are moved to the front. Defaults to
        ``("The",)``; pass e.g. ``("The", "A", "An")`` to relocate more.

    Returns
    -------
    str
        The normalised title.
    """
    new_str = re.sub(r" [(]\d*[)]", '', str)
    # Whitespace left over after the year was removed would otherwise hide
    # the ", The" suffix from the check below (e.g. "Matrix, The (1999) ").
    new_str = new_str.replace('"', '').strip()
    for article in articles:
        suffix = ", " + article
        if new_str.endswith(suffix):
            # move the article to the beginning of the title
            return article + " " + new_str[:-len(suffix)]
    return new_str

# Normalise every MovieLens title with modify_title, overwriting the column.
movies_df['title'] = movies_df['title'].map(modify_title)

In [30]:
# Build the evaluation frame in one chain: each step returns a new frame.
eval_df = (
    netflix_df
    # join titles with netflix db movie titles
    .merge(movies_df, how='left', on='title')
    # drop titles in netflix that aren't in movielens
    .dropna(subset=['movieId'])
    # drop irrelevant columns
    .drop(columns=['show_id', 'type', 'director', 'cast', 'country', 'date_added',
                   'release_year', 'duration', 'rating', 'listed_in', 'genres'])
    # renumber the surviving rows from 0 and discard the old index
    .reset_index(drop=True)
    # keep only the first row for each title
    .drop_duplicates('title')
)
print(len(eval_df))
eval_df

1058


Unnamed: 0,title,description,movieId
0,Grown Ups,Mourning the loss of their beloved junior high...,79134.0
1,Dark Skies,A familys idyllic suburban life shatters when ...,100810.0
2,Jaws,When an insatiable great white shark terrorize...,1387.0
3,Jaws 2,Four years after the last deadly shark attacks...,1388.0
4,Jaws: The Revenge,"After another deadly shark attack, Ellen Brody...",4124.0
...,...,...,...
1112,Young Adult,When a divorced writer gets a letter from an o...,91622.0
1113,"Yours, Mine and Ours",When a father of eight and a mother of 10 prep...,26198.0
1115,Zodiac,"A political cartoonist, a crime reporter and a...",51540.0
1116,Zombieland,Looking to survive in a world taken over by zo...,71535.0


In [31]:
def reassign_ids(df, property):
    """Replace the values of one column with consecutive integer ids.

    Each distinct value in ``df[property]`` receives an id starting at 0,
    numbered in order of first appearance. The column is overwritten on the
    frame that was passed in, and that same frame is returned.
    """
    distinct_values = df[property].unique()
    lookup = dict(zip(distinct_values, range(len(distinct_values))))
    df[property] = df[property].apply(lambda original_id: lookup[original_id])
    return df

In [32]:
# Spot-check the row(s) that eval_df holds for one specific title.
eval_df.loc[eval_df['title'] == "The Jungle Book"]

Unnamed: 0,title,description,movieId
1007,The Jungle Book,This animated series follows young Mowgli and ...,362.0


In [33]:
# Join with ratings_df on movieId to begin evaluation; rows with any missing
# value after the left join (e.g. a movie without ratings) are dropped.
users_df = eval_df.merge(ratings_df, how='left', on='movieId').dropna()

# Count the distinct users before their ids are rewritten.
num_users = users_df['userId'].nunique()

# Renumber user ids first, then movie ids, as consecutive integers from 0.
users_df = reassign_ids(reassign_ids(users_df, 'userId'), 'movieId')

num_movies = users_df['movieId'].nunique()
print(num_users)
print(num_movies)

# The rating timestamp is not kept in the final frame.
users_df = users_df.drop(columns=['timestamp'])

users_df[users_df['title'] == "The Jungle Book"]

608
1056


Unnamed: 0,title,description,movieId,userId,rating
15108,The Jungle Book,This animated series follows young Mowgli and ...,955,130,5.0
15109,The Jungle Book,This animated series follows young Mowgli and ...,955,132,3.0
15110,The Jungle Book,This animated series follows young Mowgli and ...,955,214,2.0
15111,The Jungle Book,This animated series follows young Mowgli and ...,955,103,4.0
15112,The Jungle Book,This animated series follows young Mowgli and ...,955,216,4.0
15113,The Jungle Book,This animated series follows young Mowgli and ...,955,137,5.0
15114,The Jungle Book,This animated series follows young Mowgli and ...,955,266,5.0
15115,The Jungle Book,This animated series follows young Mowgli and ...,955,140,5.0
15116,The Jungle Book,This animated series follows young Mowgli and ...,955,236,4.0
15117,The Jungle Book,This animated series follows young Mowgli and ...,955,141,4.0


In [35]:
# Write the joined ratings frame to CSV, leaving out the row index.
users_df.to_csv(path_or_buf='../ml_netflix.csv', index=False)