<a href="https://colab.research.google.com/github/jpgerber/Recommender-for-movie-snobs/blob/master/0_Movie_Snob_Data_Clean_%26_Wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook cleans and combines the MovieLens data with a canonical movie list.

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import zipfile, io

# Build the canonical list
# Download the "1,001 films" wiki page and collect the text of every bold (<b>) element
snob_url = 'https://1001films.fandom.com/wiki/The_List'
snob_text = requests.get(snob_url)
# Fail loudly on an HTTP error instead of silently parsing an error page into a bogus list
snob_text.raise_for_status()
soup = BeautifulSoup(snob_text.content, 'html.parser')
basic_list = soup.body.find_all('b')
thousand_list = [item.text for item in basic_list]
# Convert to a DataFrame; the row with index label 0 is discarded
thousandone_movies = pd.DataFrame(thousand_list, columns=['title']).drop(0)


In [None]:
# Get the MovieLens dataset
# Download each archive and unpack it into the current working directory
list_of_urls = ['http://files.grouplens.org/datasets/movielens/ml-latest.zip'] # I originally checked several files
for url in list_of_urls:
    ratings_small_file = requests.get(url)
    # Stop on an HTTP error; otherwise the error page body would be handed to ZipFile
    ratings_small_file.raise_for_status()
    # The context manager closes the archive once extraction is finished
    with zipfile.ZipFile(io.BytesIO(ratings_small_file.content)) as z:
        z.extractall()

gl_movies = pd.read_csv('ml-latest/movies.csv', sep = ',', header = 0) # Make the df


##### There will be lots of different types of cleaning.
First, extract the year of release from the title strings (each title ends with the year in parentheses).

In [None]:
# Derive a 'year' column (still a string at this point) for both title tables
for frame in (thousandone_movies, gl_movies):
    # Remove trailing whitespace so the closing parenthesis is the last character
    frame['title'] = frame['title'].str.rstrip()
    # The year is in parentheses at the end of the title:
    # keep the four characters that precede the final character
    frame['year'] = [name[-5:-1] for name in frame['title']]

# Then convert these strings to numbers (there is one title missing a year!)
# Define a conversion function
def ConvertYear(value):
    '''Convert a year string to an integer, falling back to 0.

    Parameters
    ----------
    value : object
        The value to convert (here, the four-character slice taken from a title).

    Returns
    -------
    int
        ``int(value)`` when the conversion succeeds, otherwise 0.
    '''
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to 0. A bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return 0
# Turn the sliced year strings into integers for both thousandone_movies and gl_movies
for movie_table in (thousandone_movies, gl_movies):
    movie_table['year'] = movie_table['year'].apply(ConvertYear)


Then define custom functions that restrict the candidates to a three-year window (the release year ± 1) and return the best fuzzy string match.

In [None]:
!pip install fuzzywuzzy 
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Specify the matching function (we only need one of the outputs)
def Matcher(title, choices):
    """Return the entry of ``choices`` that best fuzzy-matches ``title``.

    Parameters
    ----------
    title : str
        The title to look up.
    choices : list-like or dict-like of str
        Candidate titles, passed straight to ``process.extractOne``.

    Returns
    -------
    str or None
        The best-matching candidate, or None when ``extractOne`` reports no match
        (for example when there are no candidates).
    """
    best = process.extractOne(title, choices)
    # extractOne returns None when nothing matched; unpacking that would raise a TypeError
    if best is None:
        return None
    # The result is (match, score) for plain sequences and (match, score, key) for
    # dict-like choices such as a pandas Series, so index rather than unpack three values
    return best[0]
# And here's a function for using the tokenizer
def Matcher_token(title, choices):
    """Return the entry of ``choices`` that best matches ``title`` by token-sort ratio.

    Parameters
    ----------
    title : str
        The title to look up.
    choices : list-like or dict-like of str
        Candidate titles, passed straight to ``process.extractOne``.

    Returns
    -------
    str or None
        The best-matching candidate under ``fuzz.token_sort_ratio``, or None when
        ``extractOne`` reports no match (for example when there are no candidates).
    """
    best = process.extractOne(title, choices, scorer=fuzz.token_sort_ratio)
    # extractOne returns None when nothing matched; unpacking that would raise a TypeError
    if best is None:
        return None
    # The result is (match, score) for plain sequences and (match, score, key) for
    # dict-like choices such as a pandas Series, so index rather than unpack three values
    return best[0]

#Define a filter to return targets for +/-1 year only
def YearFilter(year):
    """Return the gl_movies titles whose year is ``year`` - 1, ``year`` or ``year`` + 1."""
    nearby_years = [year + offset for offset in (-1, 0, 1)]
    in_window = gl_movies['year'].isin(nearby_years)
    return gl_movies.loc[in_window, 'title']

# Match every canonical title against the MovieLens titles released within a year of it,
# using the token-sort matcher, and store the best match in 'return_match'
thousandone_movies['return_match'] = [
    Matcher_token(film.title, YearFilter(film.year))
    for film in thousandone_movies.itertuples(index=False)
]



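The following movies were misidentified by the fuzzy matcher and so needed re-coding by hand. Each line gives the canonical title followed by the MovieLens movieId assigned to it (NONE/None marks a title for which no movieId was assigned):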
Intolerance (1916) - 7243
Broken Blossoms (1919) - 6988
Häxan (1923) - 25744
Sunrise (1927) - 8125
The Unknown (1927) - 25762
A Throw of Dice (Prapancha Pash) (1929) - NONE
Tabu (1931) - 5599
The Vampire (Vampyr) (1932) - 25793
Scarface: The Shame of a Nation (1932) - 25788
Midnight Song (Ye Ban Ge Sheng) (1937) - None
Henry V (1944) - 25901
The Battle of San Pietro (1945) - 80104
Gun Crazy (1949) - 8751
Sunset Blvd. (1950) - 922
Europa '51 (1952) - 25966
Tokyo Story (1953) - 6643
The Wanton Countess (Senso) (1954) - 69911
The Sins of Lola Montes (Lola Montès) (1955) - 8143
Pather Panchali (1955) - 668
Ordet (1955) - 6981
Hill 24 Doesn't Answer (1955) - NONE
Dracula (1958) - 5649
Dog Star Man - 137579, 137581, 137583, 137585, 137587
Blonde Cobra (1963) - None
Playtime (1967) - 26171
Week End (1967) - 7749
Viy (1967) - 97065
Andrei Rublev (Andrei Rublyov) (1966) - 26150
A Touch of Zen (Hsia Nu) (1969) - 32511
M*A*S*H (1970) - 5060
The Sorrow and the Pity (La Chagrin et la Pitié) (1971) - 32853
Ceddo (1977) - 71973
Up in Smoke (1978) - 1194
Raiders of the Lost Ark (1981) - 1198
Yol (1982) - 6151
Koyaanisqatsi (1983) - 1289
The Naked Gun (1988) - 3868
Henry: Portrait of a Serial Killer (1990) - 2159
The Actress (Yuen Ling-Yuk) (1992) - 114394
Hana-Bi (1997) - 1809
Buffalo '66 (1998) - 1916
Tetsuo (1989) - 4552
A One and a Two (Yi Yi) (2000) - 4334
Y Tu Mama Tambien (2001) - 5225
Oldboy (2003) - 107314
Paranormal Activity (2007) - 71379
Precious: Based on the Novel "Push" by Sapphire (2009) - 72395
The Favourite (2018) - 183837
Vice (2018) - 127323

Join the canonical list to the MovieLens movie list

In [None]:
# Flag every film on the canonical list
thousandone_movies['canonical'] = 1

# Outer-join the MovieLens movies to the canonical list on the fuzzy-matched title;
# columns coming from the canonical side that clash get the '_canon' suffix
merged_movies = gl_movies.merge(
    thousandone_movies,
    how='outer',
    left_on='title',
    right_on='return_match',
    suffixes=('', '_canon'),
)
# Keep only the indicator from the canonical side and set it to 0 where it is missing
redundant_columns = ['year_canon', 'return_match', 'title_canon']
gl_movies = merged_movies.drop(columns=redundant_columns).fillna({'canonical': 0})

print(gl_movies.head())

In [None]:
# Now add the mismatched ones: movieIds of the hand-coded titles, in the order of the list above
handcode_ids = [
    7243,    # Intolerance (1916)
    6988,    # Broken Blossoms (1919)
    25744,   # Häxan (1923)
    8125,    # Sunrise (1927)
    25762,   # The Unknown (1927)
    5599,    # Tabu (1931)
    25793,   # The Vampire (Vampyr) (1932)
    25788,   # Scarface: The Shame of a Nation (1932)
    25901,   # Henry V (1944)
    80104,   # The Battle of San Pietro (1945)
    8751,    # Gun Crazy (1949)
    922,     # Sunset Blvd. (1950)
    25966,   # Europa '51 (1952)
    6643,    # Tokyo Story (1953)
    69911,   # The Wanton Countess (Senso) (1954)
    8143,    # The Sins of Lola Montes (Lola Montès) (1955)
    668,     # Pather Panchali (1955)
    6981,    # Ordet (1955)
    5649,    # Dracula (1958)
    137579,  # Dog Star Man
    137581,  # Dog Star Man
    137583,  # Dog Star Man
    137585,  # Dog Star Man
    137587,  # Dog Star Man
    26171,   # Playtime (1967)
    7749,    # Week End (1967)
    97065,   # Viy (1967)
    26150,   # Andrei Rublev (Andrei Rublyov) (1966)
    32511,   # A Touch of Zen (Hsia Nu) (1969)
    5060,    # M*A*S*H (1970)
    32853,   # The Sorrow and the Pity (La Chagrin et la Pitié) (1971)
    71973,   # Ceddo (1977)
    1194,    # Up in Smoke (1978)
    1198,    # Raiders of the Lost Ark (1981)
    6151,    # Yol (1982)
    1289,    # Koyaanisqatsi (1983)
    3868,    # The Naked Gun (1988)
    2159,    # Henry: Portrait of a Serial Killer (1990)
    114394,  # The Actress (Yuen Ling-Yuk) (1992)
    1809,    # Hana-Bi (1997)
    1916,    # Buffalo '66 (1998)
    4552,    # Tetsuo (1989)
    4334,    # A One and a Two (Yi Yi) (2000)
    5225,    # Y Tu Mama Tambien (2001)
    107314,  # Oldboy (2003)
    71379,   # Paranormal Activity (2007)
    72395,   # Precious: Based on the Novel "Push" by Sapphire (2009)
    183837,  # The Favourite (2018)
    127323,  # Vice (2018)
]

def handcode_row(row):
    """Return 1 if the row's movieId is hand-coded or its canonical flag equals 1, else 0."""
    is_handcoded = row['movieId'] in handcode_ids
    return 1 if is_handcoded or row['canonical'] == 1 else 0

# Recompute the canonical flag row by row so the hand-coded movieIds are included
gl_movies['canonical'] = gl_movies.apply(handcode_row, axis=1)



In [None]:
# Preview the first five rows of the flagged movie table
gl_movies.head(5)

In [None]:
# Count how many rows carry each value of the canonical flag
gl_movies['canonical'].value_counts()

In [None]:
import datetime
ratings = pd.read_csv('ml-latest/ratings.csv', sep = ',', header = 0)
# Convert the epoch-second timestamps to calendar dates in one vectorised pass.
# The dates are taken in UTC, so the result does not depend on the local timezone of the
# machine running the notebook (datetime.date.fromtimestamp uses local time).
ratings['rating_date'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.date

print(ratings.head())


In [None]:
# Attach the movie-level columns to every rating, using one row per movieId on the movie side,
# then drop the long string columns and some others
unique_movies = gl_movies.drop_duplicates(subset=['movieId'])
rating2 = ratings.merge(unique_movies, how='left', on='movieId').drop(
    columns=['title', 'genres', 'timestamp', 'year'])
rating2.head()

In [None]:

# Drop the one suspicious user, then list the remaining users by number of ratings (largest first)
rating2 = rating2.loc[rating2['userId'] != 123100]
rating2.groupby('userId')[['rating']].count().sort_values(by='rating', ascending=False)


In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
# NOTE(review): the original cell passed an undefined name `table`, which raises NameError.
# rating2 (the ratings joined with the canonical flag) is presumably the frame meant to be
# saved — confirm before relying on this file.
table = pa.Table.from_pandas(rating2)
pq.write_table(table, 'example.parquet')