In [1]:
import pandas as pd
import json, os, csv
import re

In [2]:
# Resolve the data directory: the `data` folder located in the parent of the
# current working directory.

data_dir = os.path.join(os.path.split(os.getcwd())[0], 'data')
data_dir

'Z:\\Jupyter\\Netflix_Movie_Filter\\data'

In [3]:
# Input files, both looked up inside `data_dir`:
#   - movie_list.json             -> loaded later as the IMDb frame
#   - baseline_data_ful_info.csv  -> loaded later as the Amazon review frame
imdb_file_path = os.path.join(data_dir, "movie_list.json")
amazon_file_path = os.path.join(data_dir, "baseline_data_ful_info.csv")

In [4]:
%%time
# Load both sources into DataFrames: the JSON file into `df_imdb`, the CSV
# file into `df_amazon`. The cell is timed because the reads are not instant.
df_imdb = pd.read_json(imdb_file_path)
df_amazon = pd.read_csv(amazon_file_path)

Wall time: 4.58 s


In [5]:
# (rows, columns) of each source frame, IMDb first and Amazon second,
# one per output line.
print(df_imdb.shape, df_amazon.shape, sep="\n")

(65406, 17)
(14658, 16)


In [6]:
# Preview the first rows of the IMDb frame to check columns and value formats.
df_imdb.head()

Unnamed: 0,actors,countries,description,directors,genre,imdb_url,img_url,languages,metascore,rating,runtime,tagline,title,users_rating,votes,year,votes_int
0,"[Kevin Costner, Kelly Preston, John C. Reilly,...",[USA],After 19 years of playing the game he's loved ...,[Sam Raimi],"[Drama, Romance, Sport]",https://www.imdb.com/title/tt0126916/,https://m.media-amazon.com/images/M/MV5BZDgzY2...,[English],43.0,PG-13,137 min,Billy Chapel must choose between the woman he ...,For Love of the Game,6.6,31110,1999.0,31110.0
1,"[Kurt Russell, Val Kilmer, Sam Elliott, Bill P...",[USA],A successful lawman's plans to retire anonymou...,"[George P. Cosmatos, Kevin Jarre]","[Action, Biography, Drama, History, Western]",https://www.imdb.com/title/tt0108358/,https://m.media-amazon.com/images/M/MV5BODRkYz...,"[English, Latin]",50.0,R,130 min,I'm your huckleberry!,Tombstone,7.8,122341,1993.0,122341.0
10,"[Saoirse Ronan, Laurie Metcalf, Tracy Letts, L...",[USA],"In 2002, an artistically inclined seventeen-ye...",[Greta Gerwig],"[Comedy, Drama]",https://www.imdb.com/title/tt4925292/,https://m.media-amazon.com/images/M/MV5BODhkZG...,"[English, Spanish]",94.0,R,94 min,Time to Fly.,Lady Bird,7.4,225508,2017.0,225508.0
100,"[Ethan Hawke, Uma Thurman, Gore Vidal, Xander ...",[USA],A genetically inferior man assumes the identit...,[Andrew Niccol],"[Drama, Sci-Fi, Thriller]",https://www.imdb.com/title/tt0119177/,https://m.media-amazon.com/images/M/MV5BNDQxOT...,"[English, Esperanto]",64.0,PG-13,106 min,How do you hide when you're running from yours...,Gattaca,7.8,274534,1997.0,274534.0
1000,"[Gabriel Bateman, Darby Camp, Kiele Sanchez, G...","[United Arab Emirates, USA]",Two school kids strike up a friendship with an...,[Brandon Camp],"[Crime, Drama, Family]",https://www.imdb.com/title/tt1799516/,https://m.media-amazon.com/images/M/MV5BMTc5Mz...,[English],53.0,TV-PG,87 min,A Hero Comes Home,Benji,6.3,3518,2018.0,3518.0


In [7]:
# Number of distinct raw titles in the IMDb frame. Comparing this with the
# row count shows whether the same title appears on more than one row.
print(len(df_imdb.title.unique()))

57208


In [8]:
# Preview the first rows of the Amazon review frame (one row per review).
df_amazon.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,title,char_count,reviewer_rating_count,title_rating_count
0,5.0,True,"04 13, 2015",A2OI3HU0KN4118,5119367,{'Format:': ' DVD'},Shantal N.,Thanks for everything.,Five Stars,1428883200,,,Joseph VHS,22.0,78,50
1,5.0,True,"04 15, 2014",A385J7MEM932DL,5119367,{'Format:': ' DVD'},Jim,This one was really interesting! Loved it com...,Bible Story,1397520000,,,Joseph VHS,208.0,82,50
2,5.0,True,"04 27, 2017",AIVTZDMB297BG,767001311,{'Format:': ' DVD'},Alan Denman,funny,Five Stars,1493251200,,,The Very Best of the Bob Newhart Show 6 VHS,5.0,101,42
3,4.0,True,"12 2, 2016",AXOX48S8K994V,767001311,{'Format:': ' DVD'},Amazon Customer,Great series,Four Stars,1480636800,,,The Very Best of the Bob Newhart Show 6 VHS,12.0,64,42
4,4.0,True,"10 13, 2016",A3J9LLWHZH15HG,767001311,{'Format:': ' DVD'},JJ,I happen to like Bob Newhart even though he is...,"Well done, amusing series.",1476316800,,,The Very Best of the Bob Newhart Show 6 VHS,535.0,93,42


In [9]:
# Number of distinct raw product titles among the Amazon reviews.
len(df_amazon.title.unique())

1284

In [10]:
def preprocessing(text):
    """
    Turn a raw title into a compact key used for matching titles.

    Steps, in order:
      1. drop every occurrence of the literal substring " VHS";
      2. lower-case the result;
      3. keep only characters for which ``str.isalnum`` is true, which
         discards spaces, punctuation and symbols. Digits are kept, so
         "Red 2" becomes "red2" and stays distinct from "Red".

    Parameters
    ----------
    text : str
        Raw title.

    Returns
    -------
    str
        The normalized title (possibly empty).
    """
    lowered = text.replace(" VHS", "").lower()
    # Filtering on isalnum removes whitespace and punctuation in a single pass.
    return ''.join(filter(str.isalnum, lowered))

In [11]:
%%time
# Add a `title_normalize` column to both frames by passing every raw title
# through `preprocessing`. This column is the join key used by the merge.
df_amazon["title_normalize"] = df_amazon.title.apply(preprocessing)
df_imdb["title_normalize"] = df_imdb.title.apply(preprocessing)

Wall time: 5.55 s


In [12]:
# Inner join on the normalized title: keeps only Amazon review rows whose
# normalized title also occurs in the IMDb frame. Both frames carry a `title`
# column, so pandas suffixes them as `title_x` (Amazon) and `title_y` (IMDb).
# NOTE(review): the key is presumably not unique on the IMDb side (the recorded
# run shows 65406 rows but only 57208 distinct raw titles), so one review can
# match several same-named movies and appear more than once in the result.
# Confirm this is intended, or de-duplicate the IMDb titles before merging.
df_merge = df_amazon.merge(df_imdb, how = "inner", on = "title_normalize")

In [13]:
# Size of the merged frame as (rows, columns).
print(df_merge.shape)

(9525, 34)


In [14]:
# Columns of the merged frame: Amazon review columns first, then the IMDb
# columns; the clashing `title` columns show up as `title_x` / `title_y`.
df_merge.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image', 'title_x', 'char_count', 'reviewer_rating_count',
       'title_rating_count', 'title_normalize', 'actors', 'countries',
       'description', 'directors', 'genre', 'imdb_url', 'img_url', 'languages',
       'metascore', 'rating', 'runtime', 'tagline', 'title_y', 'users_rating',
       'votes', 'year', 'votes_int'],
      dtype='object')

In [15]:
# Preview the first merged rows: each review row now carries the matched
# IMDb metadata alongside it.
df_merge.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,...,languages,metascore,rating,runtime,tagline,title_y,users_rating,votes,year,votes_int
0,4.0,True,"09 14, 2017",A1BT9J2I6DC246,767023765,{'Format:': ' DVD'},Debbie,good,Four Stars,1505347200,...,[English],,,89 min,"Washington needed a victory, they got a miracle.",The Crossing,7.1,1763,2000.0,1763.0
1,5.0,True,"10 27, 2013",A2K53OYO4JJO28,767023765,{'Format:': ' DVD'},Pete N. Austira,Every has visons of Christmas morning - waking...,A New Look at Christmas,1382832000,...,[English],,,89 min,"Washington needed a victory, they got a miracle.",The Crossing,7.1,1763,2000.0,1763.0
2,5.0,True,"09 12, 2013",A2TY359PR45ATP,767023765,{'Format:': ' DVD'},"R. Edward Merrell, Jr.",I've read several histories of Washington and/...,Good History Lesson!!,1378944000,...,[English],,,89 min,"Washington needed a victory, they got a miracle.",The Crossing,7.1,1763,2000.0,1763.0
3,5.0,True,"05 21, 2013",A192UPT6KST3E2,767023765,{'Format:': ' DVD'},Gary W. Phelps,This is an incredible and accurate portrayal o...,The Crossing,1369094400,...,[English],,,89 min,"Washington needed a victory, they got a miracle.",The Crossing,7.1,1763,2000.0,1763.0
4,5.0,True,"04 10, 2009",A2XRMQA6PJ5ZJ8,767023765,{'Format:': ' DVD'},Roger J. Buffington,As our schools manage to do a poorer and poore...,Very well done -- splendid historical drama,1239321600,...,[English],,,89 min,"Washington needed a victory, they got a miracle.",The Crossing,7.1,1763,2000.0,1763.0


In [16]:
# Number of distinct normalized titles in the merged frame, i.e. how many
# titles were matched between the two sources.
len(df_merge.title_normalize.unique())

524

In [18]:
# Column listing of the merged frame (same call as the earlier
# `df_merge.columns` cell; kept for reference).
df_merge.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image', 'title_x', 'char_count', 'reviewer_rating_count',
       'title_rating_count', 'title_normalize', 'actors', 'countries',
       'description', 'directors', 'genre', 'imdb_url', 'img_url', 'languages',
       'metascore', 'rating', 'runtime', 'tagline', 'title_y', 'users_rating',
       'votes', 'year', 'votes_int'],
      dtype='object')

In [21]:
# Shape of the merged frame as (rows, columns).
df_merge.shape

(9525, 34)

In [17]:
# List every distinct normalized title present after the merge. No hard-coded
# upper bound is used, so the listing stays complete if the number of matched
# titles changes when the input data is refreshed.
df_merge.title_normalize.unique()

array(['thecrossing', 'steelmagnolias', 'easyrider',
       'closeencountersofthethirdkind', 'frightnight', 'annie',
       'doloresclaiborne', 'draculadeadandlovingit',
       'sheworeayellowribbon', 'sandsofiwojima', 'tokillamockingbird',
       'sweetdreams', 'roostercogburn', 'forricherorpoorer',
       'thebreakfastclub', '1941', 'thechangeling', 'oldyeller',
       'prettywoman', 'enchanted', 'goodwillhunting',
       'oneflewoverthecuckoosnest', 'contact', 'seven', 'bladerunner',
       'ladyhawke', 'northbynorthwest', 'asummerplace', 'victorvictoria',
       'titanic', 'thefirstwivesclub', 'inharmsway',
       'indianajonesandthetempleofdoom', 'roadhouse', 'poltergeist',
       'fiddlerontheroof', 'chittychittybangbang', 'mrmom',
       'thesilenceofthelambs', 'thebestyearsofourlives',
       'howthewestwaswon',
       'drstrangeloveorhowilearnedtostopworryingandlovethebomb',
       'sensesensibility', 'themuppetchristmascarol', 'ulysses',
       'warlock', 'oklahoma', 'slingbl

In [20]:
# Merged rows per normalized title, most frequent first. Unusually large
# counts are worth inspecting: they can come from popular titles or from one
# review matching several IMDb entries that share the same normalized title.
df_merge.title_normalize.value_counts()

red                                    258
theloneranger                          111
achristmascarol                        105
robinhood                               96
safehouse                               85
sherlockholmes                          72
journeytothecenteroftheearth            72
thedaytheearthstoodstill                69
prometheus                              66
therevenant                             66
unknown                                 64
inharmsway                              63
sabotage                                57
totalrecall                             56
red2                                    55
anastasia                               55
argo                                    54
independenceday                         54
thelordoftheringsthereturnoftheking     52
guardiansofthegalaxy                    52
nowyouseeme                             51
taken                                   50
underworld                              50
payback    