In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

## Find missing IMDB IDs for Amazon Data

#### Goals

- Find missing titles from original merge.
- Find missing titles on IMDB using fuzzywuzzy.

**NOTE: We will use the same process as in 03.1_a_netflix_missing_title_ids notebook**

### 1. Import data

In [2]:
amazon = pd.read_csv("../Data/amazon_shows.csv")

In [3]:
amazon_ids = pd.read_pickle("../Data/amazon_ids.pkl")

In [4]:
amazon_missing = pd.read_pickle("../Data/amazon_missing.pkl")

### 2. Compare length of values of oringal list vs new ones

We will compare the number of rows from the original amazon data frame and the sum of the missing titles and the amazon ids.

In [5]:
len(amazon)

2136

In [6]:
len(amazon_ids) + len(amazon_missing)

2130

In [7]:
amazon["show"].nunique()

2130

6 values are missing but when looking at unique values on the original data frame, we get the same result. This means that there must be some repeated titles which may be from different years. We will make a value count for the show column in the original amazon data frame.

In [8]:
amazon["show"].value_counts().head(6)

Undercover                  2
Clifford the Big Red Dog    2
Baby Talk                   2
Rome                        2
Elizabeth I                 2
Moby Dick                   2
Name: show, dtype: int64

We will look at each of them and add them the amazon missing data frame.

In [9]:
amazon[amazon["show"] == "Moby Dick"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
583,Moby Dick,1998,7+,6.4,75%
626,Moby Dick,2011,18+,6.2,


In [10]:
amazon_ids[amazon_ids["show"] == "Moby Dick"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
470,Moby Dick,1998,7+,6.4,75%,tt0120756,tvMiniSeries,Moby Dick,Moby Dick,0.0,...,180,"Adventure,Drama,Thriller",,,,,,,,


In [11]:
amazon_missing = amazon_missing.append(amazon[(amazon["show"] == "Moby Dick") & (amazon["year"] == 2011)])

In [12]:
amazon[amazon["show"] == "Clifford the Big Red Dog"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
479,Clifford the Big Red Dog,2000,all,6.6,
1010,Clifford the Big Red Dog,2019,all,6.1,


In [13]:
amazon_ids[amazon_ids["show"] == "Clifford the Big Red Dog"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
394,Clifford the Big Red Dog,2000,all,6.6,,tt0233041,tvSeries,Clifford the Big Red Dog,Clifford the Big Red Dog,0.0,...,30,"Animation,Comedy,Family",,,,,,,,


In [14]:
amazon_missing = amazon_missing.append(amazon[(amazon["show"] == "Clifford the Big Red Dog") & (amazon["year"] == 2019)])

In [15]:
amazon[amazon["show"] == "Undercover"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
160,Undercover,2011,7+,8.9,
671,Undercover,2016,16+,6.8,75%


In [16]:
amazon_ids[amazon_ids["show"] == "Undercover"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
142,Undercover,2011,7+,8.9,,tt4266524,tvSeries,Undercover,Undercover,0.0,...,\N,\N,,,,,,,,


In [17]:
amazon_missing = amazon_missing.append(amazon[(amazon["show"] == "Undercover") & (amazon["year"] == 2016)])

In [18]:
amazon[amazon["show"] == "Baby Talk"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
1032,Baby Talk,1991,all,4.2,
1792,Baby Talk,2017,,,


In [19]:
amazon_ids[amazon_ids["show"] == "Baby Talk"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
778,Baby Talk,1991,all,4.2,,tt0101041,tvSeries,Baby Talk,Baby Talk,0.0,...,30,Comedy,,,,,,,,


In [20]:
amazon_missing = amazon_missing.append(amazon[(amazon["show"] == "Baby Talk") & (amazon["year"] == 2017)])

In [21]:
amazon[amazon["show"] == "Elizabeth I"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
329,Elizabeth I,2005,18+,7.9,
654,Elizabeth I,2017,,7.3,


In [22]:
amazon_ids[amazon_ids["show"] == "Elizabeth I"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
280,Elizabeth I,2005,18+,7.9,,tt0465326,tvMiniSeries,Elizabeth I,Elizabeth I,0.0,...,223,"Biography,Drama,History",,,,,,,,


In [23]:
amazon_missing = amazon_missing.append(amazon[(amazon["show"] == "Elizabeth I") & (amazon["year"] == 2017)])

In [24]:
amazon[amazon["show"] == "Rome"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
13,Rome,2005,18+,8.7,86%
991,Rome,2012,,5.8,


In [25]:
amazon_ids[amazon_ids["show"] == "Rome"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
12,Rome,2005,18+,8.7,86%,tt0384766,tvSeries,Rome,Rome,0.0,...,52,"Action,Drama,History",,,,,,,,


In [26]:
amazon_missing = amazon_missing.append(amazon[(amazon["show"] == "Rome") & (amazon["year"] == 2012)])

In [27]:
len(amazon_missing) + len(amazon_ids)

2136

Now we got the same values from the original tv shows and the new data frames.

In [28]:
amazon_missing = amazon_missing.reset_index(drop=True)

### 3. Find missing titles using fuzzywuzzy

We will first create a list for missing shows.

In [29]:
amazon_shows_missing = [show for show in amazon_missing["show"]]

In [30]:
len(amazon_shows_missing)

602

In [31]:
imdb = pd.read_pickle(path + "Data/imdb_tv_all.pkl")

In [32]:
imdb_titles = [title for title in imdb["primaryTitle"]]

We will use the same function as on the netflix notebook, this time we will look for values with a ration higher than 80, since we are looking at more missing shows, we need more tolerance in order to get correct data.

In [33]:
def find_shows(show):
    matches = []

    for title in imdb_titles:
        # compute ratio and remove case-sensitivity
        ratio = fuzz.ratio(title.lower(), show.lower())

        # add all matches to list with ratio > 80
        if ratio >= 80:
            matches.append((title, show, ratio))
    
    # return none if there was no match found
    if len(matches) == 0:
        return None
    return sorted(matches, key=lambda x: x[2], reverse=True)[0][0]

We will now create a new column with the matched imdb titles.

In [34]:
amazon_missing["imdb_titles"] = amazon_missing["show"].apply(lambda x: find_shows(x))

In [35]:
amazon_missing.head(10)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
0,Peep show,2003,18+,8.6,96%,Peep Show
1,The Test: A New Era For Australia's Team,2020,,9.0,,The Test: A New Era for Australia's Team
2,Made In Abyss,2017,18+,8.4,,Made in Abyss
3,Chacha Vidhayak Hain Humare,2018,,8.0,,Chacha Vidhayak Hain Hamare
4,Darker than Black,2007,16+,7.8,,Darker Than Black
5,Kenichi The Mightiest Disciple,2006,16+,8.2,,
6,James May: Our Man In Japan,2020,,8.5,,James May: Our Man in Japan
7,6teen,2004,7+,7.2,,6Teen
8,Sonic x,2003,16+,6.1,,Sonic X
9,R. L. Stine's The Haunting Hour,2010,7+,8.0,,R.L. Stine's The Haunting Hour


In [36]:
amazon_missing_2 = amazon_missing[amazon_missing["imdb_titles"].isna()].reset_index(drop=True)

In [37]:
amazon_missing = amazon_missing[~amazon_missing["imdb_titles"].isna()].reset_index(drop=True)

In [38]:
amazon_missing_2

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
0,Kenichi The Mightiest Disciple,2006,16+,8.2,,
1,Moon Embracing the Sun,2012,7+,8.0,,
2,StarTalk with Neil deGrasse Tyson,2015,all,7.8,,
3,The Mysterious Play,1995,16+,7.9,,
4,Worricker,2011,,8.9,,
...,...,...,...,...,...,...
319,Honolulu P.D.,2003,,,,
320,Mad Fabricators Society,2015,,,,
321,My Sister-in-Law,2015,,,,
322,How To Self-Publish,2017,,,,


We still have 324 titles that were not found. We will first merge the new imdb ids to the amazon_ids data frame and then continue to review the missing titles.

### 4. Merge IMDB IDs

We will still merge on the year, in order to be sure that we are not merging a remake or a similar title.

In [39]:
amazon_missing = amazon_missing.merge(imdb, how="left", left_on=["imdb_titles", "year"]
                                      , right_on=["primaryTitle", "startYear"])

We will separate the missing titles that were not matched.

In [40]:
amazon_missing_3 = amazon_missing[amazon_missing["tconst"].isna()].reset_index(drop=True)

In [41]:
amazon_missing = amazon_missing[~amazon_missing["tconst"].isna()].reset_index(drop=True)

We will join the new ids found to the amazon_ids dataframe.

In [42]:
amazon_ids = amazon_ids.append(amazon_missing).reset_index(drop=True)

In [43]:
amazon_missing_3 = amazon_missing_3[["show", "year", "rating", "imdb", "rotten_tomatoes", "imdb_titles"]]

We will take a look on the titles that weren't matched.

In [44]:
amazon_missing_3

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
0,Charles II: The Power and The Passion,2003,16+,7.5,,Whitlam: The Power and the Passion
1,Empresses In The Palace,2012,7+,8.4,,Empresses in the Palace
2,R.I.P Files,2008,,7.6,,The R.I.P. Files
3,The History of Tom Jones,1997,,7.6,,The Story of Moses
4,LEGO Friends,2012,all,5.2,,Lego Friends
...,...,...,...,...,...,...
149,Great Indian Wars,2009,,,,Great Canadian Parks
150,Food Stories,2017,,,,Four Stories
151,My Battalion,2014,,,,The Battalion
152,Quick Bites,2017,,,,QuickBites


Some of them don't have a matching title, we will now add a new column showing the fuzzy ratio and remove those below 90.

In [45]:
fuzzy_ratio = []

for row in range(len(amazon_missing_3)):
    fuzzy_ratio.append(fuzz.ratio(amazon_missing_3.loc[row, "show"].lower(), amazon_missing_3.loc[row, "imdb_titles"].lower()))

amazon_missing_3["fuzzy_ratio"] = fuzzy_ratio

In [46]:
amazon_missing_3_keep = amazon_missing_3[amazon_missing_3["fuzzy_ratio"] >= 90].drop(columns="fuzzy_ratio")

We will keep this titles and merge them with their IMDB ID, and add them to the amazon_ids data frame.

In [47]:
amazon_missing_3_keep = amazon_missing_3_keep.merge(imdb, how="left", left_on=["imdb_titles"], right_on=["primaryTitle"])

In [48]:
amazon_missing_3_keep[amazon_missing_3_keep["tconst"].isna()]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


We don't have missing ids on this new data frame. We will now add it to the amazon_ids data frame.

In [49]:
len(amazon_missing_3)

154

In [50]:
len(amazon_missing_3_keep)

65

In [51]:
amazon_ids = amazon_ids.append(amazon_missing_3_keep).reset_index(drop=True)

In [52]:
amazon_ids = amazon_ids.drop_duplicates("tconst")

In [53]:
len(amazon_ids)/len(amazon)

0.7523408239700374

We will now work on amazon_missing_2 data frame, we will look for imdb titles decreasing the ratio to 60.

In [54]:
def find_shows2(show):
    matches = []

    for title in imdb_titles:
        # compute ratio and remove case-sensitivity
        ratio = fuzz.ratio(title.lower(), show.lower())

        # add all matches to list with ratio > 60
        if ratio >= 60:
            matches.append((title, show, ratio))
    if len(matches) == 0:
        return None
    return sorted(matches, key=lambda x: x[2], reverse=True)[0][0]

In [55]:
amazon_missing_2["imdb_titles"] = amazon_missing_2["show"].apply(lambda x: find_shows2(x))

In [56]:
amazon_missing_2.head(20)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
0,Kenichi The Mightiest Disciple,2006,16+,8.2,,The Mightiest Disciple Kenichi
1,Moon Embracing the Sun,2012,7+,8.0,,The Moon That Embraces the Sun
2,StarTalk with Neil deGrasse Tyson,2015,all,7.8,,SCIENTH: with Neil deMike Tyson
3,The Mysterious Play,1995,16+,7.9,,Mysterious Planet
4,Worricker,2011,,8.9,,Whicker
5,Kannazuki no Miko,2004,16+,6.6,,Kanata no ko
6,Soviet Storm: WWII in the East,2011,7+,,,True Stories: Life in the USA
7,"Kokkoku, Moment by Moment",2018,18+,7.0,,Mente y movimiento
8,An American Girl Story - Melody 1963: Love Has...,2016,all,7.6,,
9,Drop Kick on my Devil!,2018,,6.4,,


From the 20 first elements, it doesn't seem that we have found great matches. We will compute the fuzzy ratio in a new column and see if we can remove some.

In [57]:
fuzzy_ratio = []

for row in range(len(amazon_missing_2)):
    if amazon_missing_2.loc[row, "imdb_titles"]:
        fuzzy_ratio.append(fuzz.ratio(amazon_missing_2.loc[row, "show"].lower(), amazon_missing_2.loc[row, "imdb_titles"].lower()))
    else:
        fuzzy_ratio.append(None)
amazon_missing_2["fuzzy_ratio"] = fuzzy_ratio

In [58]:
amazon_missing_2[amazon_missing_2["fuzzy_ratio"] >= 75].head(20)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles,fuzzy_ratio
3,The Mysterious Play,1995,16+,7.9,,Mysterious Planet,78.0
4,Worricker,2011,,8.9,,Whicker,75.0
22,Behind Tasty,2017,,,,Behind Bars,78.0
23,The Practice (2014),2014,,4.4,,The Practice,77.0
24,One-Eyed Horse: The Series,2009,,3.7,,Lonesome Dove: The Series,75.0
25,Star April,2017,,6.8,,Star Trails,76.0
31,Trinity and Beyond,2018,,,,Flexibility and Beyond,75.0
39,5 Facts,2014,,,,Facets,77.0
41,Project Restoration,2016,,,,Project Detention,78.0
42,#ThatsHarassment,2018,,,,Harassment,77.0


We can still see some titles that are not the same, so we will remove this values and work with what we have already.

We will now calculate the ratio of titles with an imdb id match and the original titles.

In [59]:
len(amazon_ids)/len(amazon)

0.7523408239700374

This means that we will be using 75% of the original data.

### 5. Cleaning final data

We will create a final data frame including:
- show
- year
- rating
- imdb
- rotten_tomatoes
- imdb_id
- all data from title basics

First, we will need to create the imdb_id column, this will have tconst and titleId values.

In [60]:
amazon_ids["imdb_id"] = np.where(amazon_ids["tconst"].isna(), amazon_ids["titleId"], amazon_ids["tconst"])

In [61]:
amazon_ids["imdb_id"].isna().value_counts()

False    1607
Name: imdb_id, dtype: int64

This means we have all the IMDB IDs for all the titles in this final data frame. We will now remove columns that are not show, year, rating, imdb, rotten_tomatoes and imdb_id and merge again.

In [62]:
to_drop = [col for col in amazon_ids.columns if col not in ["show", "year", "rating", "imdb", "rotten_tomatoes", "imdb_id"]]

In [63]:
amazon_ids = amazon_ids.drop(columns=to_drop)

In [64]:
amazon_ids.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id
0,The Wire,2002,18+,9.3,94%,tt0306414
1,The Sopranos,1999,18+,9.2,92%,tt0141842
2,Band of Brothers,2001,18+,9.4,94%,tt0185906
3,Vikings,2013,18+,8.6,93%,tt2306299
4,Mr. Robot,2015,18+,8.5,94%,tt4158110


Now we will merge all data from title basics.

In [65]:
amazon_ids = amazon_ids.merge(imdb, how="left", left_on="imdb_id", right_on="tconst")

In [66]:
amazon_ids.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,The Wire,2002,18+,9.3,94%,tt0306414,tt0306414,tvSeries,The Wire,The Wire,0,2002.0,2002.0,59,"Crime,Drama,Thriller"
1,The Sopranos,1999,18+,9.2,92%,tt0141842,tt0141842,tvSeries,The Sopranos,The Sopranos,0,1999.0,1999.0,55,"Crime,Drama"
2,Band of Brothers,2001,18+,9.4,94%,tt0185906,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,0,2001.0,2001.0,594,"Action,Drama,History"
3,Vikings,2013,18+,8.6,93%,tt2306299,tt2306299,tvSeries,Vikings,Vikings,0,2013.0,2013.0,44,"Action,Adventure,Drama"
4,Mr. Robot,2015,18+,8.5,94%,tt4158110,tt4158110,tvSeries,Mr. Robot,Mr. Robot,0,2015.0,2015.0,49,"Crime,Drama,Thriller"


In [67]:
amazon_ids = amazon_ids.drop(columns="tconst")

### 6. Export final data

In [68]:
# amazon_ids.to_pickle("../Data/amazon_final.pkl")