In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

## Find missing IMDB IDs for Netflix Data

#### Goals

- Find missing titles from original merge.
- Find missing titles on IMDB using fuzzywuzzy.

### 1. Import data

In [2]:
# Load the original Netflix show list from CSV.
netflix = pd.read_csv("../Data/netflix_shows.csv")

In [3]:
# Load the pickled netflix_ids frame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
netflix_ids = pd.read_pickle("../Data/netflix_ids.pkl")

In [4]:
# Load the pickled netflix_missing frame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
netflix_missing = pd.read_pickle("../Data/netflix_missing.pkl")

### 2. Compare the row counts of the original list vs. the new ones

We will compare the number of rows from the original netflix data frame and the sum of the missing titles and the netflix ids.

In [5]:
# Row count of the original Netflix frame.
netflix.shape[0]

1915

In [6]:
# Combined row count of the matched and the missing frames.
netflix_ids.shape[0] + netflix_missing.shape[0]

1909

In [7]:
# Number of distinct (non-null) show titles in the original frame.
len(netflix["show"].dropna().unique())

1909

Six rows are missing, but the number of unique titles in the original data frame (1909) matches the combined count. This means that there must be some repeated titles, which may be from different years. We will run a value count on the show column of the original netflix data frame.

In [8]:
# The six most frequent show titles (value_counts sorts by count, descending).
netflix["show"].value_counts().iloc[:6]

The Good Cop        2
Inspector Gadget    2
Danger Mouse        2
Maniac              2
Charmed             2
The Code            2
Name: show, dtype: int64

We will look at each of them and add the unmatched entry to the netflix missing data frame.

In [9]:
# Rows of the original list titled "The Code".
netflix.loc[netflix["show"].eq("The Code")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
483,The Code,2011,,7.4,100%
637,The Code,2014,,7.4,


In [10]:
# Rows of netflix_ids titled "The Code".
netflix_ids.loc[netflix_ids["show"].eq("The Code")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
396,The Code,2011,,7.4,100%,tt2060305,tvMiniSeries,The Code,The Code,0.0,...,\N,Documentary,,,,,,,,


In [11]:
# Add the 2014 "The Code" row from the original list to netflix_missing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_missing = pd.concat(
    [netflix_missing, netflix[(netflix["show"] == "The Code") & (netflix["year"] == 2014)]]
)

In [12]:
# Rows of the original list titled "Maniac".
netflix.loc[netflix["show"].eq("Maniac")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
138,Maniac,2018,16+,7.8,85%
1244,Maniac,2015,,7.0,


In [13]:
# Rows of netflix_ids titled "Maniac".
netflix_ids.loc[netflix_ids["show"].eq("Maniac")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
117,Maniac,2018,16+,7.8,85%,tt5580146,tvMiniSeries,Maniac,Maniac,0.0,...,40,"Comedy,Drama,Sci-Fi",,,,,,,,


In [14]:
# Add the 2015 "Maniac" row from the original list to netflix_missing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_missing = pd.concat(
    [netflix_missing, netflix[(netflix["show"] == "Maniac") & (netflix["year"] == 2015)]]
)

In [15]:
# Rows of the original list titled "Danger Mouse".
netflix.loc[netflix["show"].eq("Danger Mouse")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
825,Danger Mouse,1981,7+,7.4,
1197,Danger Mouse,2015,all,7.1,


In [16]:
# Rows of netflix_ids titled "Danger Mouse".
netflix_ids.loc[netflix_ids["show"].eq("Danger Mouse")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
621,Danger Mouse,1981,7+,7.4,,tt0081848,tvSeries,Danger Mouse,Danger Mouse,0.0,...,25,"Animation,Comedy,Family",,,,,,,,


In [17]:
# Add the 2015 "Danger Mouse" row from the original list to netflix_missing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_missing = pd.concat(
    [netflix_missing, netflix[(netflix["show"] == "Danger Mouse") & (netflix["year"] == 2015)]]
)

In [18]:
# Rows of the original list titled "Charmed".
netflix.loc[netflix["show"].eq("Charmed")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
134,Charmed,1998,16+,7.1,75%
656,Charmed,2018,16+,4.5,


In [19]:
# Rows of netflix_ids titled "Charmed".
netflix_ids.loc[netflix_ids["show"].eq("Charmed")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
113,Charmed,1998,16+,7.1,75%,tt0158552,tvSeries,Charmed,Charmed,0.0,...,42,"Drama,Fantasy,Mystery",,,,,,,,


In [20]:
# Add the 2018 "Charmed" row from the original list to netflix_missing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_missing = pd.concat(
    [netflix_missing, netflix[(netflix["show"] == "Charmed") & (netflix["year"] == 2018)]]
)

In [21]:
# Rows of the original list titled "The Good Cop".
netflix.loc[netflix["show"].eq("The Good Cop")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
589,The Good Cop,2018,16+,7.0,50%
1354,The Good Cop,2015,,7.5,


In [22]:
# Rows of netflix_ids titled "The Good Cop".
netflix_ids.loc[netflix_ids["show"].eq("The Good Cop")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
470,The Good Cop,2018,16+,7.0,50%,tt7183074,tvSeries,The Good Cop,The Good Cop,0.0,...,60,"Comedy,Crime,Drama",,,,,,,,


In [23]:
# Add the 2015 "The Good Cop" row from the original list to netflix_missing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_missing = pd.concat(
    [netflix_missing, netflix[(netflix["show"] == "The Good Cop") & (netflix["year"] == 2015)]]
)

In [24]:
# Rows of the original list titled "Inspector Gadget".
netflix.loc[netflix["show"].eq("Inspector Gadget")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
606,Inspector Gadget,1983,7+,6.8,
1466,Inspector Gadget,2015,all,4.3,


In [25]:
# Rows of netflix_ids titled "Inspector Gadget".
netflix_ids.loc[netflix_ids["show"].eq("Inspector Gadget")]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
482,Inspector Gadget,1983,7+,6.8,,tt0085033,tvSeries,Inspector Gadget,Inspector Gadget,0.0,...,30,"Action,Adventure,Animation",,,,,,,,


In [26]:
# Add the 2015 "Inspector Gadget" row from the original list to netflix_missing.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_missing = pd.concat(
    [netflix_missing, netflix[(netflix["show"] == "Inspector Gadget") & (netflix["year"] == 2015)]]
)

In [27]:
# Combined row count of the missing and the matched frames.
netflix_missing.shape[0] + netflix_ids.shape[0]

1915

Now the combined row count of the new data frames matches the row count of the original TV show data frame (1915).

In [28]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels instead of keeping them as a column.
netflix_missing = netflix_missing.reset_index(drop=True)

### 3. Find missing titles using fuzzywuzzy

We will first create a list for missing shows.

In [29]:
# Plain Python list of the titles in netflix_missing, in frame order.
netflix_shows_missing = list(netflix_missing["show"])

In [30]:
# Number of titles that still need an IMDB match.
len(netflix_shows_missing)

204

In [31]:
# Load the IMDB title data. The original cell built the path from a `path`
# variable that is never defined in this notebook (NameError on a fresh
# kernel), so the same relative "../Data/" prefix as the other load cells is
# used instead.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
imdb = pd.read_pickle("../Data/imdb_tv_all.pkl")

We will create a function to find the most similar title on the imdb title basics file.

In [32]:
# Plain Python list of every primaryTitle in the IMDB frame, used as the candidate pool for matching.
imdb_titles = list(imdb["primaryTitle"])

In [33]:
def find_shows(show, titles=None, min_ratio=60, scorer=None):
    """Return the candidate title most similar to ``show``.

    Both strings are lower-cased before scoring, so the comparison is
    case-insensitive.

    Parameters
    ----------
    show : str
        Title to look up.
    titles : iterable of str, optional
        Candidate titles. Defaults to the notebook-level ``imdb_titles`` list.
        Non-string entries (e.g. NaN) are skipped.
    min_ratio : int, optional
        Lowest score (inclusive) a candidate needs to count as a match.
        Defaults to 60.
    scorer : callable, optional
        ``scorer(a, b)`` returning a similarity score. Defaults to
        ``fuzz.ratio``.

    Returns
    -------
    str or None
        The best-scoring candidate in its original casing (the earliest one
        when several candidates tie), or None when ``show`` is not a string
        or no candidate reaches ``min_ratio``.
    """
    if titles is None:
        titles = imdb_titles
    if scorer is None:
        scorer = fuzz.ratio

    # A missing / non-text title cannot be matched.
    if not isinstance(show, str):
        return None

    # Lower-case the query once instead of once per candidate.
    target = show.lower()

    best_title = None
    best_ratio = None
    for title in titles:
        # Skip missing or non-text candidates instead of failing on .lower().
        if not isinstance(title, str):
            continue

        ratio = scorer(title.lower(), target)

        # Keep candidates with ratio >= min_ratio; the strict ">" keeps the
        # first candidate when several share the best score.
        if ratio >= min_ratio and (best_ratio is None or ratio > best_ratio):
            best_title, best_ratio = title, ratio

    return best_title


We will make a test with the first value from the missing shows list.

In [34]:
# First title in the missing list, used as a quick test input.
netflix_shows_missing[0]

'YOU'

In [35]:
# Run the matcher on that first title to sanity-check it.
find_shows(netflix_shows_missing[0])

'You'

We will now create a new column with the matched imdb titles.

In [36]:
# Match every missing show against the IMDB titles; rows without a match get None.
netflix_missing["imdb_titles"] = netflix_missing["show"].apply(find_shows)

In [37]:
# First ten rows, to eyeball the matched titles.
netflix_missing.iloc[:10]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
0,YOU,2018,18+,7.8,91%,You
1,Marvel's Jessica Jones,2015,18+,8.0,83%,Jessica Jones
2,HAPPY!,2017,18+,8.2,84%,Happy!
3,Haikyu!!,2014,16+,8.7,,Haikyuu!!
4,F is for Family,2015,18+,8.0,86%,F Is for Family
5,GHOUL,2018,18+,7.1,83%,Ghoul
6,Time: The Kalief Browder Story,2017,16+,8.5,100%,TIME: The Kalief Browder Story
7,anohana: The Flower We Saw That Day,2011,16+,8.2,,Anohana: The Flower We Saw That Day
8,My Next Guest Needs No Introduction With David...,2018,18+,7.9,83%,My Next Guest Needs No Introduction with David...
9,Valhalla Murders,2019,18+,7.1,,The Valhalla Murders


From the first 10 missing titles we can see that our function did a good job. We will now take a look at the titles for which no match was found.

In [38]:
# Rows for which the matcher returned no IMDB title.
netflix_missing.loc[netflix_missing["imdb_titles"].isnull()]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
39,Yunus Emre: Aşkın Yolculuğu,2015,,7.8,,
102,Bangkok รัก Stories 2 ตอน ไม่เดียงสา,2018,,6.9,,
142,Тобот,2010,,6.2,,
151,阳关道,2018,,3.2,,


In [39]:
# Keep only the rows that received a matched IMDB title.
netflix_missing = netflix_missing[netflix_missing["imdb_titles"].notna()]

Since we have only 4 titles and they are in another language, we will remove them from our data.

### 4. Merge IMDB IDs

We will still merge on the year, in order to be sure that we are not merging a remake or a similar title.

In [40]:
# Left-join the IMDB data on (matched title, year) so every netflix_missing row is kept.
# NOTE(review): if several IMDB entries share a title and start year, this join
# would repeat the Netflix row once per entry - confirm whether that can happen.
netflix_missing = pd.merge(
    netflix_missing,
    imdb,
    how="left",
    left_on=["imdb_titles", "year"],
    right_on=["primaryTitle", "startYear"],
)

In [41]:
# Preview the first five merged rows.
netflix_missing.head(n=5)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,YOU,2018,18+,7.8,91%,You,tt7335184,tvSeries,You,You,0.0,2018.0,2018.0,45,"Crime,Drama,Romance"
1,Marvel's Jessica Jones,2015,18+,8.0,83%,Jessica Jones,tt2357547,tvSeries,Jessica Jones,Jessica Jones,0.0,2015.0,2015.0,56,"Action,Crime,Drama"
2,HAPPY!,2017,18+,8.2,84%,Happy!,tt2452242,tvSeries,Happy!,Happy!,0.0,2017.0,2017.0,60,"Action,Comedy,Crime"
3,Haikyu!!,2014,16+,8.7,,Haikyuu!!,tt3398540,tvSeries,Haikyuu!!,Haikyuu!!,0.0,2014.0,2014.0,24,"Animation,Comedy,Drama"
4,F is for Family,2015,18+,8.0,86%,F Is for Family,tt4326894,tvSeries,F Is for Family,F Is for Family,0.0,2015.0,2015.0,30,"Animation,Comedy,Drama"


We will now take a look at the titles that didn't get an IMDB match in the merge.

In [42]:
# Rows that did not pick up a tconst in the merge.
netflix_missing.loc[netflix_missing["tconst"].isnull()]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
15,WHAT / IF,2019,18+,6.3,,What If,,,,,,,,,
20,K-Project,2012,18+,7.2,,B-Project,,,,,,,,,
21,Cooked With Cannabis,2020,18+,7.0,80%,Coffee with Anna,,,,,,,,,
22,El Chapulín,1972,all,8.5,,El Chapo,,,,,,,,,
24,Terrace House: Boys & Girls in the City,2015,7+,8.2,,Terrace House Boys x Girls Next Door,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,Voice (OCN),2017,,,,Voices 4 Oceans,,,,,,,,,
191,The Golden Path,2012,,,,The Golden Bat,,,,,,,,,
192,The Underwear,2017,,,,The Undercard,,,,,,,,,
193,Taste of the Country,2011,,,,Heart of the Country,,,,,,,,,


We can see that most of these titles don't have a similar IMDB title; because of this, we will drop these rows as well.

In [43]:
# Keep the rows that received a tconst, renumber them, then keep the first row per tconst.
netflix_ids_new = (
    netflix_missing.loc[netflix_missing["tconst"].notna()]
    .reset_index(drop=True)
    .drop_duplicates(subset="tconst")
)

In [44]:
# Add the newly matched rows to netflix_ids.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
netflix_ids = pd.concat([netflix_ids, netflix_ids_new])

In [45]:
# Preview the first five rows of the combined frame.
netflix_ids.head(n=5)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle,imdb_titles
0,Breaking Bad,2008,18+,9.5,96%,tt0903747,tvSeries,Breaking Bad,Breaking Bad,0.0,...,"Crime,Drama,Thriller",,,,,,,,,
1,Stranger Things,2016,16+,8.8,93%,tt4574334,tvSeries,Stranger Things,Stranger Things,0.0,...,"Drama,Fantasy,Horror",,,,,,,,,
2,Sherlock,2010,16+,9.1,78%,tt1475582,tvSeries,Sherlock,Sherlock,0.0,...,"Crime,Drama,Mystery",,,,,,,,,
3,Better Call Saul,2015,18+,8.7,97%,tt3032476,tvSeries,Better Call Saul,Better Call Saul,0.0,...,"Crime,Drama",,,,,,,,,
4,The Office,2005,16+,8.9,81%,tt0386676,tvSeries,The Office,The Office,0.0,...,Comedy,,,,,,,,,


We will now calculate the ratio of titles with an IMDB ID match to the titles in the original data.

In [46]:
# Share of the original rows that now have an IMDB ID.
netflix_ids.shape[0] / netflix.shape[0]

0.9535248041775457

This means that we will be using 95% of the original data.

### 5. Cleaning final data

We will create a final data frame including:
- show
- year
- rating
- imdb
- rotten_tomatoes
- imdb_id
- all data from title basics

First, we will need to create the imdb_id column, this will have tconst and titleId values.

In [47]:
# Use tconst where it is present, otherwise fall back to titleId.
netflix_ids["imdb_id"] = np.where(
    netflix_ids["tconst"].notna(),
    netflix_ids["tconst"],
    netflix_ids["titleId"],
)

In [48]:
# Count missing vs. present imdb_id values.
netflix_ids["imdb_id"].isnull().value_counts()

False    1826
Name: imdb_id, dtype: int64

This means we have all the IMDB IDs for all the titles in this final data frame. We will now remove columns that are not show, year, rating, imdb, rotten_tomatoes and imdb_id and merge again.

In [49]:
# Every column except the six we keep for the final frame.
to_drop = [
    col
    for col in netflix_ids.columns
    if col not in ("show", "year", "rating", "imdb", "rotten_tomatoes", "imdb_id")
]

In [50]:
# Remove the columns collected in to_drop.
netflix_ids = netflix_ids.drop(to_drop, axis="columns")

In [51]:
# Preview the first five rows of the trimmed frame.
netflix_ids.head(n=5)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id
0,Breaking Bad,2008,18+,9.5,96%,tt0903747
1,Stranger Things,2016,16+,8.8,93%,tt4574334
2,Sherlock,2010,16+,9.1,78%,tt1475582
3,Better Call Saul,2015,18+,8.7,97%,tt3032476
4,The Office,2005,16+,8.9,81%,tt0386676


Now we will merge all data from title basics

In [52]:
# Left-join the IMDB columns back in, matching imdb_id against tconst.
netflix_ids = pd.merge(
    netflix_ids,
    imdb,
    how="left",
    left_on="imdb_id",
    right_on="tconst",
)

In [53]:
# Preview the first five rows after the merge.
netflix_ids.head(n=5)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Breaking Bad,2008,18+,9.5,96%,tt0903747,tt0903747,tvSeries,Breaking Bad,Breaking Bad,0.0,2008.0,2008.0,49,"Crime,Drama,Thriller"
1,Stranger Things,2016,16+,8.8,93%,tt4574334,tt4574334,tvSeries,Stranger Things,Stranger Things,0.0,2016.0,2016.0,51,"Drama,Fantasy,Horror"
2,Sherlock,2010,16+,9.1,78%,tt1475582,tt1475582,tvSeries,Sherlock,Sherlock,0.0,2010.0,2010.0,88,"Crime,Drama,Mystery"
3,Better Call Saul,2015,18+,8.7,97%,tt3032476,tt3032476,tvSeries,Better Call Saul,Better Call Saul,0.0,2015.0,2015.0,46,"Crime,Drama"
4,The Office,2005,16+,8.9,81%,tt0386676,tt0386676,tvSeries,The Office,The Office,0.0,2005.0,2005.0,22,Comedy


In [54]:
# tconst was the join key for imdb_id in the merge, so it carries no extra information; drop it.
netflix_ids = netflix_ids.drop("tconst", axis="columns")

### 6. Export final data

In [55]:
# The export is currently disabled: as written, running this cell does not write any file.
# Uncomment the line below to save the final frame.
# netflix_ids.to_pickle("../Data/netflix_final.pkl")