In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

# NOTE(review): absolute, machine-specific project root. The reads below build their
# file names as path + "Data/...", so this value must keep its trailing "/".
path = "C:/Users/Admin/Documents/ironhack/streaming_service_recommender/"

## Find missing IMDB IDs for HBO Data

#### Goals

- Find missing titles from original merge.
- Find missing titles on IMDB using fuzzywuzzy.

**NOTE: We will use the same process as in 03.1_a_netflix_missing_title_ids notebook**

### 1. Import data

In [2]:
hbo = pd.read_csv(path + "Data/hbo_shows.csv")

In [3]:
hbo_ids = pd.read_pickle(path + "Data/hbo_ids.pkl")

In [4]:
hbo_missing = pd.read_pickle(path + "Data/hbo_missing.pkl")

### 2. Compare the length of the original list vs. the new ones

We will compare the number of rows in the original HBO data frame with the sum of the rows in the missing-titles data frame and the HBO ids data frame.

In [5]:
len(hbo)

200

In [6]:
len(hbo_ids) + len(hbo_missing)

197

In [7]:
hbo["show"].nunique()

197

3 rows are missing, but the number of unique titles in the original data frame equals that sum. This means that there must be some repeated titles, which may be from different years. We will run a value count on the show column of the original HBO data frame.

In [8]:
hbo["show"].value_counts().head(3)

High Maintenance        2
The Electric Company    2
Rome                    2
Name: show, dtype: int64

We will look at each of them and add the entries that have no IMDB ID yet to the HBO missing data frame.

In [9]:
hbo[hbo["show"] == "The Electric Company"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
122,The Electric Company,1971,all,8.1,
156,The Electric Company,2009,,7.3,


In [10]:
hbo_ids[hbo_ids["show"] == "The Electric Company"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
107,The Electric Company,1971,all,8.1,,tt0066651,tvSeries,The Electric Company,The Electric Company,0.0,...,28,"Comedy,Family",,,,,,,,


In [11]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the equivalent.
# Only the 2009 entry is added: the 1971 one already has an IMDB id in hbo_ids.
hbo_missing = pd.concat([hbo_missing, hbo[(hbo["show"] == "The Electric Company") & (hbo["year"] == 2009)]])

In [12]:
hbo[hbo["show"] == "High Maintenance"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
50,High Maintenance,2016,18+,8.0,98%
111,High Maintenance,2012,18+,8.2,


In [13]:
hbo_ids[hbo_ids["show"] == "High Maintenance"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
48,High Maintenance,2016,18+,8.0,98%,tt6078096,tvSeries,High Maintenance,High Maintenance,0.0,...,30,"Comedy,Drama",,,,,,,,


In [14]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the equivalent.
# Only the 2012 entry is added: the 2016 one already has an IMDB id in hbo_ids.
hbo_missing = pd.concat([hbo_missing, hbo[(hbo["show"] == "High Maintenance") & (hbo["year"] == 2012)]])

In [15]:
hbo[hbo["show"] == "Rome"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
11,Rome,2005,18+,8.7,86%
181,Rome,2012,,5.8,


In [16]:
hbo_ids[hbo_ids["show"] == "Rome"]

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
11,Rome,2005,18+,8.7,86%,tt0384766,tvSeries,Rome,Rome,0.0,...,52,"Action,Drama,History",,,,,,,,


In [17]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the equivalent.
# Only the 2012 entry is added: the 2005 one already has an IMDB id in hbo_ids.
hbo_missing = pd.concat([hbo_missing, hbo[(hbo["show"] == "Rome") & (hbo["year"] == 2012)]])

In [18]:
len(hbo_missing) + len(hbo_ids)

200

Now the combined length of the two data frames matches the number of rows in the original TV shows data frame.

In [19]:
hbo_missing = hbo_missing.reset_index(drop=True)

### 3. Find missing titles using fuzzywuzzy

We will first create a list for missing shows.

In [20]:
# Plain Python list of the show titles that still need an IMDB id.
hbo_shows_missing = list(hbo_missing["show"])

In [21]:
len(hbo_shows_missing)

18

In [22]:
imdb = pd.read_pickle(path + "Data/imdb_tv_all.pkl")

In [23]:
# Plain Python list of every IMDB primary title; find_shows scans it for fuzzy matches.
imdb_titles = list(imdb["primaryTitle"])

We will use the same function as in the Netflix notebook. This time we will keep matches with a ratio of at least 80, since the HBO data frame is small enough to check the resulting matches by eye.

In [24]:
def find_shows(show, threshold=80, titles=None):
    """Return the IMDB title that best fuzzy-matches ``show``.

    The comparison is case-insensitive and uses ``fuzz.ratio``.

    Parameters
    ----------
    show : str
        Title to look up.
    threshold : int, optional
        Minimum ratio (inclusive) a candidate must reach to count as a match.
        Defaults to 80, the value that was previously hard-coded.
    titles : iterable of str, optional
        Candidate titles to search. Defaults to the notebook-level
        ``imdb_titles`` list, which keeps existing one-argument calls working.

    Returns
    -------
    str or None
        The candidate with the highest ratio at or above ``threshold``. On a
        tie the candidate that appears first wins. ``None`` if nothing matches.
    """
    candidates = imdb_titles if titles is None else titles
    # Lower-case the query once instead of on every iteration.
    target = show.lower()

    best_title, best_ratio = None, -1
    for title in candidates:
        # Missing titles (e.g. NaN) are not strings and have no .lower(); skip them.
        if not isinstance(title, str):
            continue

        ratio = fuzz.ratio(title.lower(), target)

        # Strict ">" on the running best keeps the first candidate on ties,
        # matching the previous stable descending sort without storing all matches.
        if ratio >= threshold and ratio > best_ratio:
            best_title, best_ratio = title, ratio

    return best_title

We will now create a new column with the matched imdb titles.

In [25]:
# Look up the best IMDB title for every missing show (None when nothing matches).
hbo_missing["imdb_titles"] = hbo_missing["show"].apply(find_shows)

In [26]:
hbo_missing

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles
0,Jonah From Tonga,2014,,7.1,80%,Jonah from Tonga
1,We Can Be Heroes: Finding The Australian of th...,2005,18+,8.1,,
2,Magnifica 70,2015,16+,7.8,,Magnífica 70
3,The Shop: Uninterrupted,2018,18+,6.6,,
4,Arliss,1996,,7.0,,
5,Epitaphs,2004,18+,7.9,,Epitaph
6,VICE News Tonight,2016,16+,8.3,,Vice News Tonight
7,When Shall We Kiss?,2011,,7.9,,When Will We Kiss
8,Todxs Nos,2020,,5.4,,Todxs Nós
9,El Pionero,2019,16+,7.3,,El Camionero


We will remove the null values for imdb titles, this accounts for 2% of the data.


In [27]:
# Keep only the shows for which a fuzzy match was found, then renumber the rows.
hbo_missing = hbo_missing.dropna(subset=["imdb_titles"]).reset_index(drop=True)

### 4. Merge IMDB IDs

We will still merge on the year, in order to be sure that we are not merging a remake or a similar title.

In [28]:
# Left join: every remaining HBO row is kept. Rows with no IMDB entry that shares both
# the matched title and the start year end up with NaN in the IMDB columns.
hbo_missing = hbo_missing.merge(imdb, how="left", left_on=["imdb_titles", "year"], right_on=["primaryTitle", "startYear"])

We will separate the missing titles that were not matched.

In [29]:
# Rows where the title+year merge found no IMDB entry (tconst is still null).
hbo_missing_2 = hbo_missing.loc[hbo_missing["tconst"].isna()].reset_index(drop=True)

In [30]:
hbo_missing_2

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Epitaphs,2004,18+,7.9,,Epitaph,,,,,,,,,
1,When Shall We Kiss?,2011,,7.9,,When Will We Kiss,,,,,,,,,
2,El Pionero,2019,16+,7.3,,El Camionero,,,,,,,,,
3,Sex On//,2015,18+,5.8,,Sex on //,,,,,,,,,
4,KREMEN,2017,,6.4,,Kremen,,,,,,,,,
5,SeNT,2017,,5.6,,Sent,,,,,,,,,
6,The Electric Company,2009,,7.3,,The Electric Company,,,,,,,,,


We will also remove these shows from the analysis: for some of them the year does not match, so we cannot verify that it is the same show, and others don't have a matching title at all.

In [31]:
# Keep only the rows that received an IMDB id from the merge.
hbo_missing = hbo_missing.dropna(subset=["tconst"])

In [32]:
hbo_missing = hbo_missing.reset_index(drop=True)

In [33]:
hbo_missing

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_titles,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Jonah From Tonga,2014,,7.1,80%,Jonah from Tonga,tt3431720,tvMiniSeries,Jonah from Tonga,Jonah from Tonga,0.0,2014.0,2014.0,30,Comedy
1,Magnifica 70,2015,16+,7.8,,Magnífica 70,tt4725820,tvSeries,Magnífica 70,Magnífica 70,0.0,2015.0,2015.0,60,Drama
2,VICE News Tonight,2016,16+,8.3,,Vice News Tonight,tt6329790,tvSeries,Vice News Tonight,Vice News Tonight,0.0,2016.0,2016.0,30,News
3,Todxs Nos,2020,,5.4,,Todxs Nós,tt11212828,tvSeries,Todxs Nós,Todxs Nós,0.0,2020.0,2020.0,\N,Drama
4,Destino Rusia,2018,,8.1,,Destino Rusia 2018,tt8367634,tvMiniSeries,Destino Rusia 2018,Destino Rusia 2018,0.0,2018.0,2018.0,30,"Documentary,Drama,Sport"
5,High Maintenance,2012,18+,8.2,,High Maintenance,tt2514438,tvSeries,High Maintenance,High Maintenance,0.0,2012.0,2012.0,30,Comedy
6,Rome,2012,,5.8,,Rome,tt2823184,tvSeries,Rome,Rome,0.0,2012.0,2012.0,30,"News,Sport,Talk-Show"


In [34]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
hbo_ids = pd.concat([hbo_ids, hbo_missing]).reset_index(drop=True)
# De-duplicate on the effective IMDB id (tconst, else titleId). drop_duplicates("tconst")
# treats all NaN keys as equal, which would collapse every titleId-only row into one.
effective_id = hbo_ids["tconst"].fillna(hbo_ids["titleId"])
hbo_ids = hbo_ids[effective_id.isna() | ~effective_id.duplicated()]

In [35]:
hbo_ids

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,tconst,titleType,primaryTitle,originalTitle,isAdult,...,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle,imdb_titles
0,Game of Thrones,2011,18+,9.3,89%,tt0944947,tvSeries,Game of Thrones,Game of Thrones,0.0,...,"Action,Adventure,Drama",,,,,,,,,
1,The Wire,2002,18+,9.3,94%,tt0306414,tvSeries,The Wire,The Wire,0.0,...,"Crime,Drama,Thriller",,,,,,,,,
2,Chernobyl,2019,18+,9.4,96%,tt7366338,tvMiniSeries,Chernobyl,Chernobyl,0.0,...,"Drama,History,Thriller",,,,,,,,,
3,The Sopranos,1999,18+,9.2,92%,tt0141842,tvSeries,The Sopranos,The Sopranos,0.0,...,"Crime,Drama",,,,,,,,,
4,Band of Brothers,2001,18+,9.4,94%,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,0.0,...,"Action,Drama,History",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,VICE News Tonight,2016,16+,8.3,,tt6329790,tvSeries,Vice News Tonight,Vice News Tonight,0.0,...,News,,,,,,,,,Vice News Tonight
185,Todxs Nos,2020,,5.4,,tt11212828,tvSeries,Todxs Nós,Todxs Nós,0.0,...,Drama,,,,,,,,,Todxs Nós
186,Destino Rusia,2018,,8.1,,tt8367634,tvMiniSeries,Destino Rusia 2018,Destino Rusia 2018,0.0,...,"Documentary,Drama,Sport",,,,,,,,,Destino Rusia 2018
187,High Maintenance,2012,18+,8.2,,tt2514438,tvSeries,High Maintenance,High Maintenance,0.0,...,Comedy,,,,,,,,,High Maintenance


We will now calculate the ratio of titles with an imdb id match and the original titles.

In [36]:
len(hbo_ids)/len(hbo)

0.93

This means that we will be using 93% of the original data.

### 5. Cleaning final data

We will create a final data frame including:
- show
- year
- rating
- imdb
- rotten_tomatoes
- imdb_id
- all data from title basics

First, we will need to create the imdb_id column, this will have tconst and titleId values.

In [37]:
# Prefer tconst; fall back to titleId only on rows where tconst is missing.
hbo_ids["imdb_id"] = hbo_ids["tconst"].where(hbo_ids["tconst"].notna(), hbo_ids["titleId"])

In [38]:
hbo_ids["imdb_id"].isna().value_counts()

False    186
Name: imdb_id, dtype: int64

This means we have all the IMDB IDs for all the titles in this final data frame. We will now remove columns that are not show, year, rating, imdb, rotten_tomatoes and imdb_id and merge again.

In [39]:
# Every column except the six kept for the final frame is marked for removal.
to_drop = hbo_ids.columns[
    ~hbo_ids.columns.isin(["show", "year", "rating", "imdb", "rotten_tomatoes", "imdb_id"])
].tolist()

In [40]:
hbo_ids = hbo_ids.drop(columns=to_drop)

In [41]:
hbo_ids.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id
0,Game of Thrones,2011,18+,9.3,89%,tt0944947
1,The Wire,2002,18+,9.3,94%,tt0306414
2,Chernobyl,2019,18+,9.4,96%,tt7366338
3,The Sopranos,1999,18+,9.2,92%,tt0141842
4,Band of Brothers,2001,18+,9.4,94%,tt0185906


Now we will merge all data from title basics

In [42]:
# Left join on the id: every hbo_ids row is kept and gets the imdb columns of the
# entry whose tconst equals its imdb_id.
hbo_ids = hbo_ids.merge(imdb, how="left", left_on="imdb_id", right_on="tconst")

In [43]:
hbo_ids.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Game of Thrones,2011,18+,9.3,89%,tt0944947,tt0944947,tvSeries,Game of Thrones,Game of Thrones,0,2011.0,2011.0,57,"Action,Adventure,Drama"
1,The Wire,2002,18+,9.3,94%,tt0306414,tt0306414,tvSeries,The Wire,The Wire,0,2002.0,2002.0,59,"Crime,Drama,Thriller"
2,Chernobyl,2019,18+,9.4,96%,tt7366338,tt7366338,tvMiniSeries,Chernobyl,Chernobyl,0,2019.0,2019.0,330,"Drama,History,Thriller"
3,The Sopranos,1999,18+,9.2,92%,tt0141842,tt0141842,tvSeries,The Sopranos,The Sopranos,0,1999.0,1999.0,55,"Crime,Drama"
4,Band of Brothers,2001,18+,9.4,94%,tt0185906,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,0,2001.0,2001.0,594,"Action,Drama,History"


### 6. Export final data

In [45]:
# hbo_ids.to_pickle(path + "Data/hbo_final.pkl")