In [1]:
import os

import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (the frames below are wide).
pd.set_option("display.max_columns", None)

# Project root directory. Set the STREAMING_RECOMMENDER_PATH environment variable
# to run the notebook on another machine; otherwise the original location is used.
# os.path.join(..., "") guarantees a trailing separator, which is required because
# later cells build file names with `path + "Data/..."`.
path = os.path.join(
    os.environ.get(
        "STREAMING_RECOMMENDER_PATH",
        "C:/Users/Admin/Documents/ironhack/streaming_service_recommender/",
    ),
    "",
)

## Final Data Cleaning

#### Goals

- Check current data types
- Change data types if necessary
- Remove TV shows whose year doesn't match IMDB's start year (we will remove these rows because we cannot confirm that they are the same show, which keeps our analysis more accurate).

----- 

## Netflix

### 1. Import data

In [2]:
netflix = pd.read_pickle(path + "Data/netflix_final.pkl")

In [3]:
netflix.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Breaking Bad,2008,18+,9.5,96%,tt0903747,tvSeries,Breaking Bad,Breaking Bad,0.0,2008.0,2008.0,49,"Crime,Drama,Thriller"
1,Stranger Things,2016,16+,8.8,93%,tt4574334,tvSeries,Stranger Things,Stranger Things,0.0,2016.0,2016.0,51,"Drama,Fantasy,Horror"
2,Sherlock,2010,16+,9.1,78%,tt1475582,tvSeries,Sherlock,Sherlock,0.0,2010.0,2010.0,88,"Crime,Drama,Mystery"
3,Better Call Saul,2015,18+,8.7,97%,tt3032476,tvSeries,Better Call Saul,Better Call Saul,0.0,2015.0,2015.0,46,"Crime,Drama"
4,The Office,2005,16+,8.9,81%,tt0386676,tvSeries,The Office,The Office,0.0,2005.0,2005.0,22,Comedy


In [4]:
netflix.dtypes

show                object
year                 int64
rating              object
imdb               float64
rotten_tomatoes     object
imdb_id             object
titleType           object
primaryTitle        object
originalTitle       object
isAdult            float64
startYear          float64
endYear            float64
runtimeMinutes      object
genres              object
dtype: object

### 2. Check null and unique values

In [5]:
len(netflix)

1826

In [6]:
netflix.isna().sum()

show                  0
year                  0
rating              634
imdb                 85
rotten_tomatoes    1377
imdb_id               0
titleType            36
primaryTitle         36
originalTitle        36
isAdult              36
startYear            42
endYear              42
runtimeMinutes       36
genres               36
dtype: int64

In [7]:
netflix.nunique(axis=0)

show               1821
year                 48
rating                5
imdb                 69
rotten_tomatoes      74
imdb_id            1823
titleType             4
primaryTitle       1781
originalTitle      1781
isAdult               1
startYear            48
endYear              48
runtimeMinutes      128
genres              262
dtype: int64

From the previous cells we can see that:
- Most values are missing for rotten_tomatoes.
- isAdult has just one value.

We will drop these two columns, since we cannot get much information from them.


In [8]:
# rotten_tomatoes is mostly missing and isAdult holds a single value, so neither is kept.
netflix = netflix.drop(["rotten_tomatoes", "isAdult"], axis="columns")

### 3. Change data types

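- rating: We will remove the '+' sign and turn it into a number (it ends up as float64 rather than integer because the column has missing values)
- runtimeMinutes: we will change the type to numeric (also stored as float64)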

#### i. rating

In [9]:
# Remove the "+" sign from every age label ("18+" -> "18").
# str() is applied to every value, so missing ratings become the literal string "nan".
netflix["rating"] = netflix["rating"].map(lambda label: str(label).replace("+", ""))

In [10]:
netflix["rating"].value_counts()

nan    634
16     386
18     348
7      295
all    160
13       3
Name: rating, dtype: int64

Since rating now holds strings, we will convert the 'nan' strings to null and 'all' to 0, where 0 means that the series can be watched by all ages.

In [11]:
# Map the two non-numeric labels in one pass: "nan" -> None (missing), "all" -> 0.
# The two conditions never overlap, so nesting gives the same result as two steps.
netflix["rating"] = np.where(
    netflix["rating"] == "nan",
    None,
    np.where(netflix["rating"] == "all", 0, netflix["rating"]),
)

In [12]:
netflix["rating"].value_counts()

16    386
18    348
7     295
0     160
13      3
Name: rating, dtype: int64

In [13]:
# Parse the cleaned age labels as numbers; anything that cannot be parsed becomes NaN.
netflix["rating"] = netflix["rating"].pipe(pd.to_numeric, errors="coerce")

#### ii. runtimeMinutes

In [14]:
# runtimeMinutes is loaded with object dtype; values that cannot be parsed become NaN.
netflix["runtimeMinutes"] = netflix["runtimeMinutes"].pipe(pd.to_numeric, errors="coerce")

### 4. Rename rating and imdb columns

In [15]:
# "rating" holds the minimum viewer age and "imdb" holds the IMDb score; name them accordingly.
netflix = netflix.rename({"rating": "age", "imdb": "imdb_rating"}, axis="columns")

### 5. Check final data types

In [16]:
netflix.dtypes

show               object
year                int64
age               float64
imdb_rating       float64
imdb_id            object
titleType          object
primaryTitle       object
originalTitle      object
startYear         float64
endYear           float64
runtimeMinutes    float64
genres             object
dtype: object

### 6. Remove rows where year and startYear don't match

In [17]:
# Keep only rows whose listed year equals IMDb's startYear. Rows with a missing
# startYear never compare equal, so they are dropped as well.
netflix = (
    netflix
    .loc[lambda frame: frame["year"].eq(frame["startYear"])]
    .reset_index(drop=True)
)

netflix.head()

Unnamed: 0,show,year,age,imdb_rating,imdb_id,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres
0,Breaking Bad,2008,18.0,9.5,tt0903747,tvSeries,Breaking Bad,Breaking Bad,2008.0,2008.0,49.0,"Crime,Drama,Thriller"
1,Stranger Things,2016,16.0,8.8,tt4574334,tvSeries,Stranger Things,Stranger Things,2016.0,2016.0,51.0,"Drama,Fantasy,Horror"
2,Sherlock,2010,16.0,9.1,tt1475582,tvSeries,Sherlock,Sherlock,2010.0,2010.0,88.0,"Crime,Drama,Mystery"
3,Better Call Saul,2015,18.0,8.7,tt3032476,tvSeries,Better Call Saul,Better Call Saul,2015.0,2015.0,46.0,"Crime,Drama"
4,The Office,2005,16.0,8.9,tt0386676,tvSeries,The Office,The Office,2005.0,2005.0,22.0,Comedy


In [18]:
# Share of rows kept relative to 1915.
# NOTE(review): 1915 is presumably the row count of the original Netflix dataset
# (the frame loaded in this notebook had 1826 rows) — confirm and name this constant.
netflix.shape[0] / 1915

0.8553524804177546

We will be working with about 85.5% of our original data.

### 7. Export data

In [19]:
# netflix.to_pickle(path + "Data/netflix_final_clean.pkl")

------

## Amazon

### 1. Import data

In [20]:
amazon = pd.read_pickle(path + "Data/amazon_final.pkl")

In [21]:
amazon.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,The Wire,2002,18+,9.3,94%,tt0306414,tvSeries,The Wire,The Wire,0,2002.0,2002.0,59,"Crime,Drama,Thriller"
1,The Sopranos,1999,18+,9.2,92%,tt0141842,tvSeries,The Sopranos,The Sopranos,0,1999.0,1999.0,55,"Crime,Drama"
2,Band of Brothers,2001,18+,9.4,94%,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,0,2001.0,2001.0,594,"Action,Drama,History"
3,Vikings,2013,18+,8.6,93%,tt2306299,tvSeries,Vikings,Vikings,0,2013.0,2013.0,44,"Action,Adventure,Drama"
4,Mr. Robot,2015,18+,8.5,94%,tt4158110,tvSeries,Mr. Robot,Mr. Robot,0,2015.0,2015.0,49,"Crime,Drama,Thriller"


In [22]:
amazon.dtypes

show                object
year                 int64
rating              object
imdb               float64
rotten_tomatoes     object
imdb_id             object
titleType           object
primaryTitle        object
originalTitle       object
isAdult              int64
startYear          float64
endYear            float64
runtimeMinutes      object
genres              object
dtype: object

### 2. Check null and unique values

In [23]:
len(amazon)

1607

In [24]:
amazon.isna().sum()

show                  0
year                  0
rating              846
imdb                396
rotten_tomatoes    1383
imdb_id               0
titleType             0
primaryTitle          0
originalTitle         0
isAdult               0
startYear             8
endYear               8
runtimeMinutes        0
genres                0
dtype: int64

In [25]:
amazon.nunique(axis=0)

show               1579
year                 72
rating                5
imdb                 69
rotten_tomatoes      64
imdb_id            1607
titleType             4
primaryTitle       1578
originalTitle      1579
isAdult               2
startYear            72
endYear              72
runtimeMinutes      145
genres              279
dtype: int64

In [26]:
amazon["isAdult"].value_counts()

0    1606
1       1
Name: isAdult, dtype: int64

Since only one show has a true value for isAdult, we will drop this column as well and follow the same process as for Netflix.

In [27]:
# rotten_tomatoes is mostly missing and isAdult is true for a single row, so neither is kept.
amazon = amazon.drop(["rotten_tomatoes", "isAdult"], axis="columns")

### 3. Change data types


In [28]:
# Remove the "+" sign from every age label ("18+" -> "18").
# str() is applied to every value, so missing ratings become the literal string "nan".
amazon["rating"] = amazon["rating"].map(lambda label: str(label).replace("+", ""))

In [29]:
amazon["rating"].value_counts()

nan    846
7      210
16     195
all    180
18     175
13       1
Name: rating, dtype: int64

In [30]:
# Map the two non-numeric labels in one pass: "nan" -> None (missing), "all" -> 0.
# The two conditions never overlap, so nesting gives the same result as two steps.
amazon["rating"] = np.where(
    amazon["rating"] == "nan",
    None,
    np.where(amazon["rating"] == "all", 0, amazon["rating"]),
)

In [31]:
# Parse the cleaned age labels as numbers; anything that cannot be parsed becomes NaN.
amazon["rating"] = amazon["rating"].pipe(pd.to_numeric, errors="coerce")

In [32]:
# runtimeMinutes is loaded with object dtype; values that cannot be parsed become NaN.
amazon["runtimeMinutes"] = amazon["runtimeMinutes"].pipe(pd.to_numeric, errors="coerce")

### 4. Rename rating and imdb columns

In [33]:
# "rating" holds the minimum viewer age and "imdb" holds the IMDb score; name them accordingly.
amazon = amazon.rename({"rating": "age", "imdb": "imdb_rating"}, axis="columns")

### 5. Check final data types

In [34]:
amazon.dtypes

show               object
year                int64
age               float64
imdb_rating       float64
imdb_id            object
titleType          object
primaryTitle       object
originalTitle      object
startYear         float64
endYear           float64
runtimeMinutes    float64
genres             object
dtype: object

### 6. Remove rows where year and startYear don't match


In [35]:
# Keep only rows whose listed year equals IMDb's startYear. Rows with a missing
# startYear never compare equal, so they are dropped as well.
amazon = (
    amazon
    .loc[lambda frame: frame["year"].eq(frame["startYear"])]
    .reset_index(drop=True)
)

amazon.head()

Unnamed: 0,show,year,age,imdb_rating,imdb_id,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres
0,The Wire,2002,18.0,9.3,tt0306414,tvSeries,The Wire,The Wire,2002.0,2002.0,59.0,"Crime,Drama,Thriller"
1,The Sopranos,1999,18.0,9.2,tt0141842,tvSeries,The Sopranos,The Sopranos,1999.0,1999.0,55.0,"Crime,Drama"
2,Band of Brothers,2001,18.0,9.4,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,2001.0,2001.0,594.0,"Action,Drama,History"
3,Vikings,2013,18.0,8.6,tt2306299,tvSeries,Vikings,Vikings,2013.0,2013.0,44.0,"Action,Adventure,Drama"
4,Mr. Robot,2015,18.0,8.5,tt4158110,tvSeries,Mr. Robot,Mr. Robot,2015.0,2015.0,49.0,"Crime,Drama,Thriller"


In [36]:
# Share of rows kept relative to 2136.
# NOTE(review): 2136 is presumably the row count of the original Amazon dataset
# (the frame loaded in this notebook had 1607 rows) — confirm and name this constant.
amazon.shape[0] / 2136

0.6039325842696629

We will be working with about 60% of our original data.

### 7. Export data


In [37]:
# amazon.to_pickle(path + "Data/amazon_final_clean.pkl")

------

## HBO

### 1. Import data

In [38]:
hbo = pd.read_pickle(path + "Data/hbo_final.pkl")

In [39]:
hbo.head()

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes,imdb_id,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Game of Thrones,2011,18+,9.3,89%,tt0944947,tt0944947,tvSeries,Game of Thrones,Game of Thrones,0,2011.0,2011.0,57,"Action,Adventure,Drama"
1,The Wire,2002,18+,9.3,94%,tt0306414,tt0306414,tvSeries,The Wire,The Wire,0,2002.0,2002.0,59,"Crime,Drama,Thriller"
2,Chernobyl,2019,18+,9.4,96%,tt7366338,tt7366338,tvMiniSeries,Chernobyl,Chernobyl,0,2019.0,2019.0,330,"Drama,History,Thriller"
3,The Sopranos,1999,18+,9.2,92%,tt0141842,tt0141842,tvSeries,The Sopranos,The Sopranos,0,1999.0,1999.0,55,"Crime,Drama"
4,Band of Brothers,2001,18+,9.4,94%,tt0185906,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,0,2001.0,2001.0,594,"Action,Drama,History"


In [40]:
hbo.dtypes

show                object
year                 int64
rating              object
imdb               float64
rotten_tomatoes     object
imdb_id             object
tconst              object
titleType           object
primaryTitle        object
originalTitle       object
isAdult              int64
startYear          float64
endYear            float64
runtimeMinutes      object
genres              object
dtype: object

### 2. Check null and unique values

In [41]:
len(hbo)

186

In [42]:
hbo.isna().sum()

show                0
year                0
rating             33
imdb                5
rotten_tomatoes    76
imdb_id             0
tconst              0
titleType           0
primaryTitle        0
originalTitle       0
isAdult             0
startYear           1
endYear             1
runtimeMinutes      0
genres              0
dtype: int64

In [43]:
hbo.nunique(axis=0)

show               184
year                32
rating               4
imdb                45
rotten_tomatoes     42
imdb_id            186
tconst             186
titleType            3
primaryTitle       184
originalTitle      184
isAdult              1
startYear           34
endYear             34
runtimeMinutes      48
genres              70
dtype: int64

We will follow the same process as with Netflix.

In [44]:
# rotten_tomatoes has many missing values and isAdult holds a single value, so neither is kept.
# NOTE(review): tconst matches imdb_id in every previewed row — confirm whether it
# is a duplicate that should be dropped too, to keep the same schema as the other frames.
hbo = hbo.drop(["rotten_tomatoes", "isAdult"], axis="columns")

### 3. Change data types

In [45]:
# Remove the "+" sign from every age label ("18+" -> "18").
# str() is applied to every value, so missing ratings become the literal string "nan".
hbo["rating"] = hbo["rating"].map(lambda label: str(label).replace("+", ""))

In [46]:
hbo["rating"].value_counts()

18     115
nan     33
16      22
all      9
7        7
Name: rating, dtype: int64

In [47]:
# Map the two non-numeric labels in one pass: "nan" -> None (missing), "all" -> 0.
# The two conditions never overlap, so nesting gives the same result as two steps.
hbo["rating"] = np.where(
    hbo["rating"] == "nan",
    None,
    np.where(hbo["rating"] == "all", 0, hbo["rating"]),
)

In [48]:
# Parse the cleaned age labels as numbers; anything that cannot be parsed becomes NaN.
hbo["rating"] = hbo["rating"].pipe(pd.to_numeric, errors="coerce")

In [49]:
# runtimeMinutes is loaded with object dtype; values that cannot be parsed become NaN.
hbo["runtimeMinutes"] = hbo["runtimeMinutes"].pipe(pd.to_numeric, errors="coerce")

### 4. Rename rating and imdb columns

In [50]:
# "rating" holds the minimum viewer age and "imdb" holds the IMDb score; name them accordingly.
hbo = hbo.rename({"rating": "age", "imdb": "imdb_rating"}, axis="columns")

### 5. Check final data types

In [51]:
hbo.dtypes

show               object
year                int64
age               float64
imdb_rating       float64
imdb_id            object
tconst             object
titleType          object
primaryTitle       object
originalTitle      object
startYear         float64
endYear           float64
runtimeMinutes    float64
genres             object
dtype: object

### 6. Remove rows where year and startYear don't match


In [52]:
# Keep only rows whose listed year equals IMDb's startYear. Rows with a missing
# startYear never compare equal, so they are dropped as well.
hbo = (
    hbo
    .loc[lambda frame: frame["year"].eq(frame["startYear"])]
    .reset_index(drop=True)
)

hbo.head()

Unnamed: 0,show,year,age,imdb_rating,imdb_id,tconst,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres
0,Game of Thrones,2011,18.0,9.3,tt0944947,tt0944947,tvSeries,Game of Thrones,Game of Thrones,2011.0,2011.0,57.0,"Action,Adventure,Drama"
1,The Wire,2002,18.0,9.3,tt0306414,tt0306414,tvSeries,The Wire,The Wire,2002.0,2002.0,59.0,"Crime,Drama,Thriller"
2,Chernobyl,2019,18.0,9.4,tt7366338,tt7366338,tvMiniSeries,Chernobyl,Chernobyl,2019.0,2019.0,330.0,"Drama,History,Thriller"
3,The Sopranos,1999,18.0,9.2,tt0141842,tt0141842,tvSeries,The Sopranos,The Sopranos,1999.0,1999.0,55.0,"Crime,Drama"
4,Band of Brothers,2001,18.0,9.4,tt0185906,tt0185906,tvMiniSeries,Band of Brothers,Band of Brothers,2001.0,2001.0,594.0,"Action,Drama,History"


In [53]:
# Share of rows kept relative to 200.
# NOTE(review): 200 is presumably the row count of the original HBO dataset
# (the frame loaded in this notebook had 186 rows) — confirm and name this constant.
hbo.shape[0] / 200

0.86

We will be working with 86% of our original data.

### 7. Export data

In [54]:
# hbo.to_pickle(path + "Data/hbo_final_clean.pkl")