In [113]:
import pandas as pd
import numpy as np


## Importing IMDB Dataset

The dataset can be found here -> https://developer.imdb.com/non-commercial-datasets/

For this analysis, I am using two datasets from IMDB

    - Ratings dataset
    - Basic title info dataset
    
The goal is the match this dataset with my Netflix dataset and do a merge in order to expand my dataset for some cool analysis!

**title.basics.tsv.gz**

- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

**title.ratings.tsv.gz**

- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received


In [129]:
data_rating = pd.read_csv('/Users/pranavsukumaran/Desktop/netflix/project/data/data-2.tsv', sep='\t')

data_basic = pd.read_csv('/Users/pranavsukumaran/Desktop/netflix/project/data/data-3.tsv', sep='\t')





  data_basic = pd.read_csv('/Users/pranavsukumaran/Desktop/netflix/project/data/data-3.tsv', sep='\t')


In [130]:
data_basic.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [131]:
data_rating.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2029
1,tt0000002,5.7,272
2,tt0000003,6.5,1968
3,tt0000004,5.4,178
4,tt0000005,6.2,2732


In [132]:
final_data = data_basic[['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes', 'genres']]

final_data

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,0,1894,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,0,1893,1,"Comedy,Short"
...,...,...,...,...,...,...,...
10584562,tt9916848,tvEpisode,Episode #3.17,0,2009,\N,"Action,Drama,Family"
10584563,tt9916850,tvEpisode,Episode #3.19,0,2010,\N,"Action,Drama,Family"
10584564,tt9916852,tvEpisode,Episode #3.20,0,2010,\N,"Action,Drama,Family"
10584565,tt9916856,short,The Wind,0,2015,27,Short


In [133]:
merged_data = pd.merge(final_data, data_rating, on='tconst')


In [134]:
merged_data

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,0,1894,1,"Documentary,Short",5.7,2029
1,tt0000002,short,Le clown et ses chiens,0,1892,5,"Animation,Short",5.7,272
2,tt0000003,short,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance",6.5,1968
3,tt0000004,short,Un bon bock,0,1892,12,"Animation,Short",5.4,178
4,tt0000005,short,Blacksmith Scene,0,1893,1,"Comedy,Short",6.2,2732
...,...,...,...,...,...,...,...,...,...
1406649,tt9916730,movie,6 Gunn,0,2017,116,Drama,7.0,12
1406650,tt9916766,tvEpisode,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV",7.1,23
1406651,tt9916778,tvEpisode,Escape,0,2019,\N,"Crime,Drama,Mystery",7.2,36
1406652,tt9916840,tvEpisode,Horrid Henry's Comic Caper,0,2014,11,"Adventure,Animation,Comedy",8.8,6


In [103]:
merged_data

Unnamed: 0,tconst,titleType,primaryTitle,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,"Documentary,Short",5.7,2029
1,tt0000002,short,Le clown et ses chiens,"Animation,Short",5.7,272
2,tt0000003,short,Pauvre Pierrot,"Animation,Comedy,Romance",6.5,1968
3,tt0000004,short,Un bon bock,"Animation,Short",5.4,178
4,tt0000005,short,Blacksmith Scene,"Comedy,Short",6.2,2732
...,...,...,...,...,...,...
1406649,tt9916730,movie,6 Gunn,Drama,7.0,12
1406650,tt9916766,tvEpisode,Episode #10.15,"Family,Game-Show,Reality-TV",7.1,23
1406651,tt9916778,tvEpisode,Escape,"Crime,Drama,Mystery",7.2,36
1406652,tt9916840,tvEpisode,Horrid Henry's Comic Caper,"Adventure,Animation,Comedy",8.8,6


In [136]:
netflix_data = pd.read_csv("/Users/pranavsukumaran/Desktop/netflix/project/data/processed_data.csv")

In [137]:
merged_data['Title_lower'] = merged_data['primaryTitle'].str.lower()
netflix_data['Title_lower'] = netflix_data['Title'].str.lower()

In [138]:
imdb_max_votes = merged_data.loc[merged_data.groupby('Title_lower')['numVotes'].idxmax()]

In [139]:
imdb_max_votes

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,Title_lower
941768,tt2386381,tvSeries,!Next?,0,1994,\N,Documentary,5.0,22,!next?
867196,tt2071912,tvEpisode,!Que ve el Bisbe!,0,2011,\N,Comedy,6.2,12,!que ve el bisbe!
786393,tt1699720,movie,!Women Art Revolution,0,2010,83,Documentary,6.8,257,!women art revolution
944625,tt2399574,short,#,0,2012,15,"Comedy,Short",3.1,12,#
1149114,tt4724630,video,# My Ass,1,2015,143,Adult,6.8,9,# my ass
...,...,...,...,...,...,...,...,...,...,...
780813,tt1676214,short,Špansko the Continent,0,2009,11,"Action,Comedy,Drama",6.2,24,špansko the continent
734821,tt15392508,movie,Πέντε 5 Five,0,2023,135,"Drama,History,War",7.1,27,πέντε 5 five
122181,tt0180853,movie,Мужская компания,0,1992,70,"Action,Adventure",6.6,25,мужская компания
874764,tt21030032,tvSeries,【Oshi No Ko】,0,2023,24,"Animation,Drama,Fantasy",8.4,8003,【oshi no ko】


In [121]:
merged_data



Unnamed: 0,tconst,titleType,primaryTitle,genres,averageRating,numVotes,Title_lower
0,tt0000001,short,Carmencita,"Documentary,Short",5.7,2029,carmencita
1,tt0000002,short,Le clown et ses chiens,"Animation,Short",5.7,272,le clown et ses chiens
2,tt0000003,short,Pauvre Pierrot,"Animation,Comedy,Romance",6.5,1968,pauvre pierrot
3,tt0000004,short,Un bon bock,"Animation,Short",5.4,178,un bon bock
4,tt0000005,short,Blacksmith Scene,"Comedy,Short",6.2,2732,blacksmith scene
...,...,...,...,...,...,...,...
1406649,tt9916730,movie,6 Gunn,Drama,7.0,12,6 gunn
1406650,tt9916766,tvEpisode,Episode #10.15,"Family,Game-Show,Reality-TV",7.1,23,episode #10.15
1406651,tt9916778,tvEpisode,Escape,"Crime,Drama,Mystery",7.2,36,escape
1406652,tt9916840,tvEpisode,Horrid Henry's Comic Caper,"Adventure,Animation,Comedy",8.8,6,horrid henry's comic caper


In [140]:
#final_merged_data = pd.merge(netflix_data, merged_data, left_on='Title', right_on='Title', how='left')

final_merge = imdb_max_votes.merge(netflix_data, on="Title_lower", how="inner")

In [141]:
final_merge

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,Title_lower,Profile Name,Duration,Title,Country,Type,Hour,Day,Month,Year,Device Category
0,tt10311562,tvSeries,#BlackAF,0,2020,36,Comedy,6.8,5304,#blackaf,Priya,0.428333,#blackAF,TH (Thailand),TV Show,9,28,9,2020,Apple Mobile Devices
1,tt10311562,tvSeries,#BlackAF,0,2020,36,Comedy,6.8,5304,#blackaf,Priya,0.523889,#blackAF,TH (Thailand),TV Show,9,28,9,2020,Apple Mobile Devices
2,tt10311562,tvSeries,#BlackAF,0,2020,36,Comedy,6.8,5304,#blackaf,Priya,0.108611,#blackAF,TH (Thailand),TV Show,16,27,9,2020,Apple Mobile Devices
3,tt10311562,tvSeries,#BlackAF,0,2020,36,Comedy,6.8,5304,#blackaf,Priya,0.181111,#blackAF,TH (Thailand),TV Show,13,27,9,2020,Apple Mobile Devices
4,tt10311562,tvSeries,#BlackAF,0,2020,36,Comedy,6.8,5304,#blackaf,Priya,0.286944,#blackAF,TH (Thailand),TV Show,7,27,9,2020,Apple Mobile Devices
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11345,tt7239256,tvSeries,Zumbo's Just Desserts,0,2016,52,"Game-Show,Reality-TV",6.9,2323,zumbo's just desserts,Priya,0.243056,Zumbo's Just Desserts,GB (United Kingdom),TV Show,23,12,2,2019,Web Browsers
11346,tt7239256,tvSeries,Zumbo's Just Desserts,0,2016,52,"Game-Show,Reality-TV",6.9,2323,zumbo's just desserts,Priya,0.523056,Zumbo's Just Desserts,GB (United Kingdom),TV Show,22,12,2,2019,Web Browsers
11347,tt7239256,tvSeries,Zumbo's Just Desserts,0,2016,52,"Game-Show,Reality-TV",6.9,2323,zumbo's just desserts,Priya,0.125278,Zumbo's Just Desserts,GB (United Kingdom),TV Show,22,12,2,2019,Web Browsers
11348,tt7239256,tvSeries,Zumbo's Just Desserts,0,2016,52,"Game-Show,Reality-TV",6.9,2323,zumbo's just desserts,Priya,0.406667,Zumbo's Just Desserts,GB (United Kingdom),TV Show,21,12,2,2019,Web Browsers


In [142]:
unique_titles_by_profile = final_merge.groupby('Profile Name')['primaryTitle'].nunique()
print(unique_titles_by_profile)



Profile Name
Home      584
Pranav    602
Priya     225
Name: primaryTitle, dtype: int64


In [143]:
final_merge.to_csv('merged_data.csv', index=False)