In [2]:
import pandas as pd
import numpy as np


## Importing IMDB Dataset

The dataset can be found here -> https://developer.imdb.com/non-commercial-datasets/

For this analysis, I am using two datasets from IMDB

    - Ratings dataset
    - Basic title info dataset
    
The goal is the match this dataset with my Netflix dataset and do a merge in order to expand my dataset for some cool analysis!

**title.basics.tsv.gz**

- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

**title.ratings.tsv.gz**

- tconst (string) - alphanumeric unique identifier of the title
- averageRating – weighted average of all the individual user ratings
- numVotes - number of votes the title has received


In [4]:
data_rating = pd.read_csv('/Users/pranavsukumaran/Desktop/Personal_dev/netflix/project/data/data-2.tsv', sep='\t')

data_basic = pd.read_csv('/Users/pranavsukumaran/Desktop/Personal_dev/netflix/project/data/data-3.tsv', sep='\t')





  data_basic = pd.read_csv('/Users/pranavsukumaran/Desktop/Personal_dev/netflix/project/data/data-3.tsv', sep='\t')


In [5]:
data_basic.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
data_rating.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2029
1,tt0000002,5.7,272
2,tt0000003,6.5,1968
3,tt0000004,5.4,178
4,tt0000005,6.2,2732


In [7]:
final_data = data_basic[['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes', 'genres']]

final_data

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,0,1894,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,0,1893,1,"Comedy,Short"
...,...,...,...,...,...,...,...
10584562,tt9916848,tvEpisode,Episode #3.17,0,2009,\N,"Action,Drama,Family"
10584563,tt9916850,tvEpisode,Episode #3.19,0,2010,\N,"Action,Drama,Family"
10584564,tt9916852,tvEpisode,Episode #3.20,0,2010,\N,"Action,Drama,Family"
10584565,tt9916856,short,The Wind,0,2015,27,Short


In [8]:
merged_data = pd.merge(final_data, data_rating, on='tconst')


In [9]:
merged_data

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,0,1894,1,"Documentary,Short",5.7,2029
1,tt0000002,short,Le clown et ses chiens,0,1892,5,"Animation,Short",5.7,272
2,tt0000003,short,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance",6.5,1968
3,tt0000004,short,Un bon bock,0,1892,12,"Animation,Short",5.4,178
4,tt0000005,short,Blacksmith Scene,0,1893,1,"Comedy,Short",6.2,2732
...,...,...,...,...,...,...,...,...,...
1406649,tt9916730,movie,6 Gunn,0,2017,116,Drama,7.0,12
1406650,tt9916766,tvEpisode,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV",7.1,23
1406651,tt9916778,tvEpisode,Escape,0,2019,\N,"Crime,Drama,Mystery",7.2,36
1406652,tt9916840,tvEpisode,Horrid Henry's Comic Caper,0,2014,11,"Adventure,Animation,Comedy",8.8,6


In [None]:
merged_data

In [11]:
netflix_data = pd.read_csv("/Users/pranavsukumaran/Desktop/Personal_dev/netflix/project/data/processed_data.csv")

In [12]:
merged_data['Title_lower'] = merged_data['primaryTitle'].str.lower()
netflix_data['Title_lower'] = netflix_data['Title'].str.lower()

In [13]:
imdb_max_votes = merged_data.loc[merged_data.groupby('Title_lower')['numVotes'].idxmax()]

In [14]:
imdb_max_votes

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,Title_lower
941768,tt2386381,tvSeries,!Next?,0,1994,\N,Documentary,5.0,22,!next?
867196,tt2071912,tvEpisode,!Que ve el Bisbe!,0,2011,\N,Comedy,6.2,12,!que ve el bisbe!
786393,tt1699720,movie,!Women Art Revolution,0,2010,83,Documentary,6.8,257,!women art revolution
944625,tt2399574,short,#,0,2012,15,"Comedy,Short",3.1,12,#
1149114,tt4724630,video,# My Ass,1,2015,143,Adult,6.8,9,# my ass
...,...,...,...,...,...,...,...,...,...,...
780813,tt1676214,short,Špansko the Continent,0,2009,11,"Action,Comedy,Drama",6.2,24,špansko the continent
734821,tt15392508,movie,Πέντε 5 Five,0,2023,135,"Drama,History,War",7.1,27,πέντε 5 five
122181,tt0180853,movie,Мужская компания,0,1992,70,"Action,Adventure",6.6,25,мужская компания
874764,tt21030032,tvSeries,【Oshi No Ko】,0,2023,24,"Animation,Drama,Fantasy",8.4,8003,【oshi no ko】


In [None]:
merged_data



In [15]:
#final_merged_data = pd.merge(netflix_data, merged_data, left_on='Title', right_on='Title', how='left')

final_merge = imdb_max_votes.merge(netflix_data, on="Title_lower", how="inner")

In [16]:
print(final_merge.columns)


Index(['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear',
       'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'Title_lower',
       'Profile Name', 'Duration', 'Title', 'Country', 'Type', 'Hour', 'Day',
       'Month', 'Year', 'Device Category'],
      dtype='object')


In [None]:
final_merge.to_csv('merged_data.csv', index=False)

In [30]:
def search_title(title, final_merge, merged_data, netflix_data,imdb_data):
    # Convert the title to lowercase for case-insensitive comparison
    title_lower = title.lower()

    # Search in final
    final_exists = final_merge['Title'].str.lower().str.contains(title_lower).any()

    # Search in merged_data
    merged_exists = merged_data['primaryTitle'].str.lower().str.contains(title_lower).any()
    netflix_exists = netflix_data['Title'].str.lower().str.contains(title_lower).any()
    imdb_exists = imdb_max_votes['primaryTitle'].str.lower().str.contains(title_lower).any()

    return {
        'title': title,
        'exists_in_finaldata': final_exists,
        'exists_in_merged_data': merged_exists,
        'exists_in_netflix_data': netflix_exists,
        'exists_in_imdb_data': imdb_exists
    }

# Example usage:
result = search_title("Jessica Jones", final_merge, merged_data,netflix_data, imdb_max_votes)
print(result)


{'title': 'Jessica Jones', 'exists_in_finaldata': False, 'exists_in_merged_data': True, 'exists_in_netflix_data': True, 'exists_in_imdb_data': True}


In [31]:
# Check "Jessica Jones" in imdb_max_votes
print(imdb_max_votes[imdb_max_votes['Title_lower'].str.contains("jessica jones", na=False)])

# Check "Jessica Jones" in netflix_data
print(netflix_data[netflix_data['Title_lower'].str.contains("jessica jones", na=False)])


            tconst  titleType                   primaryTitle isAdult  \
934253   tt2357547   tvSeries                  Jessica Jones       0   
1180062  tt5235104  tvEpisode  Jessica Jones & MCU Team Up?!       0   
1337023  tt8119772  tvEpisode      Marvel's Jessica Jones S2       0   

        startYear runtimeMinutes              genres  averageRating  numVotes  \
934253       2015             56  Action,Crime,Drama            7.9    225549   
1180062      2015             \N           Talk-Show            7.6         8   
1337023      2018             \N           Talk-Show            5.0         9   

                           Title_lower  
934253                   jessica jones  
1180062  jessica jones & mcu team up?!  
1337023      marvel's jessica jones s2  
     Profile Name  Duration                   Title        Country     Type  \
3238         Home  0.617778  Marvel's Jessica Jones  TH (Thailand)  TV Show   
3239         Home  0.873056  Marvel's Jessica Jones  TH (Thailan