# Summary

The supplementary IMDb datasets were downloaded directly from IMDb. Of the seven datasets available on IMDb, the two listed below complement my project and help expand on its main question.

[IMDb Data Files](https://datasets.imdbws.com/)

The columns I extracted for their usefulness are listed below:

* title.basics.tsv.gz
    * tconst (string) - alphanumeric unique identifier of the title
    * primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
    * isAdult (boolean) - 0: non-adult title; 1: adult title
    * runtimeMinutes – primary runtime of the title, in minutes
    * genres (string array) – up to three genres associated with the title
      
* title.ratings.tsv.gz
  
    * tconst (string) - alphanumeric unique identifier of the title
    * averageRating – weighted average of all the individual user ratings
    * numVotes - number of votes the title has received

# Imports

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os


# Load Datasets

In [2]:
# Directory that holds the raw IMDb exports
base_path = r"C:\Users\kimbe\Documents\StreamingAnalysis\data\raw_data"

# Read the basics and ratings tables with identical parser options;
# the literal "\N" token is parsed as a missing value (NaN)
basics_df, ratings_df = (
    pd.read_csv(os.path.join(base_path, file_name), na_values='\\N', low_memory=False)
    for file_name in ('imdb_basics.csv', 'imdb_ratings.csv')
)

# Custom Functions

Build a table with the column name, data type, a random non-null value, and the number of missing values for each column.

In [3]:
def summarize_columns(df, random_state=None):
    """Build a per-column overview of a DataFrame.

    For every column the summary holds the column name, its dtype, one
    randomly chosen non-null value taken from that column, and the number
    of missing values in it.

    Args:
        df: The DataFrame to summarize.
        random_state: Optional seed (or random generator) forwarded to
            ``Series.sample`` so the example values are reproducible.
            Defaults to None, which draws a fresh sample on every call.

    Returns:
        A DataFrame with the columns 'Column Name', 'Data Type',
        'Random Non-Null Value' and 'Missing Values Count', one row per
        column of ``df``. A column that has no non-null values at all gets
        None as its example value.
    """
    summary = []
    for column in df.columns:
        column_values = df[column]

        # Sample from this column's own non-null values. Requiring a row
        # that is complete across *all* columns raises ValueError when no
        # such row exists, and recomputing df.dropna() per column is costly.
        non_null_values = column_values.dropna()
        if non_null_values.empty:
            random_non_null_value = None
        else:
            random_non_null_value = non_null_values.sample(1, random_state=random_state).iloc[0]

        missing_values_count = column_values.isnull().sum()
        summary.append([column, column_values.dtype, random_non_null_value, missing_values_count])

    summary_df = pd.DataFrame(summary, columns=['Column Name', 'Data Type', 'Random Non-Null Value', 'Missing Values Count'])
    return summary_df

In [4]:
def print_unique_values(df, column_name):
    """Return the distinct values found in one column of a DataFrame.

    Despite the function's name, nothing is printed; the values are handed
    back to the caller.

    Args:
        df: The DataFrame to inspect.
        column_name: Label of the column whose distinct values are wanted.

    Returns:
        The result of ``df[column_name].unique()``.
    """
    return df[column_name].unique()

## Clean IMDb datasets

In [5]:
# Summarize every column of the basics table: dtype, a sample value, missing count
summary_table = summarize_columns(basics_df)
# Bare last expression so the notebook renders the table
summary_table

Unnamed: 0,Column Name,Data Type,Random Non-Null Value,Missing Values Count
0,tconst,object,tt0398567,0
1,titleType,object,tvSeries,0
2,primaryTitle,object,T.J. Hooker,19
3,originalTitle,object,Hayaller ve Hayatlar,19
4,isAdult,float64,0.0,1
5,startYear,float64,1990.0,1417203
6,endYear,float64,2004.0,11146045
7,runtimeMinutes,object,30,7715219
8,genres,object,"Drama,Thriller",500750


In [6]:
# Count the missing entries in each column of the basics table
missing_values = basics_df.isna().sum()
missing_values

tconst                   0
titleType                0
primaryTitle            19
originalTitle           19
isAdult                  1
startYear          1417203
endYear           11146045
runtimeMinutes     7715219
genres              500750
dtype: int64

In [7]:
# Inner-join basics and ratings on tconst (titles present in both tables),
# then rename the key, title and type columns in the same chain
imdb_info = basics_df.merge(ratings_df, on='tconst', how='inner').rename(
    columns={"tconst": "imdbId", "primaryTitle": "title", "titleType": "type"}
)
imdb_info.head()

Unnamed: 0,imdbId,type,title,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short",5.7,2106
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short",5.6,283
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0.0,1892.0,,5,"Animation,Comedy,Romance",6.5,2128
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short",5.4,182
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short",6.2,2859


In [8]:
# Count the missing entries in each column of the merged table
missing_values = imdb_info.isna().sum()
missing_values

imdbId                  0
type                    0
title                   1
originalTitle           1
isAdult                 0
startYear             246
endYear           1448495
runtimeMinutes     450199
genres              21466
averageRating           0
numVotes                0
dtype: int64

In [9]:
# Per-column tally of missing (NaN) entries in the merged table
nan_values = imdb_info.isnull().sum()

# Per-column tally of +inf and -inf entries; the two counts are added together
inf_values = imdb_info.eq(np.inf).sum() + imdb_info.eq(-np.inf).sum()

# Report both tallies
print(f"NaN values in each column:\n{nan_values}")
print(f"Infinite values in each column:\n{inf_values}")

NaN values in each column:
imdbId                  0
type                    0
title                   1
originalTitle           1
isAdult                 0
startYear             246
endYear           1448495
runtimeMinutes     450199
genres              21466
averageRating           0
numVotes                0
dtype: int64
Infinite values in each column:
imdbId            0
type              0
title             0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
averageRating     0
numVotes          0
dtype: int64


In [10]:
# Read the IMDb IDs that were saved during the platforms cleaning step
imdb_ids_path = r'C:\Users\kimbe\Documents\StreamingAnalysis\data\imdb_ids_in_platforms.csv'
imdb_ids_in_platforms = pd.read_csv(imdb_ids_path)

# Distinct IMDb IDs found in that file
imdb_ids_list = imdb_ids_in_platforms['imdbId'].unique()

# Keep only the imdb_info rows (built in an earlier cell) whose imdbId is
# among those IDs, and renumber the index from zero
filtered_imdb_info = imdb_info.loc[imdb_info['imdbId'].isin(imdb_ids_list)].reset_index(drop=True)

# Row counts before and after filtering
print(f"Original IMDb info rows: {len(imdb_info)}")
print(f"Filtered IMDb info rows: {len(filtered_imdb_info)}")

# Write the filtered table to disk (without the index) for later processing
filtered_imdb_info.to_csv(r'C:\Users\kimbe\Documents\StreamingAnalysis\data\filtered_imdb_info.csv', index=False)
print("Filtered IMDb info saved as CSV.")

Original IMDb info rows: 1506750
Filtered IMDb info rows: 89159
Filtered IMDb info saved as CSV.


In [None]:
# Filter the combined IMDb DataFrame to include only the imdbIds that are in the platforms
#filtered_imdb_info= imdb_info[imdb_info['imdbId'].isin(all_platform_imdb_ids)]

# Display the resulting DataFrame
#print(f"Filtered IMDb DataFrame Shape: {filtered_imdb_info.shape}")
#filtered_imdb_info.head()

In [None]:
# Check the minimum and maximum of IMDb's average ratings
#print(filtered_imdb_info['averageRating'].min())
#print(filtered_imdb_info['averageRating'].max())


In [None]:
# Drop isAdult as it is inaccurate.
#imdb_info = imdb_info.drop(columns=['isAdult'])

# Drop endYear 
#imdb_info = imdb_info.drop(columns=['endYear'])

# Drop originalTitle
#imdb_info = imdb_info.drop(columns=['originalTitle'])

# Drop runTime
#imdb_info = imdb_info.drop(columns=['runtimeMinutes'])

# Drop startYear
#imdb_info = imdb_info.drop(columns=['startYear'])