In [None]:
# %pip installs into the environment of the running kernel; `!pip` would use
# whichever pip happens to be first on the shell PATH, which may be another one.
%pip install IMDbPY pandas matplotlib numpy kagglehub

## Load Dataset

In [None]:
import pandas as pd

# Read the gzip-compressed, tab-separated title.basics file into a frame.
# low_memory=False makes pandas parse the file in a single pass instead of in
# internal chunks, which avoids mixed-dtype columns.
df_title_basics = pd.read_csv(
    'Datasets/title.basics.tsv.gz',
    sep='\t',
    compression='gzip',
    low_memory=False,
)
print(df_title_basics.head())

In [None]:
# Number of rows loaded, before any filtering.
print(df_title_basics.shape[0])

In [None]:
# Parse startYear as a number; values that cannot be parsed become NaN.
df_title_basics['startYear'] = pd.to_numeric(df_title_basics['startYear'], errors='coerce')

# Keep titles that started in 1970 or later. NaN compares as False here, so
# rows whose year could not be parsed are dropped as well.
df_title_basics = df_title_basics.loc[df_title_basics['startYear'].ge(1970)]

print(df_title_basics.head())
print(len(df_title_basics))

### Remove titles without a runtime

In [None]:
# Drop rows whose runtimeMinutes holds the literal placeholder string \N.
df_title_basics = df_title_basics.loc[df_title_basics['runtimeMinutes'].ne('\\N')]

print(df_title_basics.head())
print(len(df_title_basics))

### Remove titles with empty genres

In [None]:

# Drop rows whose genres field holds the literal placeholder string \N.
df_title_basics = df_title_basics.loc[df_title_basics['genres'].ne('\\N')]

print(df_title_basics.head())
print(len(df_title_basics))

### Plot Genres

In [None]:
import matplotlib.pyplot as plt

# One row per (title, genre): split the comma-separated genres string into a
# list and unfold it. Every resulting row keeps the index label of its title.
df_genres = df_title_basics['genres'].str.split(',').explode().dropna()
df_genres.name = 'genre'

# How often each genre occurs, most frequent first.
genre_counts = df_genres.value_counts()

fig, ax = plt.subplots(figsize=(12, 8))
genre_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Occurrences of Each Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Number of Occurrences')
plt.show()

### Filter out rows where titleType is not 'movie'


In [None]:
# Keep only the rows whose titleType is exactly 'movie'.
df_title_basics = df_title_basics.loc[df_title_basics['titleType'].eq('movie')]

print(df_title_basics.head())
print(len(df_title_basics))


In [None]:
import matplotlib.pyplot as plt

# Same genre tally as the earlier plot, recomputed now that the frame has been
# restricted to titleType == 'movie'.
# One row per (title, genre): split the comma-separated genres string into a
# list and unfold it. Every resulting row keeps the index label of its title.
df_genres = df_title_basics['genres'].str.split(',').explode().dropna()
df_genres.name = 'genre'

# How often each genre occurs, most frequent first.
genre_counts = df_genres.value_counts()

# Plot the results. The title says "movies only" so this figure cannot be
# mistaken for the all-titles figure when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(12, 8))
genre_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Occurrences of Each Genre (movies only)')
ax.set_xlabel('Genre')
ax.set_ylabel('Number of Occurrences')
plt.show()

In [None]:
def _first_title_with_genre(genre_pattern):
    """Return a frame holding the first row whose genres match ``genre_pattern``.

    The pattern is handed to ``str.contains`` as-is; rows with a missing
    genres value count as non-matching. The result is empty when nothing
    matches.
    """
    matches = df_title_basics['genres'].str.contains(genre_pattern, na=False)
    return df_title_basics[matches].head(1)


# Show one sample title for each of three genres.
reality_tv_example = _first_title_with_genre('Reality-TV')
print("Reality-TV Example:")
print(reality_tv_example)

news_example = _first_title_with_genre('News')
print("\nNews Example:")
print(news_example)

gameshow_example = _first_title_with_genre('Game-Show')
print("\nGame-Show Example:")
print(gameshow_example)

In [None]:
# Drop the two columns that are not used further on.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second run would otherwise raise KeyError.
df_title_basics = df_title_basics.drop(columns=['endYear', 'originalTitle'], errors='ignore')
print(df_title_basics.head())
print(df_title_basics.columns)

## Adding Movie Descriptions

In [1]:
import pandas as pd
from pathlib import Path

# Resume from the saved checkpoint when it exists.
# NOTE(review): no cell visible here writes this path (the export cell writes
# 'df_title_basics.csv' in the working directory), so on a fresh run the file
# may be absent. In that case keep the frame built by the cells above instead
# of failing, and only raise when there is nothing to continue with.
_checkpoint_path = Path('Datasets/df_title_basics.csv')
if _checkpoint_path.exists():
    df_title_basics = pd.read_csv(_checkpoint_path)
elif 'df_title_basics' in globals():
    print(f"{_checkpoint_path} not found; keeping the df_title_basics already in memory")
else:
    raise FileNotFoundError(
        f"{_checkpoint_path} not found and no df_title_basics in memory; "
        "run the cells above to build the frame first."
    )

In [None]:
import os

# Logical CPU count as reported by the OS (None when it cannot be determined).
max_threads = os.cpu_count()
print(f"Maximum number of threads you can deploy: {max_threads}")

In [None]:
from imdb import IMDb
import concurrent.futures

# Shared state for the description fetch below.
max_threads = 24  # worker-pool size; replaces any value assigned to max_threads earlier
i = 0  # running count of successful lookups, updated by fetch_movie_description
ia = IMDb()  # client object used for every lookup
import logging
import threading

# Serialises updates to the shared progress counter `i`. This function runs on
# the worker threads of a ThreadPoolExecutor, and an unguarded `i += 1` is a
# read-modify-write that can lose updates.
_progress_lock = threading.Lock()


# Function to fetch movie description
def fetch_movie_description(tconst):
    """Return the plot outline for one title, or a placeholder on failure.

    Args:
        tconst: Title identifier string. Its first two characters are
            stripped before the lookup (presumably the 'tt' prefix; confirm
            against the dataset).

    Returns:
        The value stored under 'plot outline' on the fetched movie, or the
        string 'No description available' when that key is missing or the
        lookup raised.

    Side effects:
        Increments the module-level counter ``i`` and prints its new value
        after each successful lookup.
    """
    global i
    try:
        movie = ia.get_movie(tconst[2:])
        with _progress_lock:
            i += 1
            completed = i
        print(completed)
        return movie.get('plot outline', 'No description available')
    except Exception:
        # Best effort: one failed lookup must not abort the whole batch. The
        # cause is logged at debug level instead of being discarded.
        logging.getLogger(__name__).debug(
            "Description lookup failed for %r", tconst, exc_info=True
        )
        return 'No description available'

# Function to fetch descriptions for a specific set of entries using multithreading
def fetch_descriptions_for_set(df, fetch_fn=None, workers=None):
    """Return a copy of ``df`` with a 'description' column added.

    Each value of ``df['tconst']`` is looked up on a thread pool. Results are
    stored in row order.

    Args:
        df: Frame with a 'tconst' column. It is not modified. The caller
            passes ``iloc`` slices, and assigning a column on such a slice
            can raise SettingWithCopyWarning.
        fetch_fn: Callable mapping one tconst to its description. Defaults
            to the module-level ``fetch_movie_description``.
        workers: Thread-pool size. Defaults to the module-level
            ``max_threads``.

    Returns:
        A new DataFrame equal to ``df`` plus the 'description' column.
    """
    # Defaults are resolved at call time so the module-level names only need
    # to exist when they are actually used.
    if fetch_fn is None:
        fetch_fn = fetch_movie_description
    if workers is None:
        workers = max_threads

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map yields results in input order, so they line up with the rows.
        descriptions = list(executor.map(fetch_fn, df['tconst']))

    return df.assign(description=descriptions)

# Divide the dataframe into consecutive row slices, at most max_threads of them.
# The slices are processed one after another (each on its own thread pool), so
# they serve as progress checkpoints.
chunk_size = len(df_title_basics) // max_threads + 1
df_title_basics_chunks = [
    # `start` rather than `i`: `i` is the name of the global progress counter.
    df_title_basics.iloc[start:start + chunk_size]
    for start in range(0, len(df_title_basics), chunk_size)
]

# Fetch descriptions for each chunk
df_title_basics_subsets = []
for chunk in df_title_basics_chunks:
    df_subset = fetch_descriptions_for_set(chunk)
    df_title_basics_subsets.append(df_subset)
    print(f"Completed fetching descriptions for chunk {len(df_title_basics_subsets)}/{len(df_title_basics_chunks)}")

# Combine all subsets back into a single dataframe.
# An empty input frame yields no chunks, and pd.concat raises ValueError on an
# empty list. In that case only add an empty 'description' column.
if df_title_basics_subsets:
    df_title_basics = pd.concat(df_title_basics_subsets)
else:
    df_title_basics = df_title_basics.assign(description=pd.Series(dtype='object'))

print(df_title_basics.head())


In [13]:
# Write the frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): this writes to the working directory, while the reload cell
# reads 'Datasets/df_title_basics.csv'. Confirm which path is intended.
df_title_basics.to_csv(path_or_buf='df_title_basics.csv', index=False)

In [None]:
# Show the first five rows as the cell's rich output.
df_title_basics.head(n=5)

In [None]:
# Count the titles whose description is the empty string and those carrying
# the placeholder text. int() turns the boolean sums into plain ints.
description_column = df_title_basics['description']
empty_descriptions_count = int(description_column.eq('').sum())
no_description_available_count = int(description_column.eq('No description available').sum())

print(f"Number of movies with empty description: {empty_descriptions_count}")
print(f"Number of movies with 'No description available': {no_description_available_count}")
print(f"Total number of movies with empty or no description available: {empty_descriptions_count + no_description_available_count}")