In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from utils import *

In [17]:
from config import ACCOUNT_NAME, ACCOUNT_KEY

In [None]:
import pandas as pd
from azure.storage.blob import BlobServiceClient

# Connection string assembled from the ACCOUNT_NAME / ACCOUNT_KEY values
# imported from config, so no credential is written into the notebook itself.
connection_string = f"DefaultEndpointsProtocol=https;AccountName={ACCOUNT_NAME};AccountKey={ACCOUNT_KEY};EndpointSuffix=core.windows.net"

# Define details for Blob 1 (title.basics.tsv.gz in the "movies" container)
container_name1 = "movies"
blob_name1 = "title.basics.tsv.gz"
download_path1 = "title.basics.tsv.gz"  # Local file name for blob 1

# Define details for Blob 2 (title.ratings.tsv.gz in the "movie-ratings" container)
container_name2 = "movie-ratings"
blob_name2 = "title.ratings.tsv.gz"
download_path2 = "title.ratings.tsv.gz"  # Local file name for blob 2

# Create the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Get container clients for both containers
container_client1 = blob_service_client.get_container_client(container_name1)
container_client2 = blob_service_client.get_container_client(container_name2)


def _download_blob_to_file(container_client, container_name, blob_name, download_path):
    """Stream one blob into a local file and report what was downloaded.

    Args:
        container_client: Container client the blob is fetched from.
        container_name: Name of that container (used only in the message).
        blob_name: Name of the blob inside the container.
        download_path: Local path the blob contents are written to.
    """
    with open(download_path, "wb") as local_file:
        # readinto() writes the download straight to the open file instead of
        # materialising the whole archive as one bytes object first.
        container_client.download_blob(blob_name).readinto(local_file)
    print(f"Downloaded '{blob_name}' from container '{container_name}' to '{download_path}'")


# Download Blob 1: title.basics.tsv.gz
_download_blob_to_file(container_client1, container_name1, blob_name1, download_path1)

# Download Blob 2: title.ratings.tsv.gz
_download_blob_to_file(container_client2, container_name2, blob_name2, download_path2)

# Load the downloaded files into pandas DataFrames.
# NOTE(review): no na_values is passed, so any custom missing-value marker in
# the files (IMDb dumps use the literal \N) stays a plain string - confirm
# that downstream cells expect this.
df_basics = pd.read_csv(download_path1, sep='\t', compression='gzip', low_memory=False)
df_ratings = pd.read_csv(download_path2, sep='\t', compression='gzip', low_memory=False)

print("First few rows of title.basics.tsv.gz:")
print(df_basics.head())

print("First few rows of title.ratings.tsv.gz:")
print(df_ratings.head())


Downloaded 'title.basics.tsv.gz' from container 'movies' to 'title.basics.tsv.gz'
Downloaded 'title.ratings.tsv.gz' from container 'movie-ratings' to 'title.ratings.tsv.gz'


In [None]:
# (rows, columns) of the title metadata frame loaded from title.basics.tsv.gz.
df_basics.shape

In [None]:
# (rows, columns) of the ratings frame loaded from title.ratings.tsv.gz.
df_ratings.shape

In [None]:
# Column names, dtypes and non-null counts for the title metadata.
df_basics.info()

In [None]:
# Column names, dtypes and non-null counts for the ratings data.
df_ratings.info()

In [None]:
# Summary statistics for the title metadata columns.
df_basics.describe()

In [None]:
# List the column labels available in the title metadata.
df_basics.columns

In [None]:
# Number of missing (NaN) values per column.
# NOTE(review): only values pandas parsed as NaN are counted; placeholder
# strings left in the data by read_csv are not - confirm this is intended.
df_basics.isnull().sum()

In [None]:
# Show the rows whose primaryTitle was read as missing.
df_basics[df_basics['primaryTitle'].isnull()]

In [None]:
# Discard every row that has a NaN in at least one column; df_basics itself
# is left untouched.
df_clean = df_basics.dropna(axis=0, how="any")

In [None]:
# Preview the frame that remains after dropping rows with missing values.
df_clean

In [None]:
# Derive a working frame whose startYear column holds real integers:
#   1. parse startYear as numbers (unparseable entries become NaN),
#   2. drop the rows where that parse failed,
#   3. cast the surviving values to int.
# df_clean is not modified.
df_copy = (
    df_clean
    .assign(startYear=pd.to_numeric(df_clean['startYear'], errors='coerce'))
    .dropna(subset=['startYear'])
    .astype({'startYear': int})
)

In [None]:
# Plot the distribution of release years.
#
# Every individual year in the range gets its own bar. Only every fifth year
# is labelled on the x-axis so the tick labels stay readable. Reindexing on a
# 5-year step would silently discard the counts of the four years in between.
#
# NOTE(review): df_copy is derived from df_clean, which has not been filtered
# by titleType at this point, so the bars count every title type.
tick_step = 5
years_range = np.arange(1874, 2032)  # 2032 is exclusive so this covers 1874 to 2031

# Titles per year, with years that have no titles filled in as 0.
year_counts = (
    df_copy['startYear']
    .value_counts()
    .sort_index()
    .reindex(years_range, fill_value=0)
)

# Plot the bar graph
fig, ax = plt.subplots(figsize=(20, 6))
year_counts.plot(kind='bar', width=0.8, ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies')
ax.set_title('Distribution of Movie Release Years (1874 - 2031)')

# Bars sit at integer positions 0..n-1; label every `tick_step`-th one.
tick_positions = np.arange(0, len(years_range), tick_step)
ax.set_xticks(tick_positions)
ax.set_xticklabels(years_range[tick_positions], rotation=90)

# Trim the half-bar of padding on either side for a clean view.
ax.set_xlim(-0.5, len(years_range) - 0.5)

fig.tight_layout()
plt.show()

In [None]:
# Keep only the rows whose titleType is 'movie'.
# .copy() makes the result an independent frame: the following cells add
# columns to df_filtered, and writing to a boolean-indexed slice of df_clean
# would otherwise raise SettingWithCopyWarning.
df_filtered = df_clean[df_clean['titleType'] == 'movie'].copy()

In [None]:
# Add a string-typed version of the genres column.
# assign() returns a new frame instead of writing into df_filtered in place,
# so the cell can be re-run safely and no SettingWithCopyWarning is raised
# when df_filtered is itself a filtered slice of another frame.
df_filtered = df_filtered.assign(genres_str=df_filtered['genres'].astype(str))

In [None]:
# Combine features (for instance, genres)
#
# Keep only rows that really carry a genre list. The check is made on the raw
# `genres` column: a stringified column can never be null, so testing it
# filters nothing.
# NOTE(review): assuming these are the IMDb dumps, missing values appear as
# the literal string \N (the files are read without na_values), so that
# marker is excluded alongside the empty string - confirm against the data.
has_genres = df_filtered['genres'].notna() & ~df_filtered['genres'].isin(['', r'\N'])

# .copy() so the new column below is written to an independent frame.
df_filtered = df_filtered.loc[has_genres].copy()

# "Action,Drama" -> "Action Drama": one whitespace-separated token per genre.
df_filtered['combined_features'] = (
    df_filtered['genres'].astype(str).str.replace(',', ' ', regex=False)
)

In [None]:
# Preview the movie-only frame, including the derived feature columns.
df_filtered

In [None]:
# Preview the ratings frame before it is joined onto the movies.
df_ratings

In [None]:
# Attach ratings to every movie. The left join keeps movies that have no row
# in df_ratings; their rating columns come back as NaN.
df_merged = pd.merge(df_filtered, df_ratings, on='tconst', how='left')

# Impute the rating columns for unrated movies consistently:
#   - averageRating -> mean rating of the rated movies
#   - numVotes      -> 0 (no votes were cast)
# Filling only averageRating would leave numVotes as NaN, and a NaN vote
# count turns any arithmetic based on it into NaN as well.
# NOTE(review): numVotes is assumed to be the vote-count column of
# df_ratings; fillna ignores the key if no such column exists.
mean_rating = df_merged['averageRating'].astype(float).mean()
df_merged = df_merged.fillna({'averageRating': mean_rating, 'numVotes': 0})

df_merged.head()

In [None]:
# Column dtypes and non-null counts after the merge and rating imputation.
df_merged.info()

In [None]:
# Parameters handed to compute_weighted_rating for every row.
# NOTE(review): compute_weighted_rating is not defined in this notebook
# (presumably it comes from the `utils` star import); the exact meaning of
# `m` should be confirmed there.
m = 1000
overall_mean = df_merged['averageRating'].mean()

# Row-wise evaluation: each row is passed as the first positional argument,
# followed by m and overall_mean.
df_merged['weightedRating'] = df_merged.apply(
    compute_weighted_rating,
    axis=1,
    args=(m, overall_mean),
)

In [None]:
# Check the first rows now that weightedRating has been added.
df_merged.head()

In [None]:
# Persist the merged frame to the working directory; the index is not
# written as a column.
df_merged.to_csv("df_merged.csv", index=False)

In [None]:
# Preview the merged frame that was just written to df_merged.csv.
df_merged

In [None]:
# Column dtypes and non-null counts of the merged frame, now including
# weightedRating.
df_merged.info()

In [None]:
# Make sure both text columns hold Python strings before any text processing.
for _text_col in ("primaryTitle", "genres_str"):
    df_merged[_text_col] = df_merged[_text_col].astype(str)