In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import pymongo # import the library
import seaborn as sns
from pymongo import MongoClient
from scipy.stats import chi2_contingency

In [None]:
# Read the MongoDB connection string from the environment so that credentials
# are never stored in the notebook (or in its version history).
# Example (shell):  export MONGODB_URI="mongodb+srv://<user>:<password>@<host>/..."
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to your MongoDB connection string."
    )

# Access your database
client = pymongo.MongoClient(connection_string)

# Send a ping to confirm a successful connection.
# A failure is only printed here; later cells that use `client` will then fail.
try:
    client.admin.command('ping')
    print("Pinged your deployment . You successfully connected to MongoDB !")
except Exception as e:
    print(e)

In [None]:
# Local alternative to the hosted cluster:
#   client = MongoClient("mongodb://localhost:27017")
DB_NAME = '3675ProjectDB'

# Handle to the project database; every collection below hangs off this object.
db = client[DB_NAME]

**Table 3: Title.Basics**

In [None]:
# table 3
# this contains 11.5 million rows
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a path separator on Windows only).
path = '3675IMDbData/title.basics.tsv.gz'

# na_values: IMDb writes the literal text \N for a missing field; load it as NaN
#            instead of keeping it as a string.
# low_memory=False: infer each column's dtype from the whole file rather than
#            chunk by chunk, which avoids mixed-type columns.
df3 = pd.read_csv(path, sep='\t', na_values='\\N', low_memory=False)

In [None]:
collection = db['Title_basics']

# Convert and insert in fixed-size slices so that only one batch of row
# dictionaries exists in memory at a time (the full table is ~11.5 million rows).
# An empty frame results in no insert call at all.
# Note: insert_many appends documents, so re-running this cell adds the rows again.
BATCH_SIZE = 100_000
for start in range(0, len(df3), BATCH_SIZE):
    batch = df3.iloc[start:start + BATCH_SIZE].to_dict(orient='records')
    collection.insert_many(batch)

**Table 4: Title.Crew**

In [None]:
# table 4
# this contains 11.5 million rows
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a path separator on Windows only).
path = '3675IMDbData/title.crew.tsv.gz'

# na_values: IMDb writes the literal text \N for a missing field; load it as NaN
#            instead of keeping it as a string.
# low_memory=False: infer each column's dtype from the whole file rather than
#            chunk by chunk, which avoids mixed-type columns.
df4 = pd.read_csv(path, sep='\t', na_values='\\N', low_memory=False)

In [None]:
collection = db['Title_crew']

# Convert and insert in fixed-size slices so that only one batch of row
# dictionaries exists in memory at a time (the full table is ~11.5 million rows).
# An empty frame results in no insert call at all.
# Note: insert_many appends documents, so re-running this cell adds the rows again.
BATCH_SIZE = 100_000
for start in range(0, len(df4), BATCH_SIZE):
    batch = df4.iloc[start:start + BATCH_SIZE].to_dict(orient='records')
    collection.insert_many(batch)

**Filter title.basics for rows where titleType is "movie"**

In [None]:
# this contains 708 thousand rows
# title.basics.tsv.gz
# Keep only the rows whose titleType is exactly "movie".
# The explicit .copy() makes movies_df an independent frame, so assigning to its
# columns later does not trigger pandas' SettingWithCopyWarning.
movies_df = df3[df3["titleType"] == "movie"].copy()

Convert runtimeMinutes and startYear to numeric values (entries that cannot be parsed become NaN)

In [None]:
# Parse both columns as numbers. errors="coerce" turns any value that cannot be
# parsed into NaN (a column containing NaN ends up as float, not int).
#
# assign() returns a new frame with the two columns replaced, so the numeric
# dtype is actually adopted. Writing through .loc[:, col] sets values in place,
# which can leave the column with its old object dtype and raises
# SettingWithCopyWarning when movies_df is a filtered slice.
movies_df = movies_df.assign(
    runtimeMinutes=pd.to_numeric(movies_df["runtimeMinutes"], errors="coerce"),
    startYear=pd.to_numeric(movies_df["startYear"], errors="coerce"),
)

Add movies_df to db

In [12]:
collection = db['Movies_only_basics']

# Convert and insert in fixed-size slices instead of one huge list of dicts:
# peak memory stays bounded, and an interrupted run leaves whole batches behind
# rather than stopping somewhere inside a single very large call.
# An empty frame results in no insert call at all.
# Note: insert_many appends documents, so re-running this cell adds the rows again.
BATCH_SIZE = 100_000
for start in range(0, len(movies_df), BATCH_SIZE):
    batch = movies_df.iloc[start:start + BATCH_SIZE].to_dict(orient='records')
    collection.insert_many(batch)

KeyboardInterrupt: 

In [13]:
# Drop all docs whose runtime is < 60 mins or > 220 mins.
MIN_RUNTIME_MINUTES = 60
MAX_RUNTIME_MINUTES = 220

# Bind the target explicitly: `collection` is reassigned by several cells, and a
# destructive delete must not depend on which of them happened to run last.
collection = db['Movies_only_basics']

collection.delete_many({"runtimeMinutes": {"$lt": MIN_RUNTIME_MINUTES}})
# Last expression of the cell, so its DeleteResult is what the notebook displays.
collection.delete_many({"runtimeMinutes": {"$gt": MAX_RUNTIME_MINUTES}})

DeleteResult({'n': 1646, 'electionId': ObjectId('7fffffff00000000000000da'), 'opTime': {'ts': Timestamp(1741969706, 10), 't': 218}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1741969706, 10), 'signature': {'hash': b'\x1a*\xf2c\ni\xa4f\x0f\xa6<\xce.\x06\xca\xb0\xb6;b\xe3', 'keyId': 7438644723994066953}}, 'operationTime': Timestamp(1741969706, 10)}, acknowledged=True)

**Join movies_df and title.ratings and add them to db**

In [None]:
# table 5
# this contains 1.5 million rows
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
path = '3675IMDbData/title.ratings.tsv.gz'
# IMDb writes the literal text \N for a missing field; load it as NaN.
df5 = pd.read_csv(path, sep='\t', na_values='\\N')

# Left join movies_df with title.ratings on tconst: every movie is kept, and
# movies without a rating get NaN in the rating columns.
# NOTE(review): this uses the in-memory movies_df, which has not had the
# 60-220 minute runtime filter applied (that delete ran against the MongoDB
# collection only) - confirm this is intended.
merged_ratings_movies_df = pd.merge(movies_df, df5, on="tconst", how="left")

collection = db['Movies&Ratings_basics']

# Convert and insert in fixed-size slices so that only one batch of row
# dictionaries exists in memory at a time.
# Note: insert_many appends documents, so re-running this cell adds the rows again.
BATCH_SIZE = 100_000
for start in range(0, len(merged_ratings_movies_df), BATCH_SIZE):
    batch = merged_ratings_movies_df.iloc[start:start + BATCH_SIZE].to_dict(orient='records')
    collection.insert_many(batch)

**Join title.crew on tconst with movies_df and add to db**

In [None]:
# NOTE: the original author marked this cell as "BAD CODE"; the reason was not
# recorded, so treat its result with care until that is clarified.

# Inner join on tconst: only movies that also have a row in title.crew survive.
merged_crew_movies_df = movies_df.merge(df4, how="inner", on="tconst")

# Store the joined rows, one document per row.
collection = db['Movies&Crew_basics']
merged_crew_movies_data = merged_crew_movies_df.to_dict('records')
collection.insert_many(merged_crew_movies_data)

**Ratings per genre**

Print all the different genres

In [None]:
movieCollection = db["Movies_only_basics"]

# The IMDb dataset stores a title's genres as one comma-separated string
# (e.g. "Comedy,Drama"), so distinct() on the raw field yields distinct
# *combinations*. Split each combination to obtain the individual genres.
genre_combinations = movieCollection.distinct("genres")
unique_genres = sorted({
    genre
    for combination in genre_combinations
    if isinstance(combination, str)      # skip NaN / non-text values
    for genre in combination.split(",")
    if genre != "\\N"                    # IMDb's marker for "no genre listed"
})

print("Unique genres: ", unique_genres)

Converting the Movies_only_basics collection to a df

In [None]:
# Query the collection - this returns a cursor over every document
cursor = db["Movies_only_basics"].find()

# Convert the cursor to a list of documents, then create a DataFrame
movies_list = list(cursor)
movies_df = pd.DataFrame(movies_list)

# Drop the MongoDB autogenerated '_id' column.
# (This previously referenced an undefined name, df_movies_only, which raises
# NameError on a fresh kernel; the frame built above is movies_df.)
if '_id' in movies_df.columns:
    movies_df = movies_df.drop('_id', axis=1)


Converting the Movies&Ratings_basics collection to a df

In [None]:
# Fetch every document of the joined movies/ratings collection.
cursor = db["Movies&Ratings_basics"].find()

# Materialise the cursor, then build the DataFrame from the documents.
movies_Ratings_list = [document for document in cursor]

# MongoDB adds an '_id' field to each document; remove that column when present.
movies_Ratings_df = pd.DataFrame(movies_Ratings_list).drop(columns='_id', errors='ignore')


**Central Tendency (Mean, Median, Mode) of Runtime**

In [None]:
# All statistics in this cell describe the same column, so pull it out once.
runtimes = movies_df["runtimeMinutes"]

# Arithmetic mean of the runtimes
mean_movies = runtimes.mean()
print("Mean runtime: ", mean_movies)

# Middle value of the runtimes
median_movies = runtimes.median()
print("Median runtime: ", median_movies)

# Most frequent runtime(s); mode() returns a Series because ties are possible
mode_movies = runtimes.mode()
print("Mode runtime: ", mode_movies)

# Longest runtime
max_movies = runtimes.max()
print("Max runtime: ", max_movies)

# Shortest runtime
min_movies = runtimes.min()
print("Min runtime: ", min_movies)

**Dispersion (Variance, Standard Deviation)**

In [None]:
# Both spread measures are computed on the runtime column.
runtime_minutes = movies_df["runtimeMinutes"]

# Variance of the runtimes (pandas default settings)
variance_movies = runtime_minutes.var()
print("Variance runtime: ", variance_movies)

# Standard deviation of the runtimes (pandas default settings)
std_dev_movies = runtime_minutes.std()
print("Standard Deviation runtime: ", std_dev_movies)

**Histogram**

In [None]:
# Histogram of movie runtimes in 10 equal-width bins.
# Missing runtimes (NaN) are removed first: with NaN present the automatic
# bin-range detection cannot find finite limits and the call fails.
known_runtimes = movies_df["runtimeMinutes"].dropna()

fig, ax = plt.subplots()
ax.hist(known_runtimes, bins=10, edgecolor='black')
ax.set_xlabel("Runtime (minutes)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Movie Runtimes")
plt.show()

**Box Plot**

In [None]:
# Horizontal box plot of the runtime column; seaborn returns the Axes it drew on.
runtime_ax = sns.boxplot(x=movies_df["runtimeMinutes"])
runtime_ax.set_title("Boxplot of Movie Runtimes")
plt.show()

# Reading the plot: the long dark line extending beyond Q3 is the upper whisker.
# It ends at the highest value that is not classed as an outlier.

**Scatter Plot**

In [None]:
# Take both axes from the same frame so that every point pairs one movie's
# rating with that same movie's runtime. Mixing movies_Ratings_df with movies_df
# combines two frames that are loaded from different collections, which either
# fails on a size mismatch or pairs values from unrelated rows.
# NOTE(review): Movies&Ratings_basics was built from the movies frame before the
# 60-220 minute filter was applied to the database - confirm whether the same
# filter should be applied here.
plt.scatter(movies_Ratings_df["averageRating"], movies_Ratings_df["runtimeMinutes"])
plt.xlabel("Rating")
plt.ylabel("Runtime (minutes)")
plt.title("Movie Rating Over Runtime")
plt.show()