## Setup

In [None]:
import pymongo # import the library
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import chi2_contingency

In [None]:
import os  # local import: only this cell needs it

# Connect to the MongoDB Atlas cluster and select the project database.
#
# SECURITY NOTE: the fallback URI below embeds a username and password in
# source. Set the MONGODB_URI environment variable to supply the connection
# string without committing credentials; the literal is kept only so the
# notebook behaves as before when the variable is not set.
connection_string = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://admin:1234@cluster0.lrq2u.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)

# Access your database
client = pymongo.MongoClient(connection_string)

# Send a ping to confirm a successful connection.
try:
    client.admin.command('ping')
    print("Pinged your deployment . You successfully connected to MongoDB !")
except Exception as e:
    # Best-effort connectivity check: report the failure and carry on so the
    # remaining cells can still be run (e.g. against the local database).
    print(e)

db = client['3675ProjectDB']  # connect to the database

In [None]:
# Alternative to the Atlas cell: point `client` and `db` at a MongoDB server
# on localhost. Running this cell rebinds both names.
client = MongoClient("mongodb://localhost:27017")
db = client.get_database('3675Project')

Table 1: Title.basics

In [None]:
# Table 1: title.basics (about 11.5 million rows).
# The file is tab-separated; read_table uses '\t' as its default separator.
path = r'3675IMDbData\title.basics.tsv.gz'
df1 = pd.read_table(path)

Table 2: Title.ratings

In [None]:
# Table 2: title.ratings, a tab-separated file (read_table defaults to '\t').
path = r'3675IMDbData\title.ratings.tsv.gz'
df2 = pd.read_table(path)

**Create a df that filters out all titles without ratings**

In [None]:
# Inner join on tconst keeps only the titles present in both tables,
# i.e. titles that have a ratings row.
merged_title_ratings_df = df1.merge(df2, how='inner', on='tconst')
print("Merged df length:", merged_title_ratings_df.shape[0])
# Merged df length: 1544438

Convert runtimeMinutes and startYear to numeric

In [None]:
# Coerce runtimeMinutes and startYear to numeric; values that cannot be
# parsed become NaN (errors="coerce").
# Assign the columns directly instead of via `.loc[:, col] = ...`: since
# pandas 2.0 the `.loc[:, col]` form writes into the existing column in place,
# so the column would keep its original dtype rather than adopting the
# numeric dtype produced by pd.to_numeric.
merged_title_ratings_df["runtimeMinutes"] = pd.to_numeric(merged_title_ratings_df["runtimeMinutes"], errors="coerce")
merged_title_ratings_df["startYear"] = pd.to_numeric(merged_title_ratings_df["startYear"], errors="coerce")

Filter out all non-movies

In [None]:
# Keep only the rows whose titleType is exactly "movie".
merged_title_ratings_df = merged_title_ratings_df.loc[
    merged_title_ratings_df["titleType"].eq("movie")
]
print('filtered df length:', merged_title_ratings_df.shape[0])
# 326678

Trim movies shorter than 60 mins and longer than 220 mins

In [None]:
# One dict per row, ready to be passed to insert_many.
data = merged_title_ratings_df.to_dict('records')

In [None]:
# Target collection for the joined movie/ratings rows
# (original note: had to run with local db).
collection = db.get_collection('Movies_Ratings_joined')
collection.insert_many(data)

In [None]:
# Remove documents outside the 60-220 minute runtime window:
# first those with runtimeMinutes < 60, then those with runtimeMinutes > 220.
for runtime_bound in ({"$lt": 60}, {"$gt": 220}):
    collection.delete_many({"runtimeMinutes": runtime_bound})

# Report how many documents remain in the collection.
print(collection.count_documents({}))
# 326678
# NOTE(review): this recorded count equals the pre-delete length printed by
# the movie-filter cell; it may be stale - confirm by re-running.

Create a list of valid tconsts: movies that have ratings and satisfy our runtimeMinutes constraints

In [None]:
# Fetch every document from Movies_Ratings_joined; find() returns a cursor.
cursor = db["Movies_Ratings_joined"].find()

# Materialise the cursor into a list of documents and load it into a DataFrame.
movies_list = list(cursor)
movies_df = pd.DataFrame(movies_list)

# Discard the MongoDB autogenerated '_id' column when it is present.
if '_id' in movies_df.columns:
    movies_df = movies_df.drop(columns='_id')


# Distinct tconst values of the documents that survived the trimming
# (these come from the Movies_Ratings_joined collection).
valid_tconsts = movies_df["tconst"].unique()

# Persist the ids as a one-column CSV file for use by later cells.
valid_tconsts_df = pd.DataFrame({"tconst": valid_tconsts})
valid_tconsts_df.to_csv("valid_tconsts.csv", index=False)


**Trim the DataFrames using valid_tconsts**

In [None]:
# CSV file of valid tconsts.
# Read it from the same relative location it is written to
# ("valid_tconsts.csv" in the working directory) instead of a user-specific
# absolute path, so the notebook also runs on other machines and always
# reads the file it just produced.
path = r'valid_tconsts.csv'
valid_tconsts_df = pd.read_csv(path)
valid_tconsts_df.head()

In [None]:
# Table 1: title.basics (about 11.5 million rows), tab-separated.
path = r'3675IMDbData\title.basics.tsv.gz'
df1 = pd.read_table(path)

# Keep only the rows whose tconst appears in the valid-tconst list.
d1_filtered = df1.loc[df1["tconst"].isin(valid_tconsts_df["tconst"])]
print("Filtered df length:", d1_filtered.shape[0])
# 308331

In [None]:
# Print a summary of the filtered title.basics frame for a quick sanity check.
d1_filtered.info()

In [None]:
# Store the filtered title.basics rows, one document per row.
collection = db.get_collection('trimmed_title_basics')
data = d1_filtered.to_dict('records')
collection.insert_many(data)

In [None]:
# Table 2: title.ratings, tab-separated.
path = r'3675IMDbData\title.ratings.tsv.gz'
df2 = pd.read_table(path)

# Keep only the ratings whose tconst appears in the valid-tconst list.
d2_filtered = df2.loc[df2["tconst"].isin(valid_tconsts_df["tconst"])]
print("Filtered df length:", d2_filtered.shape[0])
# 308331

In [None]:
# Store the filtered title.ratings rows, one document per row.
collection = db.get_collection('trimmed_title_ratings')
data = d2_filtered.to_dict('records')
collection.insert_many(data)

In [None]:
# Table 3: title.crew, tab-separated.
path = r'3675IMDbData\title.crew.tsv.gz'
df3 = pd.read_table(path)

# Keep only the crew rows whose tconst appears in the valid-tconst list.
d3_filtered = df3.loc[df3["tconst"].isin(valid_tconsts_df["tconst"])]
print("Filtered df length:", d3_filtered.shape[0])
# 308328

In [None]:
# Preview the first five rows of the filtered crew table.
d3_filtered.head(n=5)

In [None]:
# Store the filtered title.crew rows, one document per row.
collection = db.get_collection('trimmed_title_crew')
data = d3_filtered.to_dict('records')
collection.insert_many(data)

In [None]:
# Table 4: title.principals, tab-separated.
path = r'3675IMDbData\title.principals.tsv.gz'
df4 = pd.read_table(path)

# Keep only the principal rows whose tconst appears in the valid-tconst list.
d4_filtered = df4.loc[df4["tconst"].isin(valid_tconsts_df["tconst"])]
print("Filtered df length:", d4_filtered.shape[0])
# 4939507

In [None]:
# Preview the last five rows of the filtered principals table.
d4_filtered.tail(n=5)

In [None]:
# Show the distinct values of the category column.
print(pd.unique(d4_filtered["category"]))

**Create a collection of directors**

In [None]:
# Principal rows whose category is exactly "director".
directors_principle = d4_filtered.loc[d4_filtered["category"].eq("director")]
print('filtered df length:', directors_principle.shape[0])
# 332781

In [None]:
# Store the director rows, one document per row.
collection = db.get_collection('trimmed_directors_principals')
data = directors_principle.to_dict('records')
collection.insert_many(data)

**Create a collection of actors**

In [None]:
# Principal rows whose category is exactly "actor".
actor_principle = d4_filtered.loc[d4_filtered["category"].eq("actor")]
print('filtered df length:', actor_principle.shape[0])
# 1593654

In [None]:
# Store the actor rows, one document per row.
collection = db.get_collection('trimmed_actor_principals')
data = actor_principle.to_dict('records')
collection.insert_many(data)

**Create a collection of actresses**

In [None]:
# Principal rows whose category is exactly "actress".
actress_principle = d4_filtered.loc[d4_filtered["category"].eq("actress")]
print('filtered df length:', actress_principle.shape[0])
# 880676

In [None]:
# Store the actress rows, one document per row.
collection = db.get_collection('trimmed_actress_principals')
data = actress_principle.to_dict('records')
collection.insert_many(data)

**Filter title.akas by valid_tconsts**

In [None]:
# Reload the list of valid tconsts from the CSV in the working directory.
path = r'valid_tconsts.csv'
valid_tconsts_df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Load title.akas.tsv.gz (tab-separated).
# Note: this rebinds df3, which an earlier cell used for title.crew.
path = r'3675IMDbData\title.akas.tsv.gz'
df3 = pd.read_table(path)
print(df3.shape[0])
# 51456486

In [None]:
# Preview the first five rows of the akas table.
df3.head(n=5)

In [None]:
# Keep only the akas rows whose titleId is one of the valid tconsts.
df_filtered = df3.loc[df3["titleId"].isin(valid_tconsts_df["tconst"])]
print("Filtered df length:", df_filtered.shape[0])

In [None]:
# Preview the last five rows of the filtered akas table.
df_filtered.tail(n=5)