In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats 
from util import *
# --- Load the raw table and discard the columns this notebook never reads ---
# "title" and "release_date" are deliberately NOT in this list: they are needed
# for de-duplication below and are dropped right after it.
# TODO: release_date might be kept as a feature later.
unused_columns = [
    "id",
    "status",
    "production_companies",
    "credits",
    "poster_path",
    "backdrop_path",
    "recommendations",
    "keywords",
    "tagline",
]
df = pd.read_csv("movies.csv").drop(columns=unused_columns)

# --- Row filtering ---
# Keep rows whose numeric fields are strictly positive and that have more than
# 10 votes.
has_valid_numbers = (
    df["budget"].gt(0)
    & df["revenue"].gt(0)
    & df["vote_count"].gt(10)
    & df["vote_average"].gt(0)
    & df["runtime"].gt(0)
)
df = (
    df[has_valid_numbers]
    # the same (title, release_date) pair is treated as the same movie
    .drop_duplicates(subset=["title", "release_date"])
    .drop(columns=["release_date", "title"])
    .dropna(subset=["genres", "runtime", "overview"])
)

# --- Collapse rare languages ---
# Only the 15 most frequent languages keep their own value; everything else
# (including missing values) becomes "other".
languages_to_keep = df["original_language"].value_counts().nlargest(15).index
is_common_language = df["original_language"].isin(languages_to_keep)
df["original_language"] = df["original_language"].where(is_common_language, "other")

# --- Label ---
# profit_margin() is not defined in this notebook (presumably provided by the
# `util` star import). It must be computed BEFORE budget is log-transformed.
df["profit_margin"] = profit_margin(df["budget"], df["revenue"])

# --- log(1 + x) transform of the selected numeric features ---
cols_to_adjust = ["popularity", "budget", "vote_count"]
df[cols_to_adjust] = np.log1p(df[cols_to_adjust])

# --- Trim the label, exponentiate it, and drop the column it was derived from ---
df = (
    df.loc[df["profit_margin"].gt(-5)]
    .assign(profit_margin=lambda kept: np.exp(kept["profit_margin"]))
    .drop(columns="revenue")
)
df.head()

In [None]:
# One histogram per numeric column of the cleaned frame.
df.hist()
# df.hist() leaves its figure as the current one; add extra horizontal and
# vertical spacing between its subplots.
plt.gcf().subplots_adjust(wspace=1, hspace=0.6)
plt.show()

In [None]:
# Min-max scale the numeric feature columns into [0, 1].
cols_to_min_max_normalize = ["popularity", "budget", "runtime", "vote_average", "vote_count"]

# Persist the per-column min and max so the scaling can be reproduced or
# inverted later.
# NOTE(review): "profit_margin" is not in the list above, so the label is not
# scaled here and its min/max are not part of min_maxs.csv.
# NOTE(review): this cell rescales `df` in place; re-running it without
# re-running the cleaning cell overwrites min_maxs.csv with the already-scaled
# 0/1 bounds.
mins = df[cols_to_min_max_normalize].min()
maxs = df[cols_to_min_max_normalize].max()
min_max_df = pd.DataFrame({"min": mins, "max": maxs})
min_max_df.to_csv("min_maxs.csv")

# Reuse the bounds computed above instead of recomputing them per column.
# A constant column has max == min; dividing by 0 would turn the whole column
# into NaN, so use a divisor of 1 there (the column then scales to all zeros).
ranges = (maxs - mins).replace(0, 1)
df[cols_to_min_max_normalize] = (df[cols_to_min_max_normalize] - mins) / ranges



In [None]:
# Encode the text/categorical columns. multihot_tf_idf and multihot_column are
# not defined in this notebook (presumably provided by the `util` star import);
# each receives the frame first and its result is fed to the next step.
df = (
    df.pipe(multihot_tf_idf, "overview")
    .pipe(multihot_column, "-", "genres")
    # one 0/1 integer indicator column per remaining language value
    .pipe(pd.get_dummies, columns=["original_language"], dtype=int)
)

In [None]:
# Split the prepared frame 80 / 10 / 10 into train / validation / test and
# write each part to disk.
# A fixed seed makes the split, and therefore the three CSV files, identical
# on every Restart & Run All; without it the rows in each file changed per run.
SPLIT_SEED = 42

# 80% for training, 20% held out
train, holdout = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
# split the held-out 20% evenly into validation and test
val, test = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

train.to_csv("train.csv", index=False)
val.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)

In [None]:
# Summary statistics of the training split.
train.describe()

In [None]:
# Distribution of the profit_margin column in the validation split.
val["profit_margin"].hist()

In [None]:
# (rows, columns) of the train, validation and test splits, in that order.
train.shape, val.shape, test.shape

In [None]:
# Summary statistics of profit_margin over the full frame (all splits together).
df["profit_margin"].describe()

In [None]:
# Re-load the raw file to look at the untransformed numeric columns.
# NOTE: this rebinds `test`; from here on it is the raw numeric frame, no
# longer the held-out split written to test.csv earlier.
raw_numeric_columns = ["budget", "revenue", "popularity", "vote_count", "vote_average", "runtime"]
test = pd.read_csv("movies.csv")[raw_numeric_columns].dropna()

# Same positivity / minimum-vote conditions as the main cleaning cell.
passes_filters = (
    test["budget"].gt(0)
    & test["revenue"].gt(0)
    & test["vote_count"].gt(10)
    & test["vote_average"].gt(0)
    & test["runtime"].gt(0)
)
test = test[passes_filters]

# Distribution of raw revenue.
test["revenue"].hist()

In [None]:
# Summary statistics of raw revenue. `test` here is the frame re-loaded from
# movies.csv in the previous cell, not the held-out split.
test["revenue"].describe()

In [None]:
# Histograms of every column of the re-loaded raw numeric frame.
test.hist()

In [None]:
# `df` no longer has a "revenue" column at this point (it is dropped in the
# cleaning cell), so indexing it raised KeyError on a top-to-bottom run.
# Plot from `test` instead: it was re-loaded from movies.csv above and still
# holds the raw, untransformed revenue and budget values.
test[["revenue", "budget"]].hist()