In [1]:
import pandas as pd
import numpy as np
import joblib

## Clean Movie Info Data (rottentomatoes.com)

In [2]:
# Read the csv data output from the web scraper
df = pd.read_csv("movie_info_5000.csv")

In [3]:
# Define some functions for data cleaning
def clean_genre(x):
    return ",".join([j.strip() for j in str(x).split(",")])

def clean_theater_date(x):
    return str(x).split("\n")[0]

def clean_written_by(x):
    return ",".join([j.replace("\n","").strip() for j in str(x).split(",")])


In [4]:
# Rename the columns
new_columns = ["box_office", "director", "genre", "theater_date", "streaming_date", "rating", "runtime", "studio", "written_by", "audience_score", "critics_consensus", "title", "tomatometer", "tomatometer_count", "url", "user_rating_count"]
df.columns = new_columns

# Clean the data accordingly, convert string to appropriate data type (integer/date etc.)
df["box_office"] = df["box_office"].fillna(0).apply(lambda x: str(x).replace("$", "").replace(",",""))
df["box_office"] = pd.to_numeric(df["box_office"])
df["genre"].values[0].split(",")
df["genre"] = df["genre"].apply(lambda x: clean_genre(x))
df["theater_date"] = pd.to_datetime(df["theater_date"].apply(lambda x: clean_theater_date(x)))
df["streaming_date"] = pd.to_datetime(df['streaming_date'])
df["rating"] = df["rating"].apply(lambda x: str(x).split("(")[0].strip())
df["runtime"] = df["runtime"].apply(lambda x: str(x).replace("minutes", "").replace("nan", "").strip())
df["runtime"] = pd.to_numeric(df["runtime"])
df["written_by"] = df["written_by"].apply(lambda x: clean_written_by(x))
df["audience_score"] =  pd.to_numeric(df["audience_score"].apply(lambda x: str(x).replace("%", "")), errors = "coerce")
df["tomatometer"] =  pd.to_numeric(df["tomatometer"].apply(lambda x: str(x).replace("%", "")), errors = "coerce")
df["tomatometer_count"] = pd.to_numeric(df["tomatometer_count"].apply(lambda x: str(x).replace(",", "")), errors = "coerce")
df["user_rating_count"] = pd.to_numeric(df["user_rating_count"].apply(lambda x: str(x).replace(",", "")), errors = "coerce")

df.head()

Unnamed: 0,box_office,director,genre,theater_date,streaming_date,rating,runtime,studio,written_by,audience_score,critics_consensus,title,tomatometer,tomatometer_count,url,user_rating_count
0,0,Josh Cooley,"Animation,Comedy,Kids & Family,Science Fiction...",2019-06-21,2019-10-01,G,90.0,Disney/Pixar,"Andrew Stanton,Stephany Folsom",94.0,"Heartwarming, funny, and beautifully animated,...",Toy Story 4,97,402,https://www.rottentomatoes.com/m/toy_story_4,53096.0
1,0,James Franco,Drama,2019-10-04,2019-10-04,NR,95.0,Cleopatra Entertainment,Josh Boone,71.0,,Pretenders,22,9,https://www.rottentomatoes.com/m/pretenders,22.0
2,0,Jon Favreau,"Action & Adventure,Animation,Drama",2019-07-19,2019-10-11,PG,110.0,Walt Disney Pictures,Jeff Nathanson,88.0,While it can take pride in its visual achievem...,The Lion King,53,390,https://www.rottentomatoes.com/m/the_lion_king...,76190.0
3,0,Michel Ocelot,"Animation,Art House & International,Kids & Fam...",2019-10-04,2019-10-04,PG,94.0,,Michel Ocelot,,,Dilili in Paris (Dilili à Paris),60,20,https://www.rottentomatoes.com/m/dilili_in_paris,
4,0,Michael Dowse,"Action & Adventure,Comedy",2019-07-12,2019-10-01,R,105.0,20th Century Fox,Tripper Clancy,79.0,Though it makes a strong case for future colla...,Stuber,42,207,https://www.rottentomatoes.com/m/stuber,5388.0


In [5]:
# Output the data as csv (sep = ';')
df.to_csv("hive_movie_info_semisep.csv", sep=";", index = False)

## Clean Movie Reviews Data (rottentomatoes.com)

In [6]:
# Load the pkl reviews data (scrap from rottentomatoes.com)
reviews_data = joblib.load("movie_reviews_5000.pkl")

# Restructure the data into a dataframe
reviews_data_list = []
for key, val in reviews_data.items():
    for each_review in val:
        d = {"url_id": key, "review": each_review}
        reviews_data_list.append(d)
reviews_df = pd.DataFrame(reviews_data_list)
reviews_df.head()

Unnamed: 0,review,url_id
0,It is a bit of a reboot... If you can get past...,https://www.rottentomatoes.com/m/toy_story_4
1,While I'm still not sure if Toy Story 4 is who...,https://www.rottentomatoes.com/m/toy_story_4
2,...Toy Story 4 is beyond satisfying and tear-i...,https://www.rottentomatoes.com/m/toy_story_4
3,"While all of the ""Toy Story"" movies have dealt...",https://www.rottentomatoes.com/m/toy_story_4
4,The fourth entry in Pixar's first and best fra...,https://www.rottentomatoes.com/m/toy_story_4


In [7]:
# Output the data as csv (sep = ';')
reviews_df.to_csv("hive_movie_reviews_semisep.csv", sep=";", index = False)


## Clean Movie Box Office Data (boxofficemojo.com)

In [8]:
# load the box office data
box_df = pd.read_csv("movie_box_office_5000.csv")

In [9]:
# Define some functions to obtain specific data from strings
def get_opening_rank(x):
    try:
        return str(x).split("#")[1].split("rank")[0].strip().replace(",", "")
    except: 
        return None

def get_opening_theater(x):
    try:
        return str(x).split("rank,")[1].split("theater")[0].strip().replace(",", "")
    except:
        return None

def get_opening_average(x):
    try:
        return str(x).split("$")[1].split("average")[0].strip().replace(",", "")
    except:
        return None

In [10]:
# rename the columns
box_new_columns = ["close_date", "in_release", "widest_release", "domestic_gross", "foreign_gross", "opening_weekend", "opening_stats", "movie_title_key", "total_gross", "unknown", "limited_opening_weekend", "wide_opening_weekend"]
box_df.columns = box_new_columns

In [11]:
# Clean each columns accordingly
box_df["close_date"] = pd.to_datetime(box_df["close_date"])
box_df["in_release"] = pd.to_numeric(box_df["in_release"].fillna(0).apply(lambda x: str(x).split("days")[0].replace(",","")))
box_df["widest_release"] = pd.to_numeric(box_df["widest_release"].fillna(0).apply(lambda x: str(x).split("theaters")[0].replace(",", "")))
box_df["domestic_gross"] = pd.to_numeric(box_df["domestic_gross"].fillna(0).apply(lambda x: str(x).replace("$", "").replace(",","")), errors = "coerce")
box_df["foreign_gross"] = pd.to_numeric(box_df["foreign_gross"].fillna(0).apply(lambda x: str(x).replace("$", "").replace(",","")), errors = "coerce")
box_df["total_gross"] = pd.to_numeric(box_df["total_gross"].fillna(0).apply(lambda x: str(x).replace("$", "").replace(",","")), errors = "coerce")
box_df["opening_weekend"] = pd.to_numeric(box_df["opening_weekend"].fillna(0).apply(lambda x: str(x).replace("$", "").replace(",","")), errors = "coerce")
box_df["opening_rank"] = pd.to_numeric(box_df["opening_stats"].apply(lambda x: get_opening_rank(x)), errors = "coerce")
box_df["opening_theater"] = pd.to_numeric(box_df["opening_stats"].apply(lambda x: get_opening_theater(x)), errors = "coerce")
box_df["opening_average"] = pd.to_numeric(box_df["opening_stats"].apply(lambda x: get_opening_average(x)), errors = "coerce")
box_df["total_gross_calc"] = box_df["domestic_gross"] + box_df["foreign_gross"]
box_df["total_gross"] = box_df["total_gross"].fillna(box_df["total_gross_calc"])


In [12]:
# Find columns with at least 80% missing observations 
box_df.columns[box_df.isna().mean(axis =0) > 0.8]

Index(['unknown', 'limited_opening_weekend', 'wide_opening_weekend'], dtype='object')

In [13]:
# Drop columns that are not useful
box_df = box_df.drop(["opening_stats","total_gross_calc", "limited_opening_weekend", "wide_opening_weekend", "unknown"], axis = 1)
box_df.head()

Unnamed: 0,close_date,in_release,widest_release,domestic_gross,foreign_gross,opening_weekend,movie_title_key,total_gross,opening_rank,opening_theater,opening_average
0,2019-05-19,17,1,12042,0.0,5150.0,Quartet,0,71.0,1.0,5150.0
1,NaT,0,0,0,0.0,0.0,,0,,,
2,NaT,0,0,0,0.0,0.0,Spider-Man: Far From Home,0,,,
3,2018-11-18,17,17,113956,0.0,53000.0,Bodied,0,43.0,14.0,3786.0
4,NaT,0,0,0,0.0,0.0,,0,,,


In [14]:
# Output the data as csv (sep = ';')
box_df.to_csv("hive_movie_box_office_semisep.csv", sep=";", index = False)
