# COGS 118B Final Project (Group RILS): Cleaning Code #

In [3]:
import pandas as pd
import numpy as np

## Load Data ##

In [4]:
df = pd.read_csv('movies.csv')
df.head()
df.shape[0]

722480

## Drop Irrelevant Columns ##

In [5]:
#not sure what else we want to drop 
columns_to_remove = ['overview', 'keywords', 'poster_path', 'backdrop_path']
df = df.drop(columns = columns_to_remove)
df.head()

Unnamed: 0,id,title,genres,original_language,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,recommendations
0,615656,Meg 2: The Trench,Action-Science Fiction-Horror,en,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,Released,Back for seconds.,7.079,1365.0,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,1006462-298618-569094-1061181-346698-1076487-6...
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,713704-296271-502356-1076605-1084225-1008005-9...
2,667538,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,en,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,Released,Unite or fall.,7.34,1007.0,Anthony Ramos-Dominique Fishback-Luna Lauren V...,496450-569094-298618-385687-877100-598331-4628...
3,640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,Released,Witness the beginning of a new dynasty.,6.507,2811.0,Paul Rudd-Evangeline Lilly-Jonathan Majors-Kat...,823999-676841-868759-734048-267805-965839-1033...
4,677179,Creed III,Drama-Action,en,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,Released,You can't run from your past.,7.262,1129.0,Michael B. Jordan-Tessa Thompson-Jonathan Majo...,965839-267805-943822-842942-1035806-823999-107...


## Drop Duplicates ##

In [6]:
df_no_duplicates = df.drop_duplicates()
df_no_duplicates.shape[0]

722462

## Drop Na Values ##

In [7]:
df_cleaned = df_no_duplicates.dropna(subset=['recommendations'])
df_cleaned.head()
df_cleaned.shape[0]

#this might be an issue but i figured data without recommendations wouldnt be needed?
#when i drop all Na values, the number drops to 14,000ish, might need new dataset

34894

## Split Genres and Recommendations Columns Up ##

In [8]:
# Split the 'genres' column by '-' and expand the result into separate columns
genres_split = df['genres'].str.split('-', expand=True)

# Truncate genres_split to include only the first three split columns
genres_split = genres_split.iloc[:, :5]

# Assign new column names to the truncated columns
genres_split.columns = ['genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5']

# Split the 'recommendations' column by '-' and expand the result into separate columns
recommendations_split = df['recommendations'].str.split('-', expand=True)

# Truncate recommendations_split to include only the first five split columns
recommendations_split = recommendations_split.iloc[:, :5]

# Assign new column names to the truncated columns
recommendations_split.columns = ['recommendation_1', 'recommendation_2', 'recommendation_3', 'recommendation_4', 'recommendation_5']

# Concatenate the original DataFrame with the new columns
df_cleaned = pd.concat([df, genres_split, recommendations_split], axis=1)

# Drop the original 'genres' and 'recommendations' columns
df_cleaned = df_cleaned.drop(columns=['genres', 'recommendations'])

df_cleaned.head()

Unnamed: 0,id,title,original_language,popularity,production_companies,release_date,budget,revenue,runtime,status,...,genre_1,genre_2,genre_3,genre_4,genre_5,recommendation_1,recommendation_2,recommendation_3,recommendation_4,recommendation_5
0,615656,Meg 2: The Trench,en,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,Released,...,Action,Science Fiction,Horror,,,1006462,298618,569094,1061181,346698
1,758323,The Pope's Exorcist,en,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,Released,...,Horror,Mystery,Thriller,,,713704,296271,502356,1076605,1084225
2,667538,Transformers: Rise of the Beasts,en,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,Released,...,Action,Adventure,Science Fiction,,,496450,569094,298618,385687,877100
3,640146,Ant-Man and the Wasp: Quantumania,en,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,Released,...,Action,Adventure,Science Fiction,,,823999,676841,868759,734048,267805
4,677179,Creed III,en,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,Released,...,Drama,Action,,,,965839,267805,943822,842942,1035806


In [9]:
genres_split2 = df['genres'].str.split('-', expand=True)

genres_split2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Action,Science Fiction,Horror,,,,,,,,,,,,,
1,Horror,Mystery,Thriller,,,,,,,,,,,,,
2,Action,Adventure,Science Fiction,,,,,,,,,,,,,
3,Action,Adventure,Science Fiction,,,,,,,,,,,,,
4,Drama,Action,,,,,,,,,,,,,,


In [10]:
# Alternate method for above to get OHE
genres_split3 = df['genres'].str.get_dummies(sep='-')

genres_split3

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722475,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
722476,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
722477,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
722478,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
from collections import Counter

In [12]:
# Doing this as a copy because I need to figure out what I'm doing
df_copy = df.copy()
df_copy['production_companies'].fillna('', inplace=True)

# Splitting the production companies by '-' and flattening the list
all_companies = [company for sublist in df_copy['production_companies'].apply(lambda x: str(x).split('-')) for company in sublist]

# Count occurrences of each company and filter for ones that occur at least 500 times
company_counts = Counter(all_companies)
unique_companies = [company for company, count in company_counts.items() if count >= 500]

# Was in here to help me debug but maybe useful still so I'm leaving it
print("Unique Companies:", unique_companies)
print("Number of Unique Companies:", len(unique_companies))

# Create one-hot encoding for the selected unique companies
for company in unique_companies:
    df_copy[company] = df_copy['production_companies'].apply(lambda x: 1 if company in x else 0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['production_companies'].fillna('', inplace=True)


Unique Companies: ['Warner Bros. Pictures', 'Paramount', 'Metro', 'Goldwyn', 'Mayer', 'Universal Pictures', 'Columbia Pictures', 'Walt Disney Pictures', '', 'Toei Company', '20th Century Fox', 'Shochiku', 'ARTE', 'SVT', 'ZDF', 'Canal+', 'SF Studios', 'RAI', 'CNC', 'Gaumont', 'TVE', 'Walt Disney Productions', 'Nikkatsu Corporation', 'Toho', 'Shaw Brothers', 'United Artists', 'World Wrestling Entertainment (WWE)', 'ARTE France Cinéma', 'France Télévisions', 'HBO', 'ORF', 'Viva Films', 'Mosfilm', 'WDR', 'Film', 'Hal Roach Studios', 'ARD', 'RKO Radio Pictures', 'ONF | NFB', 'DEFA', 'Filmové studio Barrandov', 'Soyuzmultfilm', 'BBC', 'BR', 'NDR', 'Daiei Film', 'National Geographic', 'Republic Pictures', 'Gorky Film Studios', 'Česká televize', 'Lenfilm', 'Fox Film Corporation', 'The Essanay Film Manufacturing Company', 'Universal Film Manufacturing Company', 'Monogram Pictures', 'Pathé Frères']
Number of Unique Companies: 56


In [13]:
df_copy.head()

Unnamed: 0,id,title,genres,original_language,popularity,production_companies,release_date,budget,revenue,runtime,...,National Geographic,Republic Pictures,Gorky Film Studios,Česká televize,Lenfilm,Fox Film Corporation,The Essanay Film Manufacturing Company,Universal Film Manufacturing Company,Monogram Pictures,Pathé Frères
0,615656,Meg 2: The Trench,Action-Science Fiction-Horror,en,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,...,0,0,0,0,0,0,0,0,0,0
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,...,0,0,0,0,0,0,0,0,0,0
2,667538,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,en,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,...,0,1,0,0,0,0,0,0,0,0
3,640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,...,0,0,0,0,0,0,0,0,0,0
4,677179,Creed III,Drama-Action,en,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,...,0,0,0,0,0,0,0,0,0,0
