# Data Cleaning and Feature Engineering (Part 1)
### Clean and Extract Features from Rotten Tomatoes Movie Info Data (rottentomatoes.com)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the semicolon-separated movie info CSV exported from Hive.
# The file is addressed by a bare relative name so it resolves against the
# current working directory on every OS; a Windows-style ".\" prefix would be
# treated as part of the filename on Linux/macOS and the read would fail.
df = pd.read_csv("hive_movie_info_semisep.csv", sep=";")
df.head()

Unnamed: 0,box_office,director,genre,theater_date,streaming_date,rating,runtime,studio,written_by,audience_score,critics_consensus,title,tomatometer,tomatometer_count,url,user_rating_count
0,0,Josh Cooley,"Animation,Comedy,Kids & Family,Science Fiction...",2019-06-21,2019-10-01,G,90.0,Disney/Pixar,"Andrew Stanton,Stephany Folsom",94.0,"Heartwarming, funny, and beautifully animated,...",Toy Story 4,97,402,https://www.rottentomatoes.com/m/toy_story_4,53096.0
1,0,James Franco,Drama,2019-10-04,2019-10-04,NR,95.0,Cleopatra Entertainment,Josh Boone,71.0,,Pretenders,22,9,https://www.rottentomatoes.com/m/pretenders,22.0
2,0,Jon Favreau,"Action & Adventure,Animation,Drama",2019-07-19,2019-10-11,PG,110.0,Walt Disney Pictures,Jeff Nathanson,88.0,While it can take pride in its visual achievem...,The Lion King,53,390,https://www.rottentomatoes.com/m/the_lion_king...,76190.0
3,0,Michel Ocelot,"Animation,Art House & International,Kids & Fam...",2019-10-04,2019-10-04,PG,94.0,,Michel Ocelot,,,Dilili in Paris (Dilili à Paris),60,20,https://www.rottentomatoes.com/m/dilili_in_paris,
4,0,Michael Dowse,"Action & Adventure,Comedy",2019-07-12,2019-10-01,R,105.0,20th Century Fox,Tripper Clancy,79.0,Though it makes a strong case for future colla...,Stuber,42,207,https://www.rottentomatoes.com/m/stuber,5388.0


### Check the rows with null values in 'genre' column

In [3]:
# Display every row whose 'genre' value is missing
df[df["genre"].isna()]

Unnamed: 0,box_office,director,genre,theater_date,streaming_date,rating,runtime,studio,written_by,audience_score,critics_consensus,title,tomatometer,tomatometer_count,url,user_rating_count
141,0,Justin Copeland,,,2019-07-20,PG-13,81.0,,"Jeph Loeb,Jim Lee",,,Batman: Hush,88,17,https://www.rottentomatoes.com/m/batman_hush,
279,0,,,,2018-10-02,PG-13,72.0,,Peter J. Tomasi,87.0,,The Death of Superman,92,13,https://www.rottentomatoes.com/m/the_death_of_...,381.0
367,0,Kunihiko Yuyama,,2017-11-05,2018-02-13,NR,120.0,,,64.0,,Pokémon The Movie: I Choose You!,33,9,https://www.rottentomatoes.com/m/pokemon_the_m...,717.0
840,0,Ryan Bellgardt,,,2018-06-12,NR,86.0,High Octane Pictures,Ryan Merriman,41.0,,The Jurassic Games,80,5,https://www.rottentomatoes.com/m/the_jurassic_...,95.0
958,0,Jesse V. Johnson,,,2018-06-05,NR,97.0,Compound B,"Jesse V. Johnson,Stu Small",46.0,,The Debt Collector,80,5,https://www.rottentomatoes.com/m/the_debt_coll...,58.0
1205,0,Rick Morales,,,2017-10-17,PG,72.0,DC Entertainment,"Michael Jelenic,James Tucker",60.0,,Batman vs. Two-Face,100,8,https://www.rottentomatoes.com/m/batman_vs_two...,171.0
1238,0,Sam Liu,,,2018-04-10,R,91.0,DC Entertainment,"Len Wein,John Byrne,Alan Burnett",69.0,,Suicide Squad: Hell to Pay,88,8,https://www.rottentomatoes.com/m/suicide_squad...,349.0
1585,0,Kenny Ortega,,,2017-08-15,NR,30.0,,,72.0,,Descendants 2,67,6,https://www.rottentomatoes.com/m/descendants_2,827.0
1954,0,Rick Morales,,,2016-11-01,PG,72.0,Warner Bros. Animation,"William Dozier,Bill Finger,Michael Jelenic,Jam...",67.0,Adam West's groovy interpretation of the The C...,Batman: Return Of The Caped Crusaders,94,18,https://www.rottentomatoes.com/m/batman_return...,471.0
2089,0,Jay Roach,,2016-05-21,2016-09-06,NR,132.0,Amblin Television,Robert Schenkkan,83.0,Anchored by Bryan Cranston's phenomenal perfor...,All the Way,88,33,https://www.rottentomatoes.com/m/all_the_way_2016,2822.0


### Manually impute missing values for 'genre' column
- Total of 14 movies with missing value for 'genre'.
- Manually searched for genre from Google/Wikipedia/IMDb

In [4]:
# Genres looked up by hand for the titles whose 'genre' is missing,
# keyed by the exact movie title as it appears in the 'title' column.
manual_genres = {
    "Batman: Hush": "Action,Adventure",
    "The Death of Superman": "Sci-fi,Superhero",
    "Pokémon The Movie: I Choose You!": "Fantasy,Action",
    "The Jurassic Games": "Sci-fi,Action,Horror",
    "The Debt Collector": "Action",
    "Batman vs. Two-Face": "Action,Comedy",
    "Suicide Squad: Hell to Pay": "Action,Adventure",
    "Descendants 2": "Fantasy,Action",
    "Batman: Return Of The Caped Crusaders": "Crime,Sci-fi",
    "All the Way": "Drama,History",
    "Jane Wants a Boyfriend": "Drama,Romance",
    "400 Days": "Mystery,Sci-fi",
    "Manson Family Vacation": "Drama,Thriller",
    "Lost in the Sun": "Drama,Thriller",
}

# fillna only touches rows where 'genre' is missing, so a row that shares one
# of these titles but already has a genre is never overwritten. Rows whose
# title is not in the mapping are left unchanged. Re-running the cell is a no-op.
df["genre"] = df["genre"].fillna(df["title"].map(manual_genres))

In [5]:
# Standardise the genre labels:
#   - rename "Science Fiction" to "Sci-fi" and "Anime" to "Animation"
#   - turn " & " joins into commas so each part becomes its own genre token
df["genre"] = (
    df["genre"]
    .replace("Science Fiction", "Sci-fi", regex=True)
    .replace("Anime", "Animation", regex=True)
    .replace(" & ", ",", regex=True)
)

### Melt the 'genre' column (binary encoding)

In [6]:
# One-hot encode the comma-separated 'genre' strings:
# one 0/1 indicator column per genre, each prefixed with "genre_"
genre_melt = (
    df["genre"]
    .str.get_dummies(sep=",")
    .add_prefix("genre_")
)
print(genre_melt.columns)
genre_melt.head()


Index(['genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Art House',
       'genre_Classics', 'genre_Comedy', 'genre_Crime', 'genre_Cult Movies',
       'genre_Documentary', 'genre_Drama', 'genre_Faith', 'genre_Family',
       'genre_Fantasy', 'genre_Fitness', 'genre_Gay', 'genre_History',
       'genre_Horror', 'genre_International', 'genre_Kids', 'genre_Lesbian',
       'genre_Manga', 'genre_Musical', 'genre_Mystery',
       'genre_Performing Arts', 'genre_Romance', 'genre_Sci-fi',
       'genre_Special Interest', 'genre_Spirituality', 'genre_Sports',
       'genre_Superhero', 'genre_Suspense', 'genre_Television',
       'genre_Thriller', 'genre_Western'],
      dtype='object')


Unnamed: 0,genre_Action,genre_Adventure,genre_Animation,genre_Art House,genre_Classics,genre_Comedy,genre_Crime,genre_Cult Movies,genre_Documentary,genre_Drama,...,genre_Romance,genre_Sci-fi,genre_Special Interest,genre_Spirituality,genre_Sports,genre_Superhero,genre_Suspense,genre_Television,genre_Thriller,genre_Western
0,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Count the movies tagged with each genre (column totals of the indicator matrix)
genre_melt.sum()

genre_Action               783
genre_Adventure            778
genre_Animation            186
genre_Art House            500
genre_Classics              38
genre_Comedy              1171
genre_Crime                  1
genre_Cult Movies            3
genre_Documentary          804
genre_Drama               2391
genre_Faith                 10
genre_Family               175
genre_Fantasy              408
genre_Fitness               49
genre_Gay                    4
genre_History                1
genre_Horror               535
genre_International        500
genre_Kids                 175
genre_Lesbian                4
genre_Manga                  4
genre_Musical              162
genre_Mystery              887
genre_Performing Arts      162
genre_Romance              325
genre_Sci-fi               410
genre_Special Interest     347
genre_Spirituality          10
genre_Sports                49
genre_Superhero              1
genre_Suspense             886
genre_Television            42
genre_Th

### Combine multiple genres into a single 'genre cluster'
Using domain knowledge, we identify genres that are similar to each other, and group them into a single genre cluster, such as:
- genre_FamilyKids: Family, Kids (movies that exhibit a family or kids theme)
- genre_AnimationManga: Animation, Manga (movies that are animated or reference Japanese manga)
- genre_FitnessSports: Fitness, Sports (movies that exhibit fitness or sports theme)
- genre_DramaTele: Drama, Television (movies that are drama or TV series based)
- genre_MusicalPerfarts: Musical, Performing Arts (movies that exhibit a musical or performing arts theme)
- genre_FaithSpirit: Faith, Spirituality (movies that exhibit faith or spirituality theme)
- genre_ClassicsCult: Classics, Cult Movies (movies that are classics or cult classics)
- genre_ArthouseInter: Art House, International (art-house or international movies)
- genre_ThrillMysSusp: Thriller, Mystery, Suspense (movies that exhibit thriller, mystery or suspense theme)
- genre_GayLesbian: Gay, Lesbian (movies that exhibit a homosexual theme)
- genre_HistDocument: History, Documentary (documentary films or movies that are based on history)

With this procedure, we have reduced a binary matrix of 34 genres into 22 genre clusters.

In [8]:
# Combine similar genres into clusters.
# Each key is the new cluster column; its value lists the genre indicator
# columns that are merged into it and then removed.
genre_clusters = {
    "genre_FamilyKids": ["genre_Family", "genre_Kids"],
    "genre_AnimationManga": ["genre_Animation", "genre_Manga"],
    "genre_FitnessSports": ["genre_Fitness", "genre_Sports"],
    "genre_DramaTele": ["genre_Drama", "genre_Television"],
    "genre_MusicalPerfarts": ["genre_Musical", "genre_Performing Arts"],
    "genre_FaithSpirit": ["genre_Faith", "genre_Spirituality"],
    "genre_ClassicsCult": ["genre_Classics", "genre_Cult Movies"],
    "genre_ArthouseInter": ["genre_Art House", "genre_International"],
    "genre_ThrillMysSusp": ["genre_Thriller", "genre_Mystery", "genre_Suspense"],
    "genre_GayLesbian": ["genre_Gay", "genre_Lesbian"],
    "genre_HistDocument": ["genre_History", "genre_Documentary"],
}

for cluster_col, member_cols in genre_clusters.items():
    # A cluster column that already exists means this cell has been run
    # before and its member columns are gone; skip it instead of raising.
    if cluster_col in genre_melt.columns:
        continue
    # A movie belongs to the cluster if it is tagged with any member genre;
    # "* 1" turns the boolean result back into a 0/1 integer indicator.
    genre_melt[cluster_col] = genre_melt[member_cols].any(axis=1) * 1
    # The cluster column is appended at the end, then the members are removed,
    # which keeps the same column order as merging them one by one.
    genre_melt = genre_melt.drop(columns=member_cols)

In [9]:
# Remove the genre columns that are treated as not significant
genre_melt.drop(
    columns=[
        "genre_Crime",
        "genre_Superhero",
        "genre_GayLesbian",
        "genre_FaithSpirit",
    ],
    inplace=True,
)

In [10]:
# Replace the raw 'genre' text column with the encoded genre indicator columns,
# placed side by side with the remaining movie info columns
df.drop(columns=["genre"], inplace=True)
df = pd.concat((df, genre_melt), axis="columns")

### Melt the 'rating' column (binary encoding)
- then append the binary features to the original dataframe

In [11]:
# One-hot encode the 'rating' column; the indicator columns are named after
# the rating labels themselves (no prefix is added)
rating_melt = df["rating"].str.get_dummies(sep=",")

# Swap the original 'rating' column for its indicator columns
df.drop(columns=["rating"], inplace=True)
df = pd.concat((df, rating_melt), axis="columns")


### Drop columns that are not useful to predict the `audience_score` or `tomatometer`

In [12]:
# Discard the columns that will not be used as predictors
df = df.drop(
    columns=[
        "box_office",
        "director",
        "theater_date",
        "streaming_date",
        "studio",
        "written_by",
        "critics_consensus",
    ]
)


### Features 1: Movie information from Rotten Tomatoes (movie_df)


In [13]:
# Snapshot the cleaned table and write it to CSV without the row index
movie_df = df.copy(deep=True)
movie_df.to_csv("features_movie_info.csv", index=False)

In [14]:
# Preview the first five rows of the exported feature table
movie_df.head(5)

Unnamed: 0,runtime,audience_score,title,tomatometer,tomatometer_count,url,user_rating_count,genre_Action,genre_Adventure,genre_Comedy,...,genre_ClassicsCult,genre_ArthouseInter,genre_ThrillMysSusp,genre_HistDocument,G,NC17,NR,PG,PG-13,R
0,90.0,94.0,Toy Story 4,97,402,https://www.rottentomatoes.com/m/toy_story_4,53096.0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,95.0,71.0,Pretenders,22,9,https://www.rottentomatoes.com/m/pretenders,22.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,110.0,88.0,The Lion King,53,390,https://www.rottentomatoes.com/m/the_lion_king...,76190.0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,94.0,,Dilili in Paris (Dilili à Paris),60,20,https://www.rottentomatoes.com/m/dilili_in_paris,,0,0,0,...,0,1,1,0,0,0,0,1,0,0
4,105.0,79.0,Stuber,42,207,https://www.rottentomatoes.com/m/stuber,5388.0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
