# Feature Engineering

https://www.kaggle.com/datasets/bharatnatrayn/movies-dataset-for-feature-extracion-prediction?select=movies.csv

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

In [2]:
tqdm.pandas()

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv("../data/movies.csv")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


## Data Cleaning

### Year

In [5]:
df["YEAR"] = df["YEAR"].str.replace("(", "", regex=False)
df["YEAR"] = df["YEAR"].str.replace(")", "", regex=False)
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021–,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,2010–2022,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,2013–,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [6]:
df["TYPE"] = np.where(df["YEAR"].str.contains("–"), "Series", "Movie")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,,Movie
1,Masters of the Universe: Revelation,2021–,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,,Series
2,The Walking Dead,2010–2022,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,,Series
3,Rick and Morty,2013–,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,,Series
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,Movie


In [7]:
df["Year_From"] = np.NaN
df["Year_To"] = np.NaN

def extract_from(x):
    if pd.isna(x):
        return np.NaN

    year_to_return = None
    
    year = str(x)
    if "–" not in year:
        year_to_return = year
    else:
        years = year.split("–")
        year_to_return = years[0]
        
    year_to_return = re.sub("[^0-9]", "", year_to_return)
    return year_to_return
                    
def extract_to(x):
    if pd.isna(x):
        return np.NaN
    
    year_to_return = None
    
    year = str(x)
    if "–" not in year:
        return np.NaN
    else:
        years = year.split("–")
        year_to_return = re.sub("[^0-9]", "", years[1])
        if len(year_to_return) == 0:
            return np.NaN
        else:
            return year_to_return

df["Year_From"] = df["YEAR"].progress_apply(extract_from)
df["Year_To"] = df["YEAR"].progress_apply(extract_to)

df.sample(10)

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
9891,Dexter,2006–2013,"\nCrime, Drama, Mystery",9.5,\nDexter must buy time to protect himself when...,\n Director:\nS.J. Clarkson\n| \n Stars:...,5437,51.0,,Series,2006,2013.0
9076,Paranormal,2020–,"\nDrama, Fantasy, Horror",8.3,\nAfter a supposedly cursed mummy is autopsied...,"\n Directors:\nMajid Al Ansari, \nAmr Salam...",6673,46.0,,Series,2020,
26,Vikings,2013–2020,"\nAction, Adventure, Drama",8.5,\nVikings transports us to the brutal and myst...,"\n \n Stars:\nKatheryn Winnick, ...",459913,44.0,,Series,2013,2020.0
7934,Le Bazar de la Charité,2019,"\nDrama, History",7.7,"\nAlice loses her virginity with Victor, and t...",\n Director:\nAlexandre Laurent\n| \n St...,94,49.0,,Movie,2019,
5221,Tiffany Haddish: She Ready! From the Hood to H...,2017 TV Special,\nComedy,6.7,\nComedic breakout Tiffany Haddish delivers a ...,\n Director:\nChris Robinson\n| \n Star:...,468,65.0,,Movie,2017,
764,Roma,2018,\nDrama,7.7,\nA year in the life of a middle-class family'...,\n Director:\nAlfonso Cuarón\n| \n Stars...,146484,135.0,,Movie,2018,
1788,Racing Stripes,2005,"\nAdventure, Comedy, Drama",5.2,\nAn abandoned zebra grows up believing he is ...,\n Director:\nFrederik Du Chau\n| \n Sta...,16506,102.0,$49.77M,Movie,2005,
1940,Darwin's Game,2020–,"\nAnimation, Action, Drama",7.3,\nAn unknowing Sudou Kaname is invited to try ...,"\n \n Stars:\nYûsuke Kobayashi, ...",1344,23.0,,Series,2020,
121,Hawaii Five-0,2010–2020,"\nAction, Crime, Drama",7.3,\nSteve McGarrett returns home to Oahu in orde...,"\n \n Stars:\nAlex O'Loughlin, \...",68627,42.0,,Series,2010,2020.0
606,Into the Night,II 2020–,"\nDrama, Sci-Fi, Thriller",7.1,\nWhen radiation from the sun causes a global ...,"\n \n Stars:\nPauline Etienne, \...",23336,,,Series,2020,


### Genre

In [8]:
df["GENRE"] = df["GENRE"].str.replace("\n", "", regex=False)
df["GENRE"] = df["GENRE"].str.replace(" ", "", regex=False)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
9146,Lost Ollie,,"Animation,Adventure,Drama",,\nAdd a Plot\n,"\n \n Stars:\nGina Rodriguez, \n...",,,,Series,,
939,Baki,2018–2020,"Animation,Action,Drama",6.7,"\nThe protagonist, Baki Hanma, trains with an ...","\n \n Stars:\nTôru Furuya, \nKir...",4329.0,45.0,,Series,2018.0,2020.0
6130,The Ice Bucket Challenge,,"Biography,Drama",,"\nThe life story of Pete Frates, a former Bost...",\n,,,,Series,,
46,Ozark,2017–,"Crime,Drama,Thriller",8.4,\nA financial advisor drags his family from Ch...,"\n \n Stars:\nJason Bateman, \nL...",206090.0,60.0,,Series,2017.0,
8403,Fauda,2015–,"Action,Drama,Thriller",8.0,\nBashar returns to the west bank promising to...,\n Director:\nRotem Shamir\n| \n Stars:\...,289.0,39.0,,Series,2015.0,
2012,The Main Event,2020,"Action,Comedy,Family",4.8,"\nAfter discovering a magical mask, an 11-year...",\n Director:\nJay Karas\n| \n Stars:\nSe...,2062.0,101.0,,Movie,2020.0,
9249,The Upshaws,2021–,Comedy,8.5,\nWhen Bennie tries to live a healthier lifest...,\n Director:\nPhill Lewis\n| \n Stars:\n...,44.0,,,Series,2021.0,
1825,Soundtrack,2019,"Drama,Musical",6.8,\nMusic connects the lives of random people li...,"\n \n Stars:\nPaul James, \nCall...",1910.0,56.0,,Movie,2019.0,
6250,Nueva serie de Daniel Sánchez Arévalo,,Drama,,\nAn exciting story of friendship and persever...,\n,,,,Series,,
3839,Untold: Crimes and Penalties,2021,"Documentary,Sport",,\nAdd a Plot\n,"\n Directors:\nChapman Way, \nMaclain Way\n",,85.0,,Movie,2021.0,


#### Dummy on Genres

In [9]:
df.head(1)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
0,Blood Red Sky,2021,"Action,Horror,Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,Movie,2021,


In [10]:
df_dummy = df["GENRE"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Genre_')
df_dummy.head()

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
7467,The Chestnut Man,2021–,"Crime,Drama,Mystery",,\nAdd a Plot\n,\n Director:\nMikkel Serup\n| \n Stars:\...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
4261,Oh My Baby,2020,"Drama,Romance",7.3,\nJang Ha Ri is a 39-year-old single woman and...,"\n \n Stars:\nJang Na-ra, \nJoon...",132.0,60.0,,Movie,...,0,0,1,0,0,0,0,0,0,0
6542,Avatar: The Last Airbender,2005–2008,"Animation,Action,Adventure",9.1,\nAang relives the events after finding out th...,\n Director:\nLauren MacMullan\n| \n Sta...,3555.0,25.0,,Series,...,0,0,0,0,0,0,0,0,0,0
7171,Las chicas del cable,2017–2020,Drama,7.7,\nAdd a Plot\n,\n Director:\nAntonio Hernández\n| \n St...,95.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
6120,Slow Country,2017,"Action,Drama",4.6,"\nA homeless teenage mother, who gets herself ...",\n Director:\nEric Aghimien\n| \n Stars:...,32.0,115.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
8696,Baki,2018–2020,"Animation,Action,Drama",7.8,\nThe legendary Sea Emperor has to face the St...,\n Director:\nToshiki Hirano\n| \n Stars...,70.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
8538,The Midnight Gospel,2020–,"Animation,Adventure,Comedy",9.2,\nAt a prison for simulated beings suffering f...,"\n Directors:\nPendleton Ward, \nMike L. Ma...",997.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
2855,Nuestros amantes,2016,"Comedy,Drama,Romance",6.8,"\nOne man, one woman, one rule: not fall in love.",\n Director:\nMiguel Ángel Lamata\n| \n ...,3890.0,88.0,,Movie,...,0,0,1,0,0,0,0,0,0,0
1155,The Killer,I,"Action,Crime,Drama",,\nPlot unknown.,\n Director:\nDavid Fincher\n| \n Star:\...,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
2895,The King and I,1999,"Animation,Family,Fantasy",4.5,"\nTraveling to the exotic kingdom of Siam, Eng...",\n Director:\nRichard Rich\n| \n Stars:\...,3922.0,87.0,$11.96M,Movie,...,0,0,0,0,0,0,0,0,0,0


### Stars

In [12]:
df["STARS"] = df["STARS"].str.replace("\n", "", regex=False)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
2155,Life in Colour,2021–,Documentary,8.3,\nExploring the many ways animals use colour t...,"Stars:David Attenborough, Joer...",2173.0,59.0,,Series,...,0,0,0,0,0,0,0,0,0,0
9418,The Empress,2022–,"Drama,History,Romance",,\nAdd a Plot\n,"Directors:Florian Cossen, Katrin Gebbe| ...",,,,Series,...,0,0,1,0,0,0,0,0,0,0
7733,La casa de las flores,2018–2020,"Comedy,Drama",6.4,\nDiego confronts Lucía about a secret she's b...,Director:Yibran Asuad| Stars:Cecilia S...,123.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
6698,Never Have I Ever,2020–,Comedy,8.5,\nFlashbacks unearth painful memories and rese...,Director:Tristram Shapeero| Stars:Mait...,708.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
8715,Move to Heaven,2021–,Drama,8.8,\nA factory intern's fatal injury brings Han J...,"Stars:Lee Jehoon, Olivia Casta...",116.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
9146,Lost Ollie,,"Animation,Adventure,Drama",,\nAdd a Plot\n,"Stars:Gina Rodriguez, Jake Joh...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
6355,"Stamped: Racism, Antiracism and You",,Documentary,,"\nRacism, how it was originated in America and...",Director:Roger Ross Williams,,,,Series,...,0,0,0,0,0,0,0,0,0,0
4425,Residue,2015,"Drama,Horror,Mystery",6.1,\nThe government cover-up of the causes behind...,"Stars:Natalia Tena, Iwan Rheon...",2551.0,130.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
2518,The Adventures of Puss in Boots,2015–2018,"Animation,Action,Adventure",6.7,\nThe famous cat protects the city of San Lore...,"Stars:Eric Bauza, Jayma Mays, ...",1491.0,23.0,,Series,...,0,0,0,0,0,0,0,0,0,0
7435,Grand Army,2020,Drama,8.3,"\nDom, Joey and Sid find some release. After h...",Director:Clement Virgo| Stars:Odessa A...,128.0,71.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df["Directors"] = None
df["Actors"] = None

def extract_directors(x):
    if "Director" in x:
        stars = x.split("|")
        if "Director" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return np.NaN
    
def extract_actors(x):
    if "Star" in x:
        stars = x.split("|")
        if "Star" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return np.NaN
    

df["Directors"] = df["STARS"].progress_apply(extract_directors)
df["Actors"] = df["STARS"].progress_apply(extract_actors)

df["Directors"] = df["Directors"].str.replace("Director:", "", regex=False)
df["Directors"] = df["Directors"].str.replace("Directors:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Star:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Stars:", "", regex=False)

df.sample(10)

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western,Directors,Actors
727,Dirty Grandpa,2016,Comedy,5.9,"\nRight before his wedding, an uptight guy is ...",Director:Dan Mazer| Stars:Robert De Ni...,117026.0,102.0,$35.59M,Movie,...,0,0,0,0,0,0,0,0,Dan Mazer,"Robert De Niro, Zac Efron, Zoey Deutch, A..."
3643,The Short Game,2013,"Documentary,Sport",7.3,\nThe best 7 year old golfers from around the ...,Director:Josh Greenbaum| Stars:Amari A...,1439.0,99.0,$0.04M,Movie,...,0,0,0,1,0,0,0,0,Josh Greenbaum,"Amari Avery, Jed Dy, Allan Kournikova, Za..."
3237,Barbie Princess Adventure,2020,"Animation,Family,Musical",5.6,"\nWith new friends in a new kingdom, Barbie le...",Director:Conrad Helten| Stars:Ogie Ban...,288.0,72.0,,Movie,...,0,0,0,0,0,0,0,0,Conrad Helten,"Ogie Banks, Eamon Brennan, Nakia Burrise,..."
5950,Lavell Crawford: Can a Brother Get Some Love,2011 TV Special,"Documentary,Comedy",7.0,\nLavell Crawford is quickly climbing the come...,Director:Michael Drumm| Stars:Lavell C...,213.0,81.0,,Movie,...,0,0,0,0,0,0,0,0,Michael Drumm,"Lavell Crawford, Kareem Taylor"
3457,Dark Net,2016–2017,Documentary,7.1,\nExplores the furthest reaches of the interne...,"Stars:Lauren Terp, Kristie, Dr...",1447.0,30.0,,Series,...,0,0,0,0,0,0,0,0,,"Lauren Terp, Kristie, Drew, An..."
902,Suburra - La serie,2017–2020,"Action,Crime",7.9,"\nIn 2008, a fight over land in a seaside town...","Stars:Alessandro Borghi, Giaco...",12933.0,50.0,,Series,...,0,0,0,0,0,0,0,0,,"Alessandro Borghi, Giacomo Fer..."
6360,Oceans,,Documentary,,"\nThe stories of the oceans of the planet, whi...",,,,,Series,...,0,0,0,0,0,0,0,0,,
2799,Cuando los ángeles duermen,2018,"Action,Adventure,Crime",5.5,\nA CEO lives the worst night of his life afte...,Director:Gonzalo Bendala| Stars:Julián...,3386.0,91.0,,Movie,...,0,0,0,0,0,0,0,0,Gonzalo Bendala,"Julián Villagrán, Marian Álvarez, Ester E..."
5193,Jachebalgwang Opiseu,2017,"Comedy,Drama,Romance",7.0,\nAn office romantic-comedy features the hijin...,"Stars:Ko Asung, Ha Seok-jin, D...",187.0,65.0,,Movie,...,1,0,0,0,0,0,0,0,,"Ko Asung, Ha Seok-jin, Dong-hw..."
9194,Power Rangers: Dino Fury,2021,"Action,Adventure,Comedy",,\nAdd a Plot\n,"Stars:Russell Curry, Hunter De...",,21.0,,Movie,...,0,0,0,0,0,0,0,0,,"Russell Curry, Hunter Deno, Ka..."


#### Dummy on Actors

In [14]:
df["Actors"] = df["Actors"].str.replace(", ", ",", regex=False)
df['Actors'] = df['Actors'].str.strip()

df_dummy = df["Actors"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Actor_')
df_dummy.head()

Unnamed: 0,Actor_2 Chainz,Actor_2'Live Bre,Actor_2Mex,Actor_50 Cent,Actor_A Boogie wit da Hoodie,Actor_A.J. Baime,Actor_A.J. Daulerio,Actor_A.J. LoCascio,Actor_A.N.T.I.,Actor_AJ Bowen,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df_dummy.shape

(9999, 17325)

In [16]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
1754,Masum,2017,"Crime,Drama,Mystery",8.4,"\nCevdet (Haluk Bilginer), a retired commissio...","Stars:Haluk Bilginer, Nur Süre...",16763.0,60.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
9997,The Imperfects,2021–,"Adventure,Drama,Fantasy",,\nAdd a Plot\n,Director:Jovanka Vuckovic| Stars:Morga...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
2627,6 Balloons,2018,Drama,5.9,\nA woman (Jacobson) learns her brother (Franc...,Director:Marja-Lewis Ryan| Stars:Abbi ...,3865.0,74.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
9895,Dexter,2006–2013,"Crime,Drama,Mystery",9.2,\nDexter gets some much-needed R&R time with R...,Director:John Dahl| Stars:Michael C. H...,4256.0,55.0,,Series,...,0,0,0,0,0,0,0,0,0,0
8400,The Protector,2018–2020,"Action,Fantasy,Sci-Fi",7.5,\nA mysterious symbol offers Hakan a clue as h...,Director:Burcu Alptekin| Stars:Çagatay...,305.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
6548,The Blue Planet,2001,Documentary,8.1,\nCoral reefs are the rainforests of the sea; ...,"Stars:David Attenborough, Pier...",594.0,49.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
6475,Avatar: The Last Airbender,2005–2008,"Animation,Action,Adventure",7.6,\nSokka and Katara have to solve a centuries-o...,Director:Lauren MacMullan| Stars:Zach ...,2871.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4755,Live at the Foxes Den,2013,Drama,5.0,\nA corporate lawyer reconsiders his career op...,Director:Michael Kristoff| Stars:Jacks...,141.0,103.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
2519,Christmas with a View,2018 TV Movie,"Drama,Romance",5.2,\nSecret aspirations between a failed restaura...,Director:Justin G. Dyck| Stars:Kaitlyn...,3906.0,91.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
6918,Outer Banks,2020–,"Action,Crime,Drama",7.7,\nAs the gang hatches an illicit scheme to get...,Director:Cherie Nowlan| Stars:Chase St...,762.0,47.0,,Series,...,0,0,0,0,0,0,0,0,0,0


#### Dummy on Directors

In [17]:
df["Directors"] = df["Directors"].str.replace(", ", ",", regex=False)
df['Directors'] = df['Directors'].str.strip()

df_dummy = df["Directors"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Director_')
df_dummy.head()

Unnamed: 0,Director_Aadish Keluskar,Director_Aaron Augenblick,Director_Aaron Burns,Director_Aaron Hann,Director_Aaron Lieber,Director_Aaron Long,Director_Aaron Moorhead,Director_Aaron Saidman,Director_Aaron Sorkin,Director_Aban Raza,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_dummy.shape

(9999, 4083)

In [19]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
1434,The Get Down,2016–2017,"Drama,Music,Musical",8.3,\nA ragtag group of teenagers run wild in the ...,"Stars:Justice Smith, Shameik M...",21504.0,52.0,,Series,...,0,0,0,0,0,0,0,0,0,0
5244,Cuckoo Song,,"Drama,Horror,Mystery",,"\nIn the haunting aftermath of war, a missing ...",,,60.0,,Series,...,0,0,0,0,0,0,0,0,0,0
3294,Quantum Quest: A Cassini Space Odyssey,2010,"Animation,Adventure,Sci-Fi",5.2,"\nQuantum Quest centers on the story of Dave, ...","Directors:Harry 'Doc' Kloor, Daniel St. Pi...",350.0,45.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
5542,God Loves Uganda,2013,Documentary,7.4,\nAn account of the American Evangelicals' att...,Director:Roger Ross Williams,1179.0,83.0,$0.05M,Movie,...,0,0,0,0,0,0,0,0,0,0
361,Zoo,2015–2017,"Drama,Mystery,Sci-Fi",6.7,\nA young scientist searches to find out what'...,"Stars:James Wolk, Kristen Conn...",23095.0,42.0,,Series,...,0,0,0,0,0,0,0,0,0,0
8788,Exatlon Challenge,2020–,"Game-Show,Reality-TV,Sport",4.3,\nThe contestants swap teams. Will the White T...,Director:Mustafa Kazan| Stars:Dilan Ay...,16.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
6709,On My Block,2018–,"Comedy,Drama",8.4,"\nCesar meets his fate head-on, but friendship...",Director:Lauren Iungerich| Stars:Sierr...,199.0,30.0,,Series,...,0,0,0,0,0,0,0,0,0,0
5904,Visions of Us: LGBTQ+ Latine Representation in...,2021,Documentary,,\nVisions of Us is a four-part digital documen...,,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
5946,Stamped from the Beginning,,Documentary,,\nFollowing the roots and evolution of racist ...,Director:Roger Ross Williams,,,,Series,...,0,0,0,0,0,0,0,0,0,0
3896,Jim Jefferies: Intolerant,2020 TV Special,Comedy,7.2,"\nBetween scenes from an excruciating date, Ji...",Director:Scott Zabielski| Star:Jim Jef...,2394.0,66.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df.shape

(9999, 21449)

### One-Line

In [21]:
df["ONE-LINE"] = df["ONE-LINE"].str.replace("\n", "", regex=False)
df["ONE-LINE"] = np.where(df["ONE-LINE"] == "Add a Plot", np.NaN, df["ONE-LINE"])
df["ONE-LINE"] = np.where(df["ONE-LINE"] == "Plot unknown.", np.NaN, df["ONE-LINE"])
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
8421,Buddi,2020–,Animation,,"The Buddis bounce, spin, glide - and giggle. -...","Directors:Jason Jameson, Robert Milne, Kar...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
2023,Eojjeoda Balgyeonhan Haru,2019,"Comedy,Fantasy,Romance",7.9,What would you do if you found out that you we...,"Stars:Kim Hye-Yoon, Ro-Woon Ki...",1868.0,35.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
8680,Dexter,2006–2013,"Crime,Drama,Mystery",8.1,Dexter scrambles to convince Rita that his mar...,Director:Keith Gordon| Stars:Michael C...,3651.0,54.0,,Series,...,0,0,0,0,0,0,0,0,0,0
3647,Secreto bien guardado,2019,Drama,4.9,,"Stars:Oriana Sabatini, Victori...",162.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
6051,Parrot Heads,2017,"Documentary,Music",6.5,"An in depth look at the world of Parrot Heads,...",Director:Bryce Wagoner| Stars:Jimmy Bu...,65.0,88.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
2525,Walt Before Mickey,2015,"Biography,Drama",5.9,"Based on the book ""Walt Before Mickey"" covers ...",Director:Khoa Le| Stars:Thomas Ian Nic...,5887.0,107.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
5609,The Calling,2018–2019,Reality-TV,8.1,Three people compete in travel show across Ind...,,11.0,30.0,,Series,...,0,0,0,0,0,0,0,0,0,0
9651,True Story,III,"Comedy,Drama",,,Director:Stephen Williams| Stars:Kevin...,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
6894,Historia de un crimen: Colmenares,2019,Crime,7.0,González focuses his attention on Carlos Cárde...,Director:Felipe Martínez Amador| Stars...,28.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
6687,The Queen's Gambit,2020,Drama,8.5,Russian class opens the door to a new social s...,Director:Scott Frank| Stars:Anya Taylo...,7730.0,49.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


## Missing Values

In [22]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross", "TYPE",
                "Year_From", "Year_To"]
missing_df = df[column_names].isna().sum().to_frame()
missing_df = missing_df.rename(columns={0:"missing"})
missing_df["percentage"] = (missing_df["missing"] / df.shape[0]) * 100
missing_df

Unnamed: 0,missing,percentage
MOVIES,0,0.0
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,1272,12.721272
STARS,0,0.0
VOTES,1820,18.20182
RunTime,2958,29.582958
Gross,9539,95.39954
TYPE,0,0.0
Year_From,644,6.440644


### Year

In [23]:
# replacing missing values from Year_TO with Year_FROM

df["Year_To"] = np.where(df["Year_To"].isna(), df["Year_From"], df["Year_To"])

### RunTime

In [24]:
df[column_names].describe()

Unnamed: 0,RATING,RunTime
count,8179.0,7041.0
mean,6.921176,68.688539
std,1.220232,47.258056
min,1.1,1.0
25%,6.2,36.0
50%,7.1,60.0
75%,7.8,95.0
max,9.9,853.0


In [25]:
df[df["RunTime"] >= 853]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
1902,El tiempo entre costuras,2013–2014,"Adventure,Drama,History",8.3,Sira Quiroga is a young Spanish dressmaker eng...,"Stars:Adriana Ugarte, Mari Car...",3876,853.0,,Series,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df[df["RunTime"].isna()].sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
8523,Sky Rojo,2021–,"Action,Adventure,Crime",6.6,An injured Wendy contemplates seeking revenge ...,Director:David Victori| Stars:Verónica...,109.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
9477,Bir Denizalti Hikayesi,,"Action,Adventure,Drama",,,"Stars:Kivanç Tatlitug, Özge Öz...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
4473,House of Secrets: The Burari Deaths,2021–,"Documentary,Crime,History",,"Suicide, murder or something else? This docuse...",,,,,Series,...,0,0,0,0,0,0,0,0,0,0
8832,The Protector,2018–2020,"Action,Fantasy,Sci-Fi",7.3,Faysal turns the tables on Vizier. Trapped in ...,Director:Burcu Alptekin| Stars:Çagatay...,167.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
8611,Magic for Humans,2018–,"Comedy,Reality-TV",7.7,"Traditions don't need to make sense, they just...","Directors:Adam Franklin, Dave Paige| S...",41.0,,,Series,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df[column_names][df["TYPE"] == "Movie"].describe()

Unnamed: 0,RATING,RunTime
count,4374.0,4137.0
mean,6.489209,89.269761
std,1.200889,46.489358
min,1.8,1.0
25%,5.8,64.0
50%,6.6,90.0
75%,7.4,105.0
max,9.4,573.0


In [28]:
df[column_names][df["TYPE"] == "Series"].describe()

Unnamed: 0,RATING,RunTime
count,3805.0,2904.0
mean,7.41774,39.368802
std,1.040193,29.549575
min,1.1,1.0
25%,6.9,24.0
50%,7.5,38.0
75%,8.1,47.0
max,9.9,853.0


In [29]:
df["RunTime"][df["TYPE"] == "Movie"].isna().sum()

650

In [30]:
df.loc[(df["TYPE"] == "Movie") & (df["RunTime"].isna()), 'RunTime'] = 89

In [31]:
df["RunTime"][df["TYPE"] == "Series"].isna().sum()

2308

In [32]:
df.loc[(df["TYPE"] == "Series") & (df["RunTime"].isna()), 'RunTime'] = 39

### Elements Missing in Row

In [33]:
df_missing_in_records = df[column_names].isna().sum(axis=1).to_frame()
df_missing_in_records = df_missing_in_records.rename(columns={0:"missing"})

df_missing_in_records.describe()

Unnamed: 0,missing
count,9999.0
mean,1.582058
std,1.361621
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,7.0


In [34]:
df = df.merge(df_missing_in_records, how="inner", left_index=True, right_index=True)

In [35]:
df[df["missing"] > 4].sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur,missing
8428,In from the Cold,,"Crime,Drama,Thriller",,,Director:Ami Canaan Mann| Stars:Ivanna...,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
9276,Stay Close,,"Crime,Drama,Mystery",,,"Stars:Leon Annor, Bethany Anto...",,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
8244,One Piece,,"Action,Adventure,Fantasy",,,,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
6187,Untitled Prentice Penny/Holiday Project,,Comedy,,"Follows convicted felon T.R. ""The Brick"" Johns...",,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,5
9492,Heartstopper,,"Drama,Romance",,,"Director:Euros Lyn| Stars:Kit Connor, ...",,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6


### Removing null values

In [36]:
df = df.dropna(subset=["GENRE", "RATING", "Year_From", "Year_To"])

### Final Checking

In [37]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross", "TYPE",
                "Year_From", "Year_To"]
missing_df = df[column_names].isna().sum().to_frame()
missing_df = missing_df.rename(columns={0:"missing"})
missing_df["percentage"] = (missing_df["missing"] / df.shape[0]) * 100
missing_df

Unnamed: 0,missing,percentage
MOVIES,0,0.0
GENRE,0,0.0
RATING,0,0.0
ONE-LINE,371,4.542116
STARS,0,0.0
VOTES,0,0.0
RunTime,0,0.0
Gross,7708,94.368266
TYPE,0,0.0
Year_From,0,0.0


## Dropping Redundant Columns

In [38]:
df.drop(columns=["MOVIES", "YEAR", "GENRE", "STARS", "ONE-LINE", "Gross",
                 "Directors", "Actors", "missing"], inplace=True)

## Fixing Votes

In [40]:
df["VOTES"] = df["VOTES"].str.replace(",", "", regex=False)
df.head()

Unnamed: 0,RATING,VOTES,RunTime,TYPE,Year_From,Year_To,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,6.1,21062,121.0,Movie,2021,2021,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17870,25.0,Series,2021,2021,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,8.2,885805,44.0,Series,2010,2022,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.2,414849,23.0,Series,2013,2013,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,7.6,25858,50.0,Series,2020,2020,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
df = df.astype({"VOTES": "int64"})
df.dtypes

RATING                            float64
VOTES                               int64
RunTime                           float64
TYPE                               object
Year_From                          object
                                   ...   
Director_Ángel Gómez Hernández      int64
Director_Ángeles Reiné              int64
Director_Åke Sandgren               int64
Director_Óscar Pedraza              int64
Director_Ömer Ugur                  int64
Length: 21441, dtype: object

## Saving

In [45]:
df.to_parquet("../data/movies-clean.parquet.gzip", compression='gzip')