# Feature Engineering

https://www.kaggle.com/datasets/bharatnatrayn/movies-dataset-for-feature-extracion-prediction?select=movies.csv

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

In [2]:
tqdm.pandas()

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv("../data/movies.csv")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


## Data Cleaning

### Year

In [5]:
df["YEAR"] = df["YEAR"].str.replace("(", "", regex=False)
df["YEAR"] = df["YEAR"].str.replace(")", "", regex=False)
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021–,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,2010–2022,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,2013–,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [6]:
df["TYPE"] = np.where(df["YEAR"].str.contains("–"), "Series", "Movie")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,,Movie
1,Masters of the Universe: Revelation,2021–,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,,Series
2,The Walking Dead,2010–2022,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,,Series
3,Rick and Morty,2013–,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,,Series
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,Movie


In [7]:
df["Year_From"] = np.NaN
df["Year_To"] = np.NaN

def extract_from(x):
    if pd.isna(x):
        return np.NaN

    year_to_return = None
    
    year = str(x)
    if "–" not in year:
        year_to_return = year
    else:
        years = year.split("–")
        year_to_return = years[0]
        
    year_to_return = re.sub("[^0-9]", "", year_to_return)
    return year_to_return
                    
def extract_to(x):
    if pd.isna(x):
        return np.NaN
    
    year_to_return = None
    
    year = str(x)
    if "–" not in year:
        return np.NaN
    else:
        years = year.split("–")
        year_to_return = re.sub("[^0-9]", "", years[1])
        if len(year_to_return) == 0:
            return np.NaN
        else:
            return year_to_return

df["Year_From"] = df["YEAR"].progress_apply(extract_from)
df["Year_To"] = df["YEAR"].progress_apply(extract_to)

df.sample(10)

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
1360,Day of the Dead: Bloodline,2017,"\nAction, Drama, Horror",3.4,\nA small group of military personnel and surv...,\n Director:\nHèctor Hernández Vicens\n| \n...,10173.0,90.0,,Movie,2017,
2660,On the Verge,2021–,\nComedy,,\nA look a life for a group of women in their ...,"\n \n Stars:\nJulie Delpy, \nSar...",,,,Series,2021,
7595,Brews Brothers,2020–,\nComedy,6.2,\nAdd a Plot\n,\n Director:\nJeff Schaffer\n| \n Stars:...,68.0,,,Series,2020,
1717,Jungle Beat: The Movie,2020,"\nAnimation, Adventure, Comedy",5.5,\nA homesick alien crash-lands his spaceship n...,\n Director:\nBrent Dawes\n| \n Stars:\n...,760.0,88.0,,Movie,2020,
3238,Dukhtar,2014,"\nDrama, Thriller",7.0,"\nIn the mountains of Pakistan, a mother and h...",\n Director:\nAfia Nathaniel\n| \n Stars...,2324.0,93.0,,Movie,2014,
5927,Diário de um Exorcista - Zero,2016,"\nHorror, Thriller",3.8,\nWhen a mysterious tragedy traumatizes his fa...,\n Director:\nRenato Siqueira\n| \n Star...,607.0,98.0,,Movie,2016,
8165,Kalifat,2020–,"\nCrime, Drama, Thriller",7.8,\nPervin scrambles to cover her tracks after a...,\n Director:\nGoran Kapetanovic\n| \n St...,301.0,47.0,,Series,2020,
4445,Muqaddar Ka Faisla,1987,"\nAction, Comedy, Drama",4.8,\nAn honest Hindu pandit (priest) is framed fo...,\n Director:\nPrakash Mehra\n| \n Stars:...,55.0,171.0,,Movie,1987,
7247,Midnight Mass,2021–,"\nDrama, Horror, Mystery",,\nparticipation of the cast of the series (Mid...,\n Director:\nMike Flanagan\n| \n Stars:...,,,,Series,2021,
7444,Cannon Busters,2019–,"\nAnimation, Fantasy",6.3,\nAdd a Plot\n,"\n \n Stars:\nGreg Chun, \nTrevo...",35.0,,,Series,2019,


### Genre

In [8]:
df["GENRE"] = df["GENRE"].str.replace("\n", "", regex=False)
df["GENRE"] = df["GENRE"].str.replace(" ", "", regex=False)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
4100,InMates,2017–,Comedy,6.6,"\nWhile living under one roof, five close-knit...","\n \n Stars:\nAshish Verma, \nMu...",1200.0,25.0,,Series,2017.0,
4733,Theo Von: No Offense,2016 TV Special,"Documentary,Comedy",5.6,\nSouthern comedian Theo Von offers up a twist...,\n Director:\nJohn Asher\n| \n Star:\nTh...,745.0,67.0,,Movie,2016.0,
9059,Maid,2021–,"Comedy,Drama",,\nAdd a Plot\n,\n Director:\nNzingha Stewart\n| \n Star...,,,,Series,2021.0,
6170,Hans Teeuwen: Spiksplinter,2011 TV Special,"Documentary,Comedy",6.5,\nHard pruning jokes are interspersed with mus...,\n Director:\nDiederik Ebbinge\n| \n Sta...,272.0,110.0,,Movie,2011.0,
5536,Pemburu di Manchester Biru,2020,"Biography,Drama",5.9,\nThis movie tells the story of Hanif Thamrin ...,\n Director:\nRako Prijanto\n| \n Stars:...,36.0,90.0,,Movie,2020.0,
7617,El inocente,2021,"Crime,Drama,Mystery",8.1,\nKimmy gives Olivia the info she needs. Mat t...,\n Director:\nOriol Paulo\n| \n Stars:\n...,567.0,71.0,,Movie,2021.0,
1936,Much Loved,2015,Drama,6.2,\nA group of women in Morocco make a living as...,\n Director:\nNabil Ayouch\n| \n Stars:\...,2535.0,104.0,,Movie,2015.0,
8107,The Circle,2020–,"Game-Show,Reality-TV",7.9,\nMercedeze faces her toughest critics and mak...,"\n \n Stars:\nMichelle Buteau, \...",141.0,45.0,,Series,2020.0,
6181,Again,,"Crime,Drama,Thriller",,\nA female homicide detective in New Delhi dis...,\n,,,,Series,,
2649,Find Me in Your Memory,2020–,"Fantasy,Mystery,Romance",7.4,\nA love story between a man who remembers eve...,"\n \n Stars:\nDong-wook Kim, \nM...",492.0,35.0,,Series,2020.0,


#### Dummy on Genres

In [9]:
df.head(1)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
0,Blood Red Sky,2021,"Action,Horror,Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,Movie,2021,


In [10]:
df_dummy = df["GENRE"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Genre_')
df_dummy.head()

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
1110,Johnny Test,2005–,"Animation,Action,Adventure",5.1,\nThe adventures of average suburban boy Johnn...,\n \n Stars:\nJames Arnold Taylo...,7357.0,30.0,,Series,...,0,0,0,0,0,0,0,0,0,0
6539,Dexter,2006–2013,"Crime,Drama,Mystery",8.4,\nDexter and Debra try to dissuade their boss ...,\n Director:\nMichael Cuesta\n| \n Stars...,5212.0,51.0,,Series,...,0,0,0,0,0,0,0,0,0,0
5260,The Ollie & Moon Show,2017–,"Animation,Family",8.3,\nTwo cats travel the globe and teach children...,"\n \n Stars:\nMattea Conforti, \...",59.0,11.0,,Series,...,0,0,0,0,0,0,0,0,0,0
3560,Untold: Deal with the Devil,2021,"Documentary,Sport",,\nChristy Martin broke boundaries and noses as...,\n Director:\nLaura Brownson\n,,77.0,,Movie,...,0,0,0,0,0,1,0,0,0,0
6853,The Pentaverate,II,Comedy,,\nAdd a Plot\n,\n Director:\nTim Kirkby\n| \n Stars:\nD...,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
1674,B: The Beginning,2018–,"Animation,Action,Crime",7.2,"\nIn this anime series, scientists hope ""new h...","\n \n Stars:\nHiroaki Hirata, \n...",4285.0,25.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4537,"Gabriel ""Fluffy"" Iglesias: One Show Fits All",2019 TV Special,Comedy,7.3,"\nGabriel ""Fluffy"" Iglesias discusses his teen...",\n Director:\nManny Rodriguez\n| \n Star...,1624.0,90.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
6894,Historia de un crimen: Colmenares,2019,Crime,7.0,\nGonzález focuses his attention on Carlos Cár...,\n Director:\nFelipe Martínez Amador\n| \n ...,28.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
3263,Coin Heist,2017,"Crime,Drama,Romance",4.8,"\nUnited by dire circumstances, four unlikely ...",\n Director:\nEmily Hagins\n| \n Stars:\...,2543.0,97.0,,Movie,...,0,0,1,0,0,0,0,0,0,0
5408,Homies,2015,Comedy,5.8,\nAfter they accidentally get involved in a me...,\n Director:\nJon Karthaus\n| \n Stars:\...,1036.0,100.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


### Stars

In [12]:
df["STARS"] = df["STARS"].str.replace("\n", "", regex=False)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
194,Baby Driver,2017,"Action,Crime,Drama",7.6,\nAfter being coerced into working for a crime...,Director:Edgar Wright| Stars:Ansel Elg...,468655,113.0,$107.83M,Movie,...,0,0,0,0,0,0,0,0,0,0
4179,Shelby American,2019,"Documentary,Biography,Family",7.3,\nThe story of automotive icon Carroll Shelby ...,"Directors:Nate Adams, Adam Carolla| St...",606,119.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
3542,Liu xing hua yuan,2001–2002,"Comedy,Drama,Romance",7.8,\nMeteor Garden is Taiwanese adaptation the Ja...,"Stars:Barbie Hsu, Jerry Yan, V...",789,60.0,,Series,...,0,0,1,0,0,0,0,0,0,0
3736,Too Funny to Fail: The Life & Death of The Dan...,2017,Documentary,7.7,"\nCreators, cast and network executives recoun...",Director:Josh Greenbaum| Stars:Dana Ca...,1369,92.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
5759,David A. Arnold Fat Ballerina,2020 TV Special,Comedy,7.4,\nAdd a Plot\n,Director:Milton Horowitz| Star:David A...,59,60.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
3882,Chosen,2017–2019,Action,5.5,\nCHOSEN is a psychological thriller about an ...,"Stars:Blue Cheng-Lung Lan, Jia...",115,,,Series,...,0,0,0,0,0,0,0,0,0,0
3716,Einsatzgruppen: The Nazi Death Squads,2009,"Documentary,History,War",7.6,"\nTestimony from witnesses, survivors and Nazi...","Stars:Christopher Browning, Ch...",544,45.0,,Movie,...,0,0,0,0,0,0,0,0,1,0
2220,We the People,2021–,"Animation,Short,Comedy",4.6,\nLearn the basics of rights and citizenship w...,"Stars:Brandi Carlile, Cordae, ...",202,3.0,,Series,...,0,0,0,0,1,0,0,0,0,0
310,Locke & Key,2020–,"Drama,Fantasy,Horror",7.4,\nAfter their father is murdered under mysteri...,"Stars:Darby Stanchfield, Conno...",47717,48.0,,Series,...,0,0,0,0,0,0,0,0,0,0
7779,Dynasty,2017–,Drama,8.0,\nDominique presents Fallon with a business pr...,Director:Matt Earl Beesley| Stars:Eliz...,196,42.0,,Series,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df["Directors"] = None
df["Actors"] = None

def extract_directors(x):
    if "Director" in x:
        stars = x.split("|")
        if "Director" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return np.NaN
    
def extract_actors(x):
    if "Star" in x:
        stars = x.split("|")
        if "Star" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return np.NaN
    

df["Directors"] = df["STARS"].progress_apply(extract_directors)
df["Actors"] = df["STARS"].progress_apply(extract_actors)

df["Directors"] = df["Directors"].str.replace("Director:", "", regex=False)
df["Directors"] = df["Directors"].str.replace("Directors:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Star:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Stars:", "", regex=False)

df.sample(10)

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western,Directors,Actors
9936,Élite,2018–,"Crime,Drama,Thriller",,\nAdd a Plot\n,"Stars:Itzan Escamilla, Omar Ay...",,,,Series,...,0,0,0,0,0,1,0,0,,"Itzan Escamilla, Omar Ayuso, C..."
915,The Daily Life of the Immortal King,2020–,"Animation,Adventure",7.4,\nWang Ling is a near-invincible existence wit...,"Stars:Qian Chen, Gu Jiangshan,...",559.0,18.0,,Series,...,0,0,0,0,0,0,0,0,,"Qian Chen, Gu Jiangshan, Lulu,..."
7078,The Sinner,2017–,"Crime,Drama,Mystery",7.5,\nDetective Ambrose investigates a tragic yet ...,Director:Adam Bernstein| Stars:Bill Pu...,1669.0,49.0,,Series,...,0,0,0,0,0,0,0,0,Adam Bernstein,"Bill Pullman, Jessica Hecht, Parisa Fitz-..."
6829,How to Sell Drugs Online (Fast),2019–,"Comedy,Crime,Drama",7.8,\nWhen his girlfriend breaks up with him after...,Director:Lars Montag| Stars:Maximilian...,854.0,32.0,,Series,...,0,0,0,0,0,0,0,0,Lars Montag,"Maximilian Mundt, Danilo Kamperidis, Lena..."
7124,White Lines,2020,"Crime,Drama,Mystery",7.3,\nDuring a dinner meeting organized by David t...,Director:Ashley Way| Stars:Laura Haddo...,301.0,,,Movie,...,0,0,0,0,0,0,0,0,Ashley Way,"Laura Haddock, Nuno Lopes, Marta Milans, ..."
8062,Next in Fashion,2020,Reality-TV,7.3,"\nIn it to win it, 18 designers pair up to con...","Stars:Alexa Chung, Tan France,...",113.0,50.0,,Movie,...,0,0,0,0,0,0,0,0,,"Alexa Chung, Tan France, Marco..."
6590,Bleach: Burîchi,2004–2012,"Animation,Action,Adventure",8.7,\nSensing great reiatsu near the Repentance Ce...,"Directors:Noriyuki Abe, Kôji Aritomi| ...",199.0,24.0,,Series,...,0,0,0,0,0,0,0,0,"Noriyuki Abe, Kôji Aritomi","Masakazu Morita, Fumiko Orikasa, Shirô Sa..."
1996,Operation Christmas Drop,2020,"Comedy,Family,Romance",5.8,\nCongressional aide Erica (Graham) forgoes fa...,Director:Martin Wood| Stars:Kat Graham...,9647.0,95.0,,Movie,...,1,0,0,0,0,0,0,0,Martin Wood,"Kat Graham, Alexander Ludwig, Trezzo Maho..."
2323,Shot in the Dark,2017,"Documentary,Reality-TV",8.0,\nThree rival freelance stringers scour the st...,"Stars:Zak Holman, Scott Lane, ...",1677.0,37.0,,Movie,...,0,0,0,0,0,0,0,0,,"Zak Holman, Scott Lane, Austin..."
3688,The Man in the Iron Mask,II 1998,Adventure,4.9,\nA lad jousting with his tutor is kidnaped an...,Director:William Richert| Stars:Edward...,226.0,85.0,,Movie,...,0,0,0,0,0,0,0,0,William Richert,"Edward Albert, Dana Barron, Timothy Botto..."


#### Dummy on Actors

In [14]:
df["Actors"] = df["Actors"].str.replace(", ", ",", regex=False)
df['Actors'] = df['Actors'].str.strip()

df_dummy = df["Actors"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Actor_')
df_dummy.head()

Unnamed: 0,Actor_2 Chainz,Actor_2'Live Bre,Actor_2Mex,Actor_50 Cent,Actor_A Boogie wit da Hoodie,Actor_A.J. Baime,Actor_A.J. Daulerio,Actor_A.J. LoCascio,Actor_A.N.T.I.,Actor_AJ Bowen,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df_dummy.shape

(9999, 17325)

In [16]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
1961,Det som göms i snö,2018–,"Crime,Drama,Mystery",7.4,\nAfter a long sick leave and recovery police ...,"Stars:Robert Gustafsson, Louis...",1765.0,43.0,,Series,...,0,0,0,0,0,0,0,0,0,0
2741,Battlefish,2018–,Reality-TV,7.1,"\nOver the course of the fishing season, a gro...","Stars:Justin Bradbury, William...",385.0,42.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4757,Jimmy Carr: The Best of Ultimate Gold Greatest...,2019 TV Special,Comedy,7.3,\nNothing is off limits as Jimmy Carr serves u...,Director:Brian Klein| Star:Jimmy Carr,1753.0,58.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
9794,Magic for Humans by Mago Pop,2021–,"Comedy,Reality-TV",8.0,\nAdd a Plot\n,"Stars:Antonio Díaz, Paco Plaza",6.0,22.0,,Series,...,0,0,0,0,0,0,0,0,0,0
6569,Bleach: Burîchi,2004–2012,"Animation,Action,Adventure",8.3,"\nRukia, wandering the streets at night, runs ...","Directors:Noriyuki Abe, Jun'ya Koshiba| ...",216.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
3773,Le guetteur,2012,"Action,Crime,Thriller",5.7,\nA detective hunts for the marksman who foile...,Director:Michele Placido| Stars:Daniel...,2673.0,89.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
9750,The Cuphead Show!,2021–,"Animation,Action,Adventure",,\nAdd a Plot\n,"Stars:Tru Valentino, Frank Todaro",,,,Series,...,0,0,0,0,0,0,0,0,0,0
4718,Hartenstraat,2014,Romance,6.3,\nSingle father and Deli owner Daan tries to f...,Director:Sanne Vogel| Stars:Marwan Ken...,2068.0,90.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
2232,Miss Americana,2020,"Documentary,Biography,Music",7.4,\nA look at iconic pop artist Taylor Swift dur...,Director:Lana Wilson| Stars:Taylor Swi...,17209.0,85.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
2120,Geunyeoneun yeppeodda,2015,"Comedy,Romance",7.7,"\nWhen Ji Sung-Joon was young, he was ugly. As...","Stars:Hwang Jeong-eum, Seo Jun...",4012.0,60.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


#### Dummy on Directors

In [17]:
df["Directors"] = df["Directors"].str.replace(", ", ",", regex=False)
df['Directors'] = df['Directors'].str.strip()

df_dummy = df["Directors"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Director_')
df_dummy.head()

Unnamed: 0,Director_Aadish Keluskar,Director_Aaron Augenblick,Director_Aaron Burns,Director_Aaron Hann,Director_Aaron Lieber,Director_Aaron Long,Director_Aaron Moorhead,Director_Aaron Saidman,Director_Aaron Sorkin,Director_Aban Raza,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_dummy.shape

(9999, 4083)

In [19]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
3595,SanPa: Sins of the Savior,2020,"Documentary,Crime",7.8,"\nAmidst a heroin crisis, Vincenzo Muccioli ca...","Stars:Vincenzo Andreucci, Anto...",1297.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
7241,The Sandman,2021–,"Action,Adventure,Drama",,\nAdd a Plot\n,Director:Jamie Childs| Stars:Jenna Col...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
4334,#TemanTapiMenikah,2018,"Biography,Comedy,Drama",6.9,\nAyudia (Vanesha Prescilla) and Ditto (Adipat...,Director:Rako Prijanto| Stars:Adipati ...,602.0,102.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
3054,Santana,2020,"Action,Crime,Drama",3.8,"\nTwo cops, two brothers. One is out for justi...","Directors:Maradona Dias Dos Santos, Chris ...",876.0,106.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
1571,Falsa Identidad,2018–,"Crime,Drama,Thriller",7.2,\nDiego is a hustler and is forced to leave th...,"Stars:Luis Ernesto Franco, Sam...",262.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
1431,Inside Man: Most Wanted,2019,"Action,Crime,Thriller",5.6,\nAn NYPD hostage negotiator teams up with a f...,Director:M.J. Bassett| Stars:Aml Ameen...,3439.0,105.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
4812,Jigeum Uri Hakgyoneun,2021–,"Drama,Fantasy,Horror",,\nThe story of a group of students stuck in th...,"Stars:Yi-Hyun Cho, Ji-hu Park,...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
1491,Homunculus,2021,"Drama,Fantasy,Mystery",5.6,\nTruth and illusion blurs when a homeless amn...,Director:Takashi Shimizu| Stars:Gô Aya...,1819.0,115.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
1822,I.T.,2016,"Crime,Drama,Mystery",5.5,\nA millionaire has his life turned upside dow...,Director:John Moore| Stars:Pierce Bros...,15373.0,95.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
6560,Bleach: Burîchi,2004–2012,"Animation,Action,Adventure",7.6,\nKûkaku Shiba's plan is to catapult Ichigo an...,"Directors:Noriyuki Abe, Akira Iwanaga| ...",182.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df.shape

(9999, 21449)

### One-Line

In [21]:
df["ONE-LINE"] = df["ONE-LINE"].str.replace("\n", "", regex=False)
df["ONE-LINE"] = np.where(df["ONE-LINE"] == "Add a Plot", np.NaN, df["ONE-LINE"])
df["ONE-LINE"] = np.where(df["ONE-LINE"] == "Plot unknown.", np.NaN, df["ONE-LINE"])
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
1829,Riphagen,2016,"Biography,Drama,War",7.1,"The story about Riphagen, a cunning Dutch trai...",Director:Pieter Kuijpers| Stars:Jeroen...,7016,131.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
7162,Bridgerton,2020–,"Drama,Romance",7.6,Simon's apparent courtship of Daphne angers he...,Director:Tom Verica| Stars:Adjoa Andoh...,1948,61.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4853,The Mars Generation,2017,Documentary,6.4,Aspiring teenage astronauts reveal that a jour...,Director:Michael Barnett| Stars:Elsa S...,954,97.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
8095,Fauda,2015–,"Action,Drama,Thriller",8.3,Doron is determined to stop Fawze by any means...,"Director:Rotem Shamir| Stars:Lior Raz,...",272,50.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4603,Ronny Chieng: Asian Comedian Destroys America,2019 TV Special,Comedy,7.4,"Ronny Chieng (""The Daily Show,"" ""Crazy Rich As...",Director:Sebastian DiNatale| Star:Ronn...,2554,63.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
7686,Wir sind die Welle,2019–,Drama,6.7,"Obedient high school-er Lea is drawn to smart,...",Director:Anca Miruna Lazarescu| Stars:...,148,48.0,,Series,...,0,0,0,0,0,0,0,0,0,0
3779,Joan Didion: The Center Will Not Hold,2017,Documentary,7.4,Literary icon Joan Didion reflects on her rema...,Director:Griffin Dunne| Stars:Hilton A...,2005,94.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
7114,El caso Alcàsser,2019,"Documentary,Crime",7.3,,Director:León Siminiani| Stars:Luisa G...,106,,,Movie,...,0,0,0,0,0,0,0,0,0,0
1398,Romeo & Juliet,II 2013,"Drama,Romance",5.8,Romeo and Juliet secretly wed despite the swor...,Director:Carlo Carlei| Stars:Hailee St...,11411,118.0,$1.16M,Movie,...,0,0,0,0,0,0,0,0,0,0
5337,Color Me Barbra,1966 TV Special,"Comedy,Music",8.1,"Barbra Streisand's second television special, ...",Director:Dwight Hemion| Star:Barbra St...,267,60.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


## Missing Values

In [22]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross", "TYPE",
                "Year_From", "Year_To"]
missing_df = df[column_names].isna().sum().to_frame()
missing_df = missing_df.rename(columns={0:"missing"})
missing_df["percentage"] = (missing_df["missing"] / df.shape[0]) * 100
missing_df

Unnamed: 0,missing,percentage
MOVIES,0,0.0
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,1272,12.721272
STARS,0,0.0
VOTES,1820,18.20182
RunTime,2958,29.582958
Gross,9539,95.39954
TYPE,0,0.0
Year_From,644,6.440644


### Year

In [23]:
# replacing missing values from Year_TO with Year_FROM

df["Year_To"] = np.where(df["Year_To"].isna(), df["Year_From"], df["Year_To"])

### RunTime

In [24]:
df[column_names].describe()

Unnamed: 0,RATING,RunTime
count,8179.0,7041.0
mean,6.921176,68.688539
std,1.220232,47.258056
min,1.1,1.0
25%,6.2,36.0
50%,7.1,60.0
75%,7.8,95.0
max,9.9,853.0


In [25]:
df[df["RunTime"] >= 853]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
1902,El tiempo entre costuras,2013–2014,"Adventure,Drama,History",8.3,Sira Quiroga is a young Spanish dressmaker eng...,"Stars:Adriana Ugarte, Mari Car...",3876,853.0,,Series,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df[df["RunTime"].isna()].sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
3105,The Playbook,2020,"Documentary,Sport",7.5,The Playbook profiles legendary coaches as the...,"Stars:Jill Ellis, Patrick Mour...",1165.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
9869,Cobra Kai,2018–,"Action,Comedy,Drama",,,"Stars:Ralph Macchio, William Z...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
9926,Ginny & Georgia,2021–,"Comedy,Drama",,,Director:Anya Adams| Stars:Brianne How...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
1396,Love Hard,2021,"Comedy,Romance",,A woman travels to her online date's hometown ...,Director:Hernan Jimenez| Stars:Darren ...,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
9108,Vikings: Valhalla,,"Action,Adventure,Drama",,,"Stars:James Ballanger, Laura B...",,,,Series,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df[column_names][df["TYPE"] == "Movie"].describe()

Unnamed: 0,RATING,RunTime
count,4374.0,4137.0
mean,6.489209,89.269761
std,1.200889,46.489358
min,1.8,1.0
25%,5.8,64.0
50%,6.6,90.0
75%,7.4,105.0
max,9.4,573.0


In [28]:
df[column_names][df["TYPE"] == "Series"].describe()

Unnamed: 0,RATING,RunTime
count,3805.0,2904.0
mean,7.41774,39.368802
std,1.040193,29.549575
min,1.1,1.0
25%,6.9,24.0
50%,7.5,38.0
75%,8.1,47.0
max,9.9,853.0


In [29]:
df["RunTime"][df["TYPE"] == "Movie"].isna().sum()

650

In [30]:
df.loc[(df["TYPE"] == "Movie") & (df["RunTime"].isna()), 'RunTime'] = 89

In [31]:
df["RunTime"][df["TYPE"] == "Series"].isna().sum()

2308

In [32]:
df.loc[(df["TYPE"] == "Series") & (df["RunTime"].isna()), 'RunTime'] = 39

### Elements Missing in Row

In [33]:
df_missing_in_records = df[column_names].isna().sum(axis=1).to_frame()
df_missing_in_records = df_missing_in_records.rename(columns={0:"missing"})

df_missing_in_records.describe()

Unnamed: 0,missing
count,9999.0
mean,1.582058
std,1.361621
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,7.0


In [34]:
df = df.merge(df_missing_in_records, how="inner", left_index=True, right_index=True)

In [35]:
df[df["missing"] > 4].sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur,missing
9273,Stay Close,,"Crime,Drama,Mystery",,,"Stars:Bethany Antonia, Richard...",,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
6109,The Stronghold,,"Action,Crime",,Police brigade faces strong pressure from thei...,Director:Cédric Jimenez,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,5
5287,13 Minutes,,"Drama,Thriller",,Unusual events begin to occur for a high schoo...,Star:Jimmy Waitman,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,5
5736,Free Food for Millionaires,,Drama,,"Casey Han, a strong-willed, Queens-bred daught...",,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,5
5089,Dorothy and Alice,,Fantasy,,Heroines from the the classic novels 'The Wiza...,,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,5


### Removing null values

In [36]:
df = df.dropna(subset=["GENRE", "RATING", "Year_From", "Year_To"])

### Final Checking

In [37]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross", "TYPE",
                "Year_From", "Year_To"]
missing_df = df[column_names].isna().sum().to_frame()
missing_df = missing_df.rename(columns={0:"missing"})
missing_df["percentage"] = (missing_df["missing"] / df.shape[0]) * 100
missing_df

Unnamed: 0,missing,percentage
MOVIES,0,0.0
GENRE,0,0.0
RATING,0,0.0
ONE-LINE,371,4.542116
STARS,0,0.0
VOTES,0,0.0
RunTime,0,0.0
Gross,7708,94.368266
TYPE,0,0.0
Year_From,0,0.0


## Dropping Redundant Columns

In [38]:
df.drop(columns=["MOVIES", "YEAR", "GENRE", "STARS", "ONE-LINE", "Gross",
                 "Directors", "Actors", "missing"], inplace=True)

## Saving

In [40]:
df.to_parquet("../data/movies-clean.parquet.gzip", compression='gzip')