# Feature Engineering

https://www.kaggle.com/datasets/bharatnatrayn/movies-dataset-for-feature-extracion-prediction?select=movies.csv

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

In [2]:
tqdm.pandas()

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv("../data/movies.csv")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


## Data Cleaning

### Year

In [5]:
df["YEAR"] = df["YEAR"].str.replace("(", "", regex=False)
df["YEAR"] = df["YEAR"].str.replace(")", "", regex=False)
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021–,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,2010–2022,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,2013–,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [6]:
df["TYPE"] = np.where(df["YEAR"].str.contains("–"), "Series", "Movie")
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE
0,Blood Red Sky,2021,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,,Movie
1,Masters of the Universe: Revelation,2021–,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,,Series
2,The Walking Dead,2010–2022,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,,Series
3,Rick and Morty,2013–,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,,Series
4,Army of Thieves,2021,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,Movie


In [7]:
df["Year_From"] = np.NaN
df["Year_To"] = np.NaN

def extract_from(x):
    if pd.isna(x):
        return np.NaN

    year_to_return = None
    
    year = str(x)
    if "–" not in year:
        year_to_return = year
    else:
        # 2010-2011 -> [2010, 2011]
        # 2010-     -> [2010,]
        years = year.split("–")
        year_to_return = years[0]
        
    # ABC2010 -> 2010
    year_to_return = re.sub("[^0-9]", "", year_to_return)
    return year_to_return
                    
def extract_to(x):
    if pd.isna(x):
        return np.NaN
    
    year_to_return = None
    
    year = str(x)
    if "–" not in year:
        return np.NaN
    else:
        years = year.split("–")
        year_to_return = re.sub("[^0-9]", "", years[1])
        if len(year_to_return) == 0:
            return np.NaN
        else:
            return year_to_return

df["Year_From"] = df["YEAR"].progress_apply(extract_from)
df["Year_To"] = df["YEAR"].progress_apply(extract_to)

df.sample(10)

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
5711,Facing Darkness,2017,"\nDocumentary, Drama",7.3,\nA Christian relief organization is met with ...,\n Director:\nArthur Rasco\n| \n Stars:\...,88.0,93.0,$0.56M,Movie,2017.0,
5289,David Cross: Making America Great Again,2016 TV Special,\nComedy,6.5,\nActor/comedian David Cross skewers Donald Tr...,\n Director:\nAlex Coletti\n| \n Star:\n...,1573.0,74.0,,Movie,2016.0,
8872,¡A cantar!,2020–,"\nGame-Show, Music",6.7,\nAdd a Plot\n,\n Director:\nSalva Romero\n| \n Stars:\...,7.0,39.0,,Series,2020.0,
9684,I [Heart] Arlo,2021–,"\nAnimation, Short, Adventure",,\nAdd a Plot\n,\n \n Star:\nMichael J. Woodard\n,,,,Series,2021.0,
4403,Ari Shaffir: Double Negative,2016 TV Special,\nComedy,6.8,\nComedian Ari Shaffir steps outside of his ya...,\n Director:\nEric Abrams\n| \n Star:\nA...,667.0,,,Movie,2016.0,
2337,Go! Vive a Tu Manera,2019,"\nComedy, Musical, Romance",7.1,\nFiercely talented Mia receives a scholarship...,"\n \n Stars:\nRebecca Davis, \nM...",491.0,40.0,,Movie,2019.0,
9802,From Scratch,,\nDrama,,\nAdd a Plot\n,\n Director:\nDennie Gordon\n| \n Stars:...,,,,Series,,
4324,Exatlon Challenge,2020–,"\nGame-Show, Reality-TV, Sport",3.8,\nOrkun Isitmak hosts an epic sports challenge...,"\n \n Stars:\nDilan Ay, \nDilara...",505.0,46.0,,Series,2020.0,
1026,When We First Met,2018,"\nComedy, Fantasy, Romance",6.4,\nNoah meets Avery at a Halloween party and fa...,\n Director:\nAri Sandel\n| \n Stars:\nA...,44822.0,97.0,,Movie,2018.0,
7388,Johnny Test,2005–,"\nAnimation, Action, Adventure",6.5,\nJohnny and Dukey want to win a radio station...,\n Director:\nJoseph Sherman\n| \n Stars...,26.0,23.0,,Series,2005.0,


### Genre

In [8]:
df["GENRE"] = df["GENRE"].str.replace("\n", "", regex=False)
df["GENRE"] = df["GENRE"].str.replace(" ", "", regex=False)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
7438,Valeria,2020–,"Comedy,Drama,Romance",7.5,\nAdd a Plot\n,\n Director:\nInma Torrente\n| \n Stars:...,60.0,,,Series,2020,
6018,Patton Oswalt: My Weakness Is Strong,2009 TV Special,"Documentary,Comedy",7.6,"\nThe critically-acclaimed comedian, actor, wr...",\n Director:\nJason Woliner\n| \n Star:\...,1096.0,64.0,,Movie,2009,
8803,Cyberpunk: Edgerunners,2022–,"Animation,Action,Adventure",,\nAdd a Plot\n,\n,,,,Series,2022,
1451,Sexy Beasts,2014–,Comedy,5.5,\nA show on a mission to to proving that perso...,"\n \n Stars:\nRob Delaney, \nDan...",53.0,,,Series,2014,
3846,Robin Robin,2021 TV Special,"Animation,Short,Adventure",,"\nIn this stop-motion short film, a bird raise...","\n Directors:\nDaniel Ojari, \nMichael Plea...",,30.0,,Movie,2021,
1893,Eastsiders,2012–,Drama,6.9,\nEastSiders is a two-time Daytime Emmy nomina...,"\n \n Stars:\nKit Williamson, \n...",2460.0,115.0,,Series,2012,
6753,Power Rangers Beast Morphers,2019–2020,"Action,Adventure,Drama",8.7,\nNate develops new technology from a captured...,\n Director:\nRiccardo Pellizzeri\n| \n ...,43.0,,,Series,2019,2020.0
160,Dawson's Creek,1998–2003,"Drama,Romance",6.6,\nFour friends in a small coastal town help ea...,\n \n Stars:\nJames Van Der Beek...,38117.0,60.0,,Series,1998,2003.0
418,Geostorm,2017,"Action,Sci-Fi,Thriller",5.3,\nWhen the network of satellites designed to c...,\n Director:\nDean Devlin\n| \n Stars:\n...,99724.0,109.0,$33.70M,Movie,2017,
3283,Bottom of the World,2017,"Drama,Mystery,Thriller",5.2,\nThe mysterious disappearance of a young woma...,\n Director:\nRichard Sears\n| \n Stars:...,4836.0,85.0,,Movie,2017,


#### Dummy on Genres

In [9]:
df.head(1)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,Year_From,Year_To
0,Blood Red Sky,2021,"Action,Horror,Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,Movie,2021,


In [10]:
df_dummy = df["GENRE"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Genre_')
df_dummy.head()

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
7674,Living Undocumented,2019–,Documentary,5.7,\nAdd a Plot\n,"\n Directors:\nAnna Chai, \nAaron Saidman\n",30.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
8754,Floor Is Lava,2020–,"Game-Show,Reality-TV",6.6,\nAdd a Plot\n,\n Director:\nBrian Smith\n| \n Star:\nR...,83.0,31.0,,Series,...,0,1,0,0,0,0,0,0,0,0
9540,First Kill,,"Drama,Horror,Mystery",,\nAdd a Plot\n,\n \n Stars:\nElizabeth Mitchell...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
8483,Poketto monsutâ,1997–,"Animation,Action,Adventure",8.3,\nProfessor Cerise gives Ash and Goh tickets t...,"\n \n Stars:\nSarah Natochenny, ...",50.0,21.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4270,Aggressive Retsuko,2016–,"Animation,Comedy,Musical",8.3,"\nIn this series of shorts, a polite accountan...",\n \n Star:\nKaolip\n,948.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
733,Tribes of Europa,2021–,"Action,Adventure,Drama",6.8,\n2074. In the wake of a mysterious global dis...,\n \n Stars:\nHenriette Confuriu...,10043.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
8770,Rádio Coisa Mais Linda,2020,,,\nAdd a Plot\n,\n Director:\nCaito Ortiz\n| \n Stars:\n...,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
9501,Jurassic World: Camp Cretaceous,2020–,"Animation,Action,Adventure",7.3,\nDarius and Sammy face a moral dilemma while ...,"\n Directors:\nZesung Kang, \nShih Ming Tay...",248.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
9371,Gojira shingyura pointo,2021–,"Animation,Action,Adventure",7.3,\nMei learns of a large explosion triggered by...,"\n Directors:\nNao Miyoshi, \nNoriyuki Noma...",50.0,24.0,,Series,...,0,0,0,0,0,0,0,0,0,0
5748,Filthy Gorgeous: The Bob Guccione Story,2013,"Documentary,Biography",6.5,\nA look at the extraordinary world of Penthou...,\n Director:\nBarry Avrich\n| \n Stars:\...,353.0,96.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


### Stars

In [12]:
df["STARS"] = df["STARS"].str.replace("\n", "", regex=False)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
3077,The Holiday Calendar,2018,"Drama,Fantasy,Romance",5.7,\nA struggling but talented photographer inher...,Director:Bradley Walsh| Stars:Kat Grah...,8871.0,95.0,,Movie,...,0,0,1,0,0,0,0,0,0,0
5204,Jeff Dunham: Relative Disaster,2017 TV Special,Comedy,6.4,"\nWelcome to Ireland, where Ventriloquist Jeff...","Directors:Michael Simon, Matthew McNeil| ...",1211.0,70.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
9911,Pretty Smart,,Comedy,,\nAdd a Plot\n,Director:Jody Margolin Hahn| Stars:Cin...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
8214,Country Comfort,2021,"Comedy,Drama,Family",6.8,\nHarmony leads to heartache as aspiring singe...,Director:Kelly Park| Stars:Katharine M...,109.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
9932,Ginny & Georgia,2021–,"Comedy,Drama",,\nAdd a Plot\n,Director:Anya Adams| Stars:Brianne How...,,,,Series,...,0,0,0,0,0,0,0,0,0,0
1660,The Christmas Chronicles,2018,"Adventure,Comedy,Family",7.0,"\nThe story of sister and brother, Kate and Te...",Director:Clay Kaytis| Stars:Kurt Russe...,63607.0,104.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
3860,Distancia de rescate,2021,"Drama,Mystery,Thriller",,\nIt will tell the haunting story of broken so...,Director:Claudia Llosa| Stars:María Va...,,,,Movie,...,0,0,0,0,0,0,0,1,0,0
2035,When Heroes Fly,2018–,"Action,Drama",7.2,"\nFour friends, 11 years after a major falling...","Stars:Tomer Capon, Ninet Tayeb...",2666.0,45.0,,Series,...,0,0,0,0,0,0,0,0,0,0
2048,Il Divin Codino,2021,"Biography,Drama,Sport",6.1,\nBiographical film about Italian footballer R...,Director:Letizia Lamartire| Stars:Andr...,3523.0,91.0,,Movie,...,0,0,0,0,0,1,0,0,0,0
1483,Yummy,2019,"Action,Comedy,Horror",5.9,"\nAn orgy of blood, violence and fun in which ...",Director:Lars Damoiseaux| Stars:Maaike...,4301.0,88.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df["Directors"] = None
df["Actors"] = None

def extract_directors(x):
    if "Director" in x:
        stars = x.split("|")
        if "Director" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return np.NaN
    
def extract_actors(x):
    if "Star" in x:
        stars = x.split("|")
        if "Star" in stars[0]:
            return stars[0]
        else:
            return stars[1]
    else:
        return np.NaN
    

df["Directors"] = df["STARS"].progress_apply(extract_directors)
df["Actors"] = df["STARS"].progress_apply(extract_actors)

df["Directors"] = df["Directors"].str.replace("Director:", "", regex=False)
df["Directors"] = df["Directors"].str.replace("Directors:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Star:", "", regex=False)
df["Actors"] = df["Actors"].str.replace("Stars:", "", regex=False)

df.sample(10)

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western,Directors,Actors
862,The Do-Over,2016,"Action,Adventure,Comedy",5.7,\nTwo down-on-their-luck guys decide to fake t...,Director:Steven Brill| Stars:Adam Sand...,42672.0,108.0,,Movie,...,0,0,0,0,0,0,0,0,Steven Brill,"Adam Sandler, David Spade, Paula Patton, ..."
3432,Suzzanna: Bernapas dalam Kubur,2018,"Comedy,Drama,Horror",5.6,"\nAfter a pregnant woman is murdered, her spir...","Directors:Rocky Soraya, Anggy Umbara| ...",834.0,125.0,,Movie,...,0,0,0,0,0,0,0,0,"Rocky Soraya, Anggy Umbara","Luna Maya, Herjunot Ali, T. Rifnu Wikana,..."
7569,Brews Brothers,2020–,Comedy,5.3,\nBrewery owner Wilhelm Rodman gets a visit fr...,Director:Jeff Schaffer| Stars:Mike Cas...,104.0,,,Series,...,0,0,0,0,0,0,0,0,Jeff Schaffer,"Mike Castle, Alan Aisenberg, Carmen Flood..."
9734,Centaurworld,2021–,"Animation,Adventure,Comedy",,\nAdd a Plot\n,"Stars:Parvesh Cheena, Chris Di...",,,,Series,...,0,0,0,0,0,0,0,0,,"Parvesh Cheena, Chris Diamanto..."
8171,Dirty Money,2018–,"Documentary,Crime",7.6,\nWells Fargo was long seen as the 'golden chi...,Director:Dan Krauss| Stars:Yesenia Gui...,365.0,,,Series,...,0,0,0,0,0,0,0,0,Dan Krauss,"Yesenia Guitron, Kilian Colin, Peter, Emi..."
2715,7 Guardians of the Tomb,2018,"Action,Adventure,Horror",4.5,\nAn innocent discovery of a well-preserved mu...,Director:Kimble Rendall| Stars:Bingbin...,3681.0,90.0,,Movie,...,0,0,0,0,0,0,0,0,Kimble Rendall,"Bingbing Li, Kellan Lutz, Kelsey Grammer,..."
4396,Iwa Kakeru! Sport Climbing Girls,2020,"Animation,Sport",6.1,"\nKonomi Kasahara, a junior high school girl, ...","Stars:Sumire Uesaka, Yui Ishik...",145.0,24.0,,Movie,...,0,0,0,1,0,0,0,0,,"Sumire Uesaka, Yui Ishikawa, A..."
7088,Dark,2017–2020,"Crime,Drama,Mystery",9.5,"\nOn the day of the apocalypse, Clausen execut...",Director:Baran bo Odar| Stars:Sandra B...,12762.0,57.0,,Series,...,0,0,0,0,0,0,0,0,Baran bo Odar,"Sandra Borgmann, Karoline Eichhorn, Carlo..."
4597,Chelsea Peretti: One of the Greats,2014 TV Special,Comedy,7.0,\nWriter/actress/comedienne Chelsea Peretti pr...,Director:Lance Bangs| Stars:Chelsea Pe...,2277.0,74.0,,Movie,...,0,0,0,0,0,0,0,0,Lance Bangs,"Chelsea Peretti, Brendon Walsh, John Howa..."
7834,The Dragon Prince,2018–,"Animation,Adventure,Drama",8.8,\nAs the human army marches towards Xadia Vire...,Director:Villads Spangsberg| Stars:Rac...,434.0,,,Series,...,0,0,0,0,0,0,0,0,Villads Spangsberg,"Racquel Belmonte, Paula Burrows, Jack De ..."


#### Dummy on Actors

In [14]:
df["Actors"] = df["Actors"].str.replace(", ", ",", regex=False)
df['Actors'] = df['Actors'].str.strip()

df_dummy = df["Actors"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Actor_')
df_dummy.head()

Unnamed: 0,Actor_2 Chainz,Actor_2'Live Bre,Actor_2Mex,Actor_50 Cent,Actor_A Boogie wit da Hoodie,Actor_A.J. Baime,Actor_A.J. Daulerio,Actor_A.J. LoCascio,Actor_A.N.T.I.,Actor_AJ Bowen,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df_dummy.shape

(9999, 17325)

In [16]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Actor_Özge Borak,Actor_Özge Özpirinçci,Actor_Özgür Emre Yildirim,Actor_Özgür Ozan,Actor_Özkan Ugur,Actor_Özz Nûjen,Actor_Úrsula Corberó,Actor_Úrsula Pruneda,Actor_Ülkü Duru,Actor_Þorsteinn Bachmann
5419,After Tiller,2013,"Documentary,Biography,Drama",7.4,"\nThis thought provoking, sometimes troubling ...","Directors:Martha Shane, Lana Wilson| S...",1046.0,85.0,$0.07M,Movie,...,0,0,0,0,0,0,0,0,0,0
4814,O Crush Perfeito,2020–,"Reality-TV,Romance",5.3,\nPeople from different genders and sexual ori...,Star:Dieter Truppel,105.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
6009,Derren Brown: Infamous,2014 TV Special,Reality-TV,7.6,\nDerren Brown's sixth live stage show which t...,"Director:Andy Nyman| Stars:Carl Barma,...",244.0,71.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
5399,Mr. Angel,2013,"Documentary,Biography",6.6,\nMr. Angel chronicles the extraordinary life ...,"Director:Dan Hunt| Stars:Lux Alptraum,...",298.0,68.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
4203,Gajokggiri Wae Irae,2014–2015,"Comedy,Drama,Romance",7.0,\nDrama series follows the daily lives of a ch...,"Stars:Dong-Geun Yoo, Kim Hyun-...",223.0,65.0,,Series,...,0,0,0,0,0,0,0,0,0,0
6010,Rob Schneider: Soy Sauce and the Holocaust,2013 TV Special,Comedy,5.6,\nRob Schneider tells the side-splitting stori...,Director:Ryan Polito| Star:Rob Schneider,263.0,60.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
6245,Edoardo Ferrario: Temi Caldi,2019 TV Special,Comedy,6.8,\nAdd a Plot\n,Director:Francesco Imperato| Star:Edoa...,114.0,65.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
4833,Charlie and the Chocolate Factory,,"Animation,Adventure,Comedy",,\nAnimated series based on Roald Dahl's classi...,,,,,Series,...,0,0,0,0,0,0,0,0,0,0
5894,Gina Yashere: Skinny B*tch,2008 TV Special,"Documentary,Comedy",6.4,"\nLive DVD from Last Comic Standing star, Gina...",Director:Paul M. Green| Star:Gina Yashere,70.0,58.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
118,Star Trek: Deep Space Nine,1993–1999,"Action,Adventure,Drama",8.0,\nIn the vicinity of the liberated planet of B...,"Stars:Avery Brooks, Rene Auber...",55687.0,45.0,,Series,...,0,0,0,0,0,0,0,0,0,0


#### Dummy on Directors

In [17]:
df["Directors"] = df["Directors"].str.replace(", ", ",", regex=False)
df['Directors'] = df['Directors'].str.strip()

df_dummy = df["Directors"].str.get_dummies(sep=",")
df_dummy = df_dummy.add_prefix('Director_')
df_dummy.head()

Unnamed: 0,Director_Aadish Keluskar,Director_Aaron Augenblick,Director_Aaron Burns,Director_Aaron Hann,Director_Aaron Lieber,Director_Aaron Long,Director_Aaron Moorhead,Director_Aaron Saidman,Director_Aaron Sorkin,Director_Aban Raza,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_dummy.shape

(9999, 4083)

In [19]:
# merging/joining by index
df = df.merge(df_dummy, how="inner", left_index=True, right_index=True)
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
7693,Beastars,2019–,"Animation,Crime,Drama",7.8,\nA brutal murder takes place at a school popu...,Director:Shin'ichi Matsumi| Stars:Chik...,419.0,23.0,,Series,...,0,0,0,0,0,0,0,0,0,0
5168,El sendero de la anaconda,2019,Documentary,6.8,\nAdd a Plot\n,Director:Alessandro Angulo,243.0,73.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
8052,Black Summer,2019–,"Action,Drama,Horror",7.8,\nLife and death spin savagely out of control....,"Director:John Hyams| Stars:Jaime King,...",736.0,44.0,,Series,...,0,0,0,0,0,0,0,0,0,0
6762,Ultraman,2019–,"Animation,Action,Adventure",7.2,"\nEmbattled at first, Shinjiro finds his resol...","Directors:Shinji Aramaki, Kenji Kamiyama, ...",71.0,,,Series,...,0,0,0,0,0,0,0,0,0,0
4718,Hartenstraat,2014,Romance,6.3,\nSingle father and Deli owner Daan tries to f...,Director:Sanne Vogel| Stars:Marwan Ken...,2068.0,90.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
2904,Duck Duck Goose,I 2018,"Animation,Adventure,Comedy",5.8,\nA bachelor goose must form a bond with two l...,Director:Christopher Jenkins| Stars:Ji...,3335.0,91.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
5778,Uncoupled,,"Comedy,Thriller",,"\nThe script penned by Fox, is a comedic thril...",,,,,Series,...,0,0,0,0,0,0,0,0,0,0
7378,13 Reasons Why,2017–2020,"Drama,Mystery,Thriller",5.7,"\nThe police question Tyler about the guns, le...",Director:Russell Mulcahy| Stars:Dylan ...,1798.0,59.0,,Series,...,0,0,0,0,0,0,0,0,0,0
7007,La Reina de Indias y el Conquistador,2020–,"Drama,History",,\nFather Sancho moves into the governor's resi...,"Stars:Emmanuel Esparza, Essine...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
5843,Dino Daycare,,"Animation,Adventure,Comedy",,\nFollows a 6-year-old human boy named Cole as...,,,,,Series,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df.shape

(9999, 21449)

### One-Line

In [21]:
df["ONE-LINE"] = df["ONE-LINE"].str.replace("\n", "", regex=False)
df["ONE-LINE"] = np.where(df["ONE-LINE"] == "Add a Plot", np.NaN, df["ONE-LINE"])
df["ONE-LINE"] = np.where(df["ONE-LINE"] == "Plot unknown.", np.NaN, df["ONE-LINE"])
df.sample(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
4403,Ari Shaffir: Double Negative,2016 TV Special,Comedy,6.8,Comedian Ari Shaffir steps outside of his yama...,Director:Eric Abrams| Star:Ari Shaffir,667.0,,,Movie,...,0,0,0,0,0,0,0,0,0,0
9870,Bee and PuppyCat,2013–,"Animation,Short,Action",,,,,,,Series,...,0,0,0,0,0,0,0,0,0,0
7709,Dynasty,2017–,Drama,7.2,Adam attempts to sabotage Blake and Cristal's ...,Director:Kenny Leon| Stars:Elizabeth G...,274.0,42.0,,Series,...,0,0,0,0,0,0,0,0,0,0
4807,Tee Shot: Ariya Jutanugarn,2019 TV Movie,"Biography,Drama,Sport",6.6,This biopic follows pro golfer Ariya Jutanugar...,Director:Tanawat Aiemjinda| Stars:Mari...,54.0,103.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
3204,Aala Kaf Ifrit,2017,"Crime,Drama,Thriller",7.0,A college student seeks help after a brutal as...,Director:Kaouther Ben Hania| Stars:Mar...,1912.0,100.0,$0.01M,Movie,...,0,0,0,0,0,0,0,0,0,0
2464,Away,I 2016,"Crime,Drama",6.8,A story set in the north English seaside town ...,Director:David Blair| Stars:Timothy Sp...,2079.0,105.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
5665,Kingdom Come,VII,,,,Director:Rawson Marshall Thurber,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
7206,Typewriter,2019,"Horror,Mystery,Thriller",7.6,Sam makes a damning accusation. Ravi contempla...,Director:Sujoy Ghosh| Stars:Aarnaa Sha...,129.0,43.0,,Movie,...,0,0,0,0,0,0,0,0,0,0
109,Supergirl,2015–2021,"Action,Adventure,Drama",6.2,The adventures of Superman's cousin in her own...,"Stars:Melissa Benoist, Chyler ...",115373.0,43.0,,Series,...,0,0,0,0,0,0,0,0,0,0
721,Wizards,2020,"Animation,Action,Adventure",8.2,The third installment of DreamWork's 'Tales of...,"Stars:Colin O'Donoghue, David ...",2795.0,233.0,,Movie,...,0,0,0,0,0,0,0,0,0,0


## Missing Values

In [22]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross", "TYPE",
                "Year_From", "Year_To"]
missing_df = df[column_names].isna().sum().to_frame()
missing_df = missing_df.rename(columns={0:"missing"})
missing_df["percentage"] = (missing_df["missing"] / df.shape[0]) * 100
missing_df

Unnamed: 0,missing,percentage
MOVIES,0,0.0
GENRE,80,0.80008
RATING,1820,18.20182
ONE-LINE,1272,12.721272
STARS,0,0.0
VOTES,1820,18.20182
RunTime,2958,29.582958
Gross,9539,95.39954
TYPE,0,0.0
Year_From,644,6.440644


### Year

In [23]:
# replacing missing values from Year_TO with Year_FROM

df["Year_To"] = np.where(df["Year_To"].isna(), df["Year_From"], df["Year_To"])

### RunTime

In [24]:
df[column_names].describe()

Unnamed: 0,RATING,RunTime
count,8179.0,7041.0
mean,6.921176,68.688539
std,1.220232,47.258056
min,1.1,1.0
25%,6.2,36.0
50%,7.1,60.0
75%,7.8,95.0
max,9.9,853.0


In [25]:
df[df["RunTime"] >= 853]

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
1902,El tiempo entre costuras,2013–2014,"Adventure,Drama,History",8.3,Sira Quiroga is a young Spanish dressmaker eng...,"Stars:Adriana Ugarte, Mari Car...",3876,853.0,,Series,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df[df["RunTime"].isna()].sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
3217,Untitled Arnold Schwarzenegger/Skydance Project,,"Action,Adventure,Thriller",,A spy adventure with a father and daughter at ...,"Stars:Monica Barbaro, Arnold S...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
7725,Scams,2019–,"Action,Crime,Drama",,,"Stars:Shôdai Fukuyama, Asuka H...",,,,Series,...,0,0,0,0,0,0,0,0,0,0
2948,Human Resources,II,"Animation,Comedy,Romance",,A workplace comedy set in the world of the mon...,"Stars:Aidy Bryant, Brandon Kyl...",,,,Movie,...,0,0,0,0,0,0,0,0,0,0
1553,A Boy Called Christmas,2021,"Adventure,Drama,Family",,"This is the story of Father Christmas, the tal...",Director:Gil Kenan| Stars:Kristen Wiig...,,,,Movie,...,0,0,0,0,0,0,0,0,0,0
5546,Conquest,,,,,"Stars:Bruna Marquezine, Charle...",,,,Series,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df[column_names][df["TYPE"] == "Movie"].describe()

Unnamed: 0,RATING,RunTime
count,4374.0,4137.0
mean,6.489209,89.269761
std,1.200889,46.489358
min,1.8,1.0
25%,5.8,64.0
50%,6.6,90.0
75%,7.4,105.0
max,9.4,573.0


In [28]:
df[column_names][df["TYPE"] == "Series"].describe()

Unnamed: 0,RATING,RunTime
count,3805.0,2904.0
mean,7.41774,39.368802
std,1.040193,29.549575
min,1.1,1.0
25%,6.9,24.0
50%,7.5,38.0
75%,8.1,47.0
max,9.9,853.0


In [29]:
df["RunTime"][df["TYPE"] == "Movie"].isna().sum()

650

In [30]:
df.loc[(df["TYPE"] == "Movie") & (df["RunTime"].isna()), 'RunTime'] = 89

In [31]:
df["RunTime"][df["TYPE"] == "Series"].isna().sum()

2308

In [32]:
df.loc[(df["TYPE"] == "Series") & (df["RunTime"].isna()), 'RunTime'] = 39

### Elements Missing in Row

In [33]:
df_missing_in_records = df[column_names].isna().sum(axis=1).to_frame()
df_missing_in_records = df_missing_in_records.rename(columns={0:"missing"})

df_missing_in_records.describe()

Unnamed: 0,missing
count,9999.0
mean,1.582058
std,1.361621
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,7.0


In [34]:
df = df.merge(df_missing_in_records, how="inner", left_index=True, right_index=True)

In [35]:
df[df["missing"] > 4].sample(5)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,TYPE,...,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur,missing
9781,Inside Man,,"Drama,Mystery,Thriller",,,Director:Paul McGuigan| Stars:July Nam...,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
9154,Clark,,"Action,Crime,Drama",,,"Stars:Bill Skarsgård, Alicia A...",,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
6453,Untitled Tituss Burgess/Netflix Project,,,,Plot under wraps.,,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
8433,In from the Cold,,"Crime,Drama,Thriller",,,"Stars:Ivanna Sakhno, Charles B...",,39.0,,Series,...,0,0,0,0,0,0,0,0,0,6
4979,Il filo invisibile,,,,,Director:Marco Simon Puccioni| Stars:V...,,39.0,,Series,...,0,0,0,0,0,0,0,0,0,7


### Removing null values

In [36]:
df = df.dropna(subset=["GENRE", "RATING", "Year_From", "Year_To"])

### Final Checking

In [37]:
column_names = ["MOVIES", "GENRE", "RATING", "ONE-LINE",
                "STARS", "VOTES", "RunTime", "Gross", "TYPE",
                "Year_From", "Year_To"]
missing_df = df[column_names].isna().sum().to_frame()
missing_df = missing_df.rename(columns={0:"missing"})
missing_df["percentage"] = (missing_df["missing"] / df.shape[0]) * 100
missing_df

Unnamed: 0,missing,percentage
MOVIES,0,0.0
GENRE,0,0.0
RATING,0,0.0
ONE-LINE,371,4.542116
STARS,0,0.0
VOTES,0,0.0
RunTime,0,0.0
Gross,7708,94.368266
TYPE,0,0.0
Year_From,0,0.0


## Dropping Redundant Columns

In [38]:
df.drop(columns=["MOVIES", "YEAR", "GENRE", "STARS", "ONE-LINE", "Gross",
                 "Directors", "Actors", "missing"], inplace=True)

## Fixing Votes

In [39]:
df["VOTES"] = df["VOTES"].str.replace(",", "", regex=False)
df.head()

Unnamed: 0,RATING,VOTES,RunTime,TYPE,Year_From,Year_To,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,6.1,21062,121.0,Movie,2021,2021,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17870,25.0,Series,2021,2021,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,8.2,885805,44.0,Series,2010,2022,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.2,414849,23.0,Series,2013,2013,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,7.6,25858,50.0,Series,2020,2020,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
df = df.astype({"VOTES": "int64",
                "Year_From": "int16",
                "Year_To": "int16"})
df.dtypes

RATING                            float64
VOTES                               int64
RunTime                           float64
TYPE                               object
Year_From                           int16
                                   ...   
Director_Ángel Gómez Hernández      int64
Director_Ángeles Reiné              int64
Director_Åke Sandgren               int64
Director_Óscar Pedraza              int64
Director_Ömer Ugur                  int64
Length: 21441, dtype: object

## Saving

In [43]:
df.to_parquet("../data/movies-clean.parquet.gzip", compression='gzip')