# Data Exploration and Cleaning

## 1. Necessary Python libraries are loaded

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler

## 2. Dataset is loaded from CSV file

In [2]:
# Load the cleaned data from the 'data.csv' file
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,director,actors,writer,box_office
0,0,100275,movie,The Wandering Soap Opera,2017,"Comedy,Drama,Fantasy",6.6,268,"Raoul Ruiz, Valeria Sarmiento","Luis Alarcón, Patricia Rivadeneira, Francisco ...","Pía Rey, Raoul Ruiz",3624.0
1,1,315642,movie,Wazir,2016,"Action,Crime,Drama",7.1,17409,Bejoy Nambiar,"Amitabh Bachchan, Farhan Akhtar, Aditi Rao Hyd...","Vidhu Vinod Chopra, Vidhu Vinod Chopra",5633588.0
2,2,331314,movie,Bunyan and Babe,2017,"Adventure,Animation,Comedy",5.0,341,Louis Ross,"John Goodman, Jeff Foxworthy, Kelsey Grammer, ...","Michael A. Nickles, Michael A. Nickles",72060.0
3,3,365907,movie,A Walk Among the Tombstones,2014,"Action,Crime,Drama",6.5,112855,Scott Frank,"Liam Neeson, Maurice Compte, Patrick McDade, L...","Lawrence Block, Scott Frank",58834380.0
4,4,369610,movie,Jurassic World,2015,"Action,Adventure,Sci-Fi",7.0,580377,Colin Trevorrow,"Chris Pratt, Bryce Dallas Howard, Irrfan Khan,...","Rick Jaffa, Amanda Silver",1670401000.0


In [3]:
# Identify shape of dataset
print(f"Movies dataset has {data.shape[0]} rows and {data.shape[1]} columns")

Movies dataset has 9231 rows and 12 columns


## 3. Data is split into training and testing datasets

In [4]:
# Split the dataframe into a train and test split with selected features
X = data[['primaryTitle', 'genres', 'numVotes', 'director', 'actors', 'writer', 'box_office']]
y = data['averageRating']

# 80% training data, 20% testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [5]:
# Save y_train data to csv file
y_train.to_csv("ytrain.csv", index=False)

In [6]:
# Save y_test data to csv file
y_test.to_csv("ytest.csv", index=False)

## 3. Working with X_train data

### 3a) Genres column is converted to numeric data

In [7]:
# Convert genre data from categorical to numeric
X_train.reset_index(inplace= True)
lemmatizer = WordNetLemmatizer()
genres = X_train["genres"]

li = []
all_genres = genres.unique()

for i in range(len(genres)):
    # For each row, make all genres lowercase and split genres by the commas 
    temp = genres[i].lower()
    temp = temp.split(",")
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in genres column into a list
    li.append(" ".join(temp))

In [8]:
# Create genres dataframe
genre_cat = pd.DataFrame(li, columns=['genres'])
genre_cat.head()

Unnamed: 0,genres
0,drama
1,drama history
2,comedy drama romance
3,drama thriller
4,drama history war


In [9]:
# Count vectorize - value is 1 if the row contains the genre
cv = CountVectorizer()
genres_cv = cv.fit_transform(genre_cat["genres"]).toarray()
genres_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
# Display contents of count vector 
print("\nNote: First row of above count vector: ", genres_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Columns Corresponding to above count vector is :
 ['action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'fi', 'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci', 'sport', 'thriller', 'war', 'western']


In [11]:
# Create genres dataframe, now numeric
df1 = pd.DataFrame(genres_cv)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Rename columns to genre names
genre_names = cv.get_feature_names()
df1.columns = genre_names
df1.head()

Unnamed: 0,action,adventure,animation,biography,comedy,crime,drama,family,fantasy,fi,...,horror,music,musical,mystery,romance,sci,sport,thriller,war,western
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [13]:
# Append vectorised genres with rest of X_train data
X_train = X_train.join(df1)
X_train.head()

Unnamed: 0,index,primaryTitle,genres,numVotes,director,actors,writer,box_office,action,adventure,...,horror,music,musical,mystery,romance,sci,sport,thriller,war,western
0,1323,Perfect Obedience,Drama,839,Luis Urquiza,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...","Ernesto Alcocer, Luis Urquiza",2170783.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,"Drama,History",3737,"Aitor Arregi, Jon Garaño","Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...","Aitor Arregi, Jon Garaño",860750.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon","Comedy,Drama,Romance",98623,Greg Berlanti,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...","Becky Albertalli, Elizabeth Berger",66316289.0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4514,The Fixer,"Drama,Thriller",459,Adrian Sitaru,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...","Adrian Silisteanu, Claudia Silisteanu",9669.0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1491,Macbeth,"Drama,History,War",53024,Justin Kurzel,"Jack Madigan, Frank Madigan, Michael Fassbende...","Todd Louiso, Jacob Koskoff",16322067.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
# Drop original genres column
X_train = X_train.drop(['genres'], axis=1)
X_train

Unnamed: 0,index,primaryTitle,numVotes,director,actors,writer,box_office,action,adventure,animation,...,horror,music,musical,mystery,romance,sci,sport,thriller,war,western
0,1323,Perfect Obedience,839,Luis Urquiza,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...","Ernesto Alcocer, Luis Urquiza",2170783.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,"Aitor Arregi, Jon Garaño","Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...","Aitor Arregi, Jon Garaño",860750.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,Greg Berlanti,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...","Becky Albertalli, Elizabeth Berger",66316289.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4514,The Fixer,459,Adrian Sitaru,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...","Adrian Silisteanu, Claudia Silisteanu",9669.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1491,Macbeth,53024,Justin Kurzel,"Jack Madigan, Frank Madigan, Michael Fassbende...","Todd Louiso, Jacob Koskoff",16322067.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7379,5734,The Party,17121,Sally Potter,"Timothy Spall, Kristin Scott Thomas, Patricia ...","Sally Potter, Walter Donohue",5597950.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7380,5191,Colette,19642,Wash Westmoreland,"Keira Knightley, Fiona Shaw, Dominic West, Rob...","Richard Glatzer, Wash Westmoreland",14273033.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7381,5390,All About Me,3534,Caroline Link,"Luise Heyer, Diana Amft, Sönke Möhring, Joachi...","Ruth Toma, Hape Kerkeling",31920159.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7382,860,Nise: The Heart of Madness,2217,Roberto Berliner,"Glória Pires, Luciana Fregolente, Simone Mazze...","Flávia Castro, Mauricio Lissovski",779275.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3b) Director column is converted to numeric data

In [15]:
# Pick first director for each film and convert to numeric
directors = data['director']

li = []
all_directors = directors.unique()

for i in range(len(directors)):
    # For each row, make all directors lowercase
    temp = directors[i].lower()
    
    # Split directors by the commas and obtain only first director
    temp = temp.split(",", 1)[0]
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in director column into a list
    li.append("".join(temp))

In [16]:
# Create director dataframe
director_cat = pd.DataFrame(li, columns=['directors'])
director_cat.head()

Unnamed: 0,directors
0,raoul ruiz
1,bejoy nambiar
2,louis ross
3,scott frank
4,colin trevorrow


In [17]:
# Count vectorize - value is 1 if the row contains the director
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
directors_cv = cv.fit_transform(director_cat["directors"]).toarray()
directors_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", directors_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 ... 0 0 0]

Columns Corresponding to above count vector is :
 ['a. karunakaran', 'a. taner elhan', 'a.b. shawky', 'a.j. edwards', 'a.k. sajan', 'a.l. vijay', 'a.m. lukas', 'a.r. murugadoss', 'a.s. ravi kumar chowdary', 'a.t. white', 'aanand l. rai', 'aaron fisher', 'aaron harvey', 'aaron horvath', 'aaron katz', 'aaron kaufman', 'aaron keeling', 'aaron mirtes', 'aaron nee', 'aaron re', 'aaron schimberg', 'aaron sorkin', 'aaron woodley', 'aashiq abu', 'abbas alibhai burmawalla', 'abby kohn', 'abd al malik', 'abdelhamid bouchnak', 'abdellatif kechiche', 'abderrahmane sissako', 'abdullah oguz', 'abdurrahman öner', 'abe forsythe', 'abe rosenberg', 'abel ferrara', 'abel vang', 'abhay chopra', 'abhijit panse', 'abhinay deo', 'abhiraj minawala', 'abhishek chaubey', 'abhishek kapoor', 'abhishek pathak', 'abhishek shah', 'abhishek sharma', 'abhishek varman', 'abner pastoll', 'abrid shine', 'adam bin amiruddin', 'adam carolla', 'adam egypt mortimer'

In [19]:
# Create director dataframe, now numeric
df3 = pd.DataFrame(directors_cv)
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6835,6836,6837,6838,6839,6840,6841,6842,6843,6844
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Rename columns to director names
director_names = cv.get_feature_names()
df3.columns = director_names

# Add suffix '_d' to differentiate between directors, writers and actors
df3 = df3.add_suffix('_d')

In [21]:
# Append vectorised directors with rest of X_train data
X_train = X_train.join(df3)
X_train.head()

Unnamed: 0,index,primaryTitle,numVotes,director,actors,writer,box_office,action,adventure,animation,...,ömer can_d,ömer faruk sorak_d,ömer ugur_d,özcan deniz_d,özgür bakar_d,özgür sevimli_d,özgür yelence_d,özhan eren_d,ümit köreken_d,ümit ünal_d
0,1323,Perfect Obedience,839,Luis Urquiza,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...","Ernesto Alcocer, Luis Urquiza",2170783.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,"Aitor Arregi, Jon Garaño","Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...","Aitor Arregi, Jon Garaño",860750.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,Greg Berlanti,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...","Becky Albertalli, Elizabeth Berger",66316289.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4514,The Fixer,459,Adrian Sitaru,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...","Adrian Silisteanu, Claudia Silisteanu",9669.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1491,Macbeth,53024,Justin Kurzel,"Jack Madigan, Frank Madigan, Michael Fassbende...","Todd Louiso, Jacob Koskoff",16322067.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Drop original director column
X_train = X_train.drop(['director'], axis=1)
X_train.head()

Unnamed: 0,index,primaryTitle,numVotes,actors,writer,box_office,action,adventure,animation,biography,...,ömer can_d,ömer faruk sorak_d,ömer ugur_d,özcan deniz_d,özgür bakar_d,özgür sevimli_d,özgür yelence_d,özhan eren_d,ümit köreken_d,ümit ünal_d
0,1323,Perfect Obedience,839,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...","Ernesto Alcocer, Luis Urquiza",2170783.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,"Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...","Aitor Arregi, Jon Garaño",860750.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...","Becky Albertalli, Elizabeth Berger",66316289.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4514,The Fixer,459,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...","Adrian Silisteanu, Claudia Silisteanu",9669.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1491,Macbeth,53024,"Jack Madigan, Frank Madigan, Michael Fassbende...","Todd Louiso, Jacob Koskoff",16322067.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3c) Writer column is converted to numeric data

In [23]:
# Pick first writer for each film and convert to numeric
writers = data["writer"]

li = []
all_writers = writers.unique()

for i in range(len(writers)):
    # For each row, make all writers lowercase
    temp = writers[i].lower()
    
    # Split writers by the commas and obtain only first writer
    temp = temp.split(",", 1)[0]
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in writers column into a list
    li.append("".join(temp))

In [24]:
# Create writers dataframe
writer_cat = pd.DataFrame(li, columns=['writers'])
writer_cat.head()

Unnamed: 0,writers
0,pía rey
1,vidhu vinod chopra
2,michael a. nickles
3,lawrence block
4,rick jaffa


In [25]:
# Count vectorize - value is 1 if the row contains the writer
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
writers_cv = cv.fit_transform(writer_cat["writers"]).toarray()
writers_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", writers_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 ... 0 0 0]

Columns Corresponding to above count vector is :
 ['a. karunakaran', 'a.a. milne', 'a.b. shawky', 'a.c. mughil', 'a.j. edwards', 'a.k. sajan', 'a.k. waters', 'a.l. vijay', 'a.n. balakrishnan', 'a.r. murugadoss', 'a.r. vikhyath', 'a.s. ravi kumar chowdary', 'a.t. white', 'aaron brooks', 'aaron drane', 'aaron fisher', 'aaron guzikowski', 'aaron harvey', 'aaron kandell', 'aaron katz', 'aaron mirtes', 'aaron nee', 'aaron re', 'aaron schimberg', 'aaron sorkin', 'aaron woodley', 'abbas dalal', 'abby johnson', 'abby kohn', 'abd al malik', 'abdel rahim kamal', 'abdelhamid bouchnak', 'abderrahmane sissako', 'abdurrahman öner', 'abe forsythe', 'abel ferrara', 'abel vang', 'abhay chopra', 'abhayakumar', 'abhijeet shirish deshpande', 'abhijit mahesh', 'abhijit panse', 'abhilash n. chandran', 'abhilash s. nair', 'abhimanyu mukherjee', 'abhiruchi chand', 'abhishek banerjee', 'abhishek chaubey', 'abhishek kapoor', 'abhishek sharma', 'abi mor

In [27]:
# Create writers dataframe, now numeric
df3 = pd.DataFrame(writers_cv)
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7369,7370,7371,7372,7373,7374,7375,7376,7377,7378
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Rename columns to writer names
writer_names = cv.get_feature_names()
df3.columns = writer_names

# Add suffix '_w' to differentiate between directors, writers and actors
df3 = df3.add_suffix('_w')

In [29]:
# Append vectorised genres with rest of X_train data
X_train = X_train.join(df3)
X_train.head()

Unnamed: 0,index,primaryTitle,numVotes,actors,writer,box_office,action,adventure,animation,biography,...,özcan deniz_w,özge aras_w,özge efendioglu_w,özgür akbas_w,özgür bakar_w,özgür sevimli_w,özhan eren_w,øystein dolmen_w,ümit köreken_w,ümit ünal_w
0,1323,Perfect Obedience,839,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...","Ernesto Alcocer, Luis Urquiza",2170783.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,"Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...","Aitor Arregi, Jon Garaño",860750.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...","Becky Albertalli, Elizabeth Berger",66316289.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4514,The Fixer,459,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...","Adrian Silisteanu, Claudia Silisteanu",9669.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1491,Macbeth,53024,"Jack Madigan, Frank Madigan, Michael Fassbende...","Todd Louiso, Jacob Koskoff",16322067.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Drop original writer column
X_train = X_train.drop(['writer'], axis=1)
X_train.head()

Unnamed: 0,index,primaryTitle,numVotes,actors,box_office,action,adventure,animation,biography,comedy,...,özcan deniz_w,özge aras_w,özge efendioglu_w,özgür akbas_w,özgür bakar_w,özgür sevimli_w,özhan eren_w,øystein dolmen_w,ümit köreken_w,ümit ünal_w
0,1323,Perfect Obedience,839,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...",2170783.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,"Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...",860750.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...",66316289.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4514,The Fixer,459,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...",9669.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1491,Macbeth,53024,"Jack Madigan, Frank Madigan, Michael Fassbende...",16322067.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3d) Actors column is converted to numeric data

In [31]:
# Pick first 3 actors for each film 
actors = data["actors"]

li = []
all_actors = actors.unique()

for i in range(len(actors)):
    # For each row, make all actors lowercase
    temp = actors[i].lower()
    
    # Extract the first three actors from the string in each row
    temp = temp.split(",")[0:2]   
    
    # Joins the three actors by commas 
    temp = ','.join(temp)
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in genres column into a list
    li.append("".join(temp))

In [32]:
# Create actors dataframe
actors_cat = pd.DataFrame(li, columns=['actors'])
actors_cat

Unnamed: 0,actors
0,"luis alarcón, patricia rivadeneira"
1,"amitabh bachchan, farhan akhtar"
2,"john goodman, jeff foxworthy"
3,"liam neeson, maurice compte"
4,"chris pratt, bryce dallas howard"
...,...
9226,"david thorpe, john rhys-davies"
9227,"karthi, narain"
9228,"nandu anand, roshan ullas"
9229,"ahmet faik akinci, belma mamati"


In [33]:
# Count vectorize - value is 1 if the row contains the genre
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
actors_cv = cv.fit_transform(actors_cat["actors"]).toarray()
actors_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", actors_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 ... 0 0 0]

Columns Corresponding to above count vector is :
 [' 5gang', ' aadhi', ' aaron altaras', ' aaron chow', ' aaron costa ganis', ' aaron eckhart', ' aaron kissiov', ' aaron kwok', ' aaron paul', ' aarshi banerjee', ' aayush sharma', ' abbey hoes', ' abbey lee', ' abdoulaye diallo', ' abel jafri', ' abhay deol', ' abhishek bachchan', ' abhishek bharate', ' abigail breslin', ' abigail eames', ' abigail hardingham', ' abigél szõke', ' abra', ' abram rooney', ' achim barremstrein', ' ada condeescu', ' adah sharma', ' adam b. shapiro', ' adam bartley', ' adam bousdoukos', ' adam brody', ' adam brudnicki', ' adam devine', ' adam driver', ' adam green', ' adam gutniak', ' adam horovitz', ' adam nee', ' adam pally', ' adam pearson', ' adam scott', ' adam woronowicz', ' addison timlin', ' adeeja rochele anderson', ' adeel husain', ' adelaide clemens', ' adelina pestritu', ' ademola adedoyin', ' aden young', ' adil hussain', ' adil koukouh

In [35]:
# Create actors dataframe, now numeric
df4 = pd.DataFrame(actors_cv)
df4.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13102,13103,13104,13105,13106,13107,13108,13109,13110,13111
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Rename columns to actors names
actor_names = cv.get_feature_names()
df4.columns = actor_names

# Add suffix '_a' to differentiate between directors, writers and actors
df4 = df4.add_suffix('_a')

In [37]:
# Append vectorised genres with rest of X_train data
X_train = X_train.join(df4)
X_train

Unnamed: 0,index,primaryTitle,numVotes,actors,box_office,action,adventure,animation,biography,comedy,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,1323,Perfect Obedience,839,"Juan Manuel Bernal, Sebastián Aguirre, Juan Ig...",2170783.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,"Joseba Usabiaga, Eneko Sagardoy, Iñigo Aranbur...",860750.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,"Nick Robinson, Jennifer Garner, Josh Duhamel, ...",66316289.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4514,The Fixer,459,"Tudor Istodor, Mehdi Nebbou, Nicolas Wanczycki...",9669.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1491,Macbeth,53024,"Jack Madigan, Frank Madigan, Michael Fassbende...",16322067.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7379,5734,The Party,17121,"Timothy Spall, Kristin Scott Thomas, Patricia ...",5597950.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7380,5191,Colette,19642,"Keira Knightley, Fiona Shaw, Dominic West, Rob...",14273033.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7381,5390,All About Me,3534,"Luise Heyer, Diana Amft, Sönke Möhring, Joachi...",31920159.0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7382,860,Nise: The Heart of Madness,2217,"Glória Pires, Luciana Fregolente, Simone Mazze...",779275.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Drop original actors column
X_train = X_train.drop(['actors'], axis=1)
X_train

Unnamed: 0,index,primaryTitle,numVotes,box_office,action,adventure,animation,biography,comedy,crime,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,1323,Perfect Obedience,839,2170783.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5563,The Giant,3737,860750.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4840,"Love, Simon",98623,66316289.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4514,The Fixer,459,9669.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1491,Macbeth,53024,16322067.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7379,5734,The Party,17121,5597950.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7380,5191,Colette,19642,14273033.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7381,5390,All About Me,3534,31920159.0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7382,860,Nise: The Heart of Madness,2217,779275.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Drop index column so that the new index starts from 0
X_train = X_train.drop(['index'], axis=1)
X_train

Unnamed: 0,primaryTitle,numVotes,box_office,action,adventure,animation,biography,comedy,crime,drama,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,Perfect Obedience,839,2170783.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,The Giant,3737,860750.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"Love, Simon",98623,66316289.0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,The Fixer,459,9669.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Macbeth,53024,16322067.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7379,The Party,17121,5597950.0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
7380,Colette,19642,14273033.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7381,All About Me,3534,31920159.0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
7382,Nise: The Heart of Madness,2217,779275.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Drop the categorical primaryTitle column
X_train = X_train.drop(['primaryTitle'], axis=1)
X_train

Unnamed: 0,numVotes,box_office,action,adventure,animation,biography,comedy,crime,drama,family,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,839,2170783.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3737,860750.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,98623,66316289.0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,459,9669.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,53024,16322067.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7379,17121,5597950.0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7380,19642,14273033.0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7381,3534,31920159.0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7382,2217,779275.0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### 3e) All columns are scaled

In [41]:
# Scale columns so they are all normalised correctly for the algorithms
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train_scaled

Unnamed: 0,numVotes,box_office,action,adventure,animation,biography,comedy,crime,drama,family,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,0.000707,0.000776,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.003478,0.000308,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.094213,0.023703,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000343,0.000003,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.050609,0.005834,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7379,0.016276,0.002001,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7380,0.018687,0.005102,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7381,0.003284,0.011409,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7382,0.002024,0.000279,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Save X_train data to csv file
X_train_scaled.to_csv("xtrain.csv", index=False)

## 4. Working with X_test data

### 4a) Genres column is converted to numeric data

In [43]:
# Convert genre data from categorical to numeric
X_test.reset_index(inplace= True)
lemmatizer = WordNetLemmatizer()
genres = X_test["genres"]

li = []
all_genres = genres.unique()

for i in range(len(genres)):
    # For each row, make all genres lowercase and split genres by the commas
    temp = genres[i].lower()
    temp = temp.split(",")
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in genres column into a list
    li.append(" ".join(temp))

In [44]:
# Create genres dataframe
genre_cat = pd.DataFrame(li, columns=['genres'])
genre_cat

Unnamed: 0,genres
0,action adventure drama
1,action thriller
2,drama romance
3,drama sport
4,adventure comedy
...,...
1842,mystery thriller
1843,comedy
1844,drama
1845,action comedy fantasy


In [45]:
# Count vectorize - value is 1 if the row contains the genre
cv = CountVectorizer()
genres_cv = cv.fit_transform(genre_cat["genres"]).toarray()
genres_cv

array([[1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [46]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", genres_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Columns Corresponding to above count vector is :
 ['action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 'family', 'fantasy', 'fi', 'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci', 'sport', 'thriller', 'war', 'western']


In [47]:
# Create genres dataframe, now numeric
df5 = pd.DataFrame(genres_cv)
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1843,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Rename columns to genre names
genre_names = cv.get_feature_names()
df5.columns = genre_names
df5

Unnamed: 0,action,adventure,animation,biography,comedy,crime,drama,family,fantasy,fi,...,horror,music,musical,mystery,romance,sci,sport,thriller,war,western
0,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1843,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Append vectorised genres with rest of X_test data
X_test = X_test.join(df5)
X_test

Unnamed: 0,index,primaryTitle,genres,numVotes,director,actors,writer,box_office,action,adventure,...,horror,music,musical,mystery,romance,sci,sport,thriller,war,western
0,1183,Kenau,"Action,Adventure,Drama",1459,Maarten Treurniet,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...","Marnie Blok, Karin van Holst Pellekaan",581801.0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,"Action,Thriller",163050,Scott Waugh,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...","George Gatins, George Gatins",203277636.0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,9220,Shubharathri,"Drama,Romance",210,Vyasan K.P.,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...","Vyasan K.P., Vyasan K.P.",105932.0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,908,Draft Day,"Drama,Sport",54806,Ivan Reitman,"Chris Berman, Dave Donaldson, Patrick St. Espr...","Scott Rothman, Rajiv Joseph",29824199.0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,8495,The King's Musketeers,"Adventure,Comedy",894,Giovanni Veronesi,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...","Nicola Baldoni, Giovanni Veronesi",5727134.0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,"Mystery,Thriller",10356,Abhay Chopra,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...","Abhay Chopra, Shreyas Jain",2520487.0,0,0,...,0,0,0,1,0,0,0,1,0,0
1843,7293,Natale da chef,Comedy,245,Neri Parenti,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...","Alessandro Bencivenni, Gianluca Bomprezzi",3268233.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,Drama,317,Ernestas Jankauskas,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",Birute Kapustinskaite,131500.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,"Action,Comedy,Fantasy",12357,Kevin Smith,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",Kevin Smith,36585.0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Drop original genres column
X_test = X_test.drop(['genres'], axis=1)
X_test

Unnamed: 0,index,primaryTitle,numVotes,director,actors,writer,box_office,action,adventure,animation,...,horror,music,musical,mystery,romance,sci,sport,thriller,war,western
0,1183,Kenau,1459,Maarten Treurniet,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...","Marnie Blok, Karin van Holst Pellekaan",581801.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,Scott Waugh,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...","George Gatins, George Gatins",203277636.0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,9220,Shubharathri,210,Vyasan K.P.,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...","Vyasan K.P., Vyasan K.P.",105932.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,908,Draft Day,54806,Ivan Reitman,"Chris Berman, Dave Donaldson, Patrick St. Espr...","Scott Rothman, Rajiv Joseph",29824199.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,8495,The King's Musketeers,894,Giovanni Veronesi,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...","Nicola Baldoni, Giovanni Veronesi",5727134.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,Abhay Chopra,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...","Abhay Chopra, Shreyas Jain",2520487.0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1843,7293,Natale da chef,245,Neri Parenti,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...","Alessandro Bencivenni, Gianluca Bomprezzi",3268233.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,Ernestas Jankauskas,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",Birute Kapustinskaite,131500.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,Kevin Smith,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",Kevin Smith,36585.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4b) Director column is converted to numeric data

In [51]:
# Pick first director for each film
directors = data["director"]

li = []
all_directors = directors.unique()

for i in range(len(directors)):
    # For each row, make all directors lowercase     
    temp = directors[i].lower()
    
    # Split directors by the commas and obtain only first director
    temp = temp.split(",", 1)[0]
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in director column into a list
    li.append("".join(temp))

In [52]:
# Create director dataframe
director_cat = pd.DataFrame(li, columns=['directors'])
director_cat

Unnamed: 0,directors
0,raoul ruiz
1,bejoy nambiar
2,louis ross
3,scott frank
4,colin trevorrow
...,...
9226,robert fernandez
9227,lokesh kanagaraj
9228,zam
9229,ahmet faik akinci


In [53]:
# Count vectorize - value is 1 if the row contains the director
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
directors_cv = cv.fit_transform(director_cat["directors"]).toarray()
directors_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [54]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", directors_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 ... 0 0 0]

Columns Corresponding to above count vector is :
 ['a. karunakaran', 'a. taner elhan', 'a.b. shawky', 'a.j. edwards', 'a.k. sajan', 'a.l. vijay', 'a.m. lukas', 'a.r. murugadoss', 'a.s. ravi kumar chowdary', 'a.t. white', 'aanand l. rai', 'aaron fisher', 'aaron harvey', 'aaron horvath', 'aaron katz', 'aaron kaufman', 'aaron keeling', 'aaron mirtes', 'aaron nee', 'aaron re', 'aaron schimberg', 'aaron sorkin', 'aaron woodley', 'aashiq abu', 'abbas alibhai burmawalla', 'abby kohn', 'abd al malik', 'abdelhamid bouchnak', 'abdellatif kechiche', 'abderrahmane sissako', 'abdullah oguz', 'abdurrahman öner', 'abe forsythe', 'abe rosenberg', 'abel ferrara', 'abel vang', 'abhay chopra', 'abhijit panse', 'abhinay deo', 'abhiraj minawala', 'abhishek chaubey', 'abhishek kapoor', 'abhishek pathak', 'abhishek shah', 'abhishek sharma', 'abhishek varman', 'abner pastoll', 'abrid shine', 'adam bin amiruddin', 'adam carolla', 'adam egypt mortimer'

In [55]:
# Create director dataframe, now numeric
df6 = pd.DataFrame(directors_cv)
df6

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6835,6836,6837,6838,6839,6840,6841,6842,6843,6844
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Rename columns to director names
director_names = cv.get_feature_names()
df6.columns = director_names

# Add suffix '_d' to differentiate between directors, writers and actors
df6 = df6.add_suffix('_d')

In [58]:
# Append vectorised genres with rest of X_test data
X_test = X_test.join(df6)
X_test

Unnamed: 0,index,primaryTitle,numVotes,director,actors,writer,box_office,action,adventure,animation,...,ömer can_d,ömer faruk sorak_d,ömer ugur_d,özcan deniz_d,özgür bakar_d,özgür sevimli_d,özgür yelence_d,özhan eren_d,ümit köreken_d,ümit ünal_d
0,1183,Kenau,1459,Maarten Treurniet,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...","Marnie Blok, Karin van Holst Pellekaan",581801.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,Scott Waugh,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...","George Gatins, George Gatins",203277636.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9220,Shubharathri,210,Vyasan K.P.,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...","Vyasan K.P., Vyasan K.P.",105932.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,Draft Day,54806,Ivan Reitman,"Chris Berman, Dave Donaldson, Patrick St. Espr...","Scott Rothman, Rajiv Joseph",29824199.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8495,The King's Musketeers,894,Giovanni Veronesi,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...","Nicola Baldoni, Giovanni Veronesi",5727134.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,Abhay Chopra,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...","Abhay Chopra, Shreyas Jain",2520487.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,7293,Natale da chef,245,Neri Parenti,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...","Alessandro Bencivenni, Gianluca Bomprezzi",3268233.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,Ernestas Jankauskas,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",Birute Kapustinskaite,131500.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,Kevin Smith,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",Kevin Smith,36585.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Drop original director column
X_test = X_test.drop(['director'], axis=1)
X_test

Unnamed: 0,index,primaryTitle,numVotes,actors,writer,box_office,action,adventure,animation,biography,...,ömer can_d,ömer faruk sorak_d,ömer ugur_d,özcan deniz_d,özgür bakar_d,özgür sevimli_d,özgür yelence_d,özhan eren_d,ümit köreken_d,ümit ünal_d
0,1183,Kenau,1459,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...","Marnie Blok, Karin van Holst Pellekaan",581801.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...","George Gatins, George Gatins",203277636.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9220,Shubharathri,210,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...","Vyasan K.P., Vyasan K.P.",105932.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,Draft Day,54806,"Chris Berman, Dave Donaldson, Patrick St. Espr...","Scott Rothman, Rajiv Joseph",29824199.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8495,The King's Musketeers,894,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...","Nicola Baldoni, Giovanni Veronesi",5727134.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...","Abhay Chopra, Shreyas Jain",2520487.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,7293,Natale da chef,245,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...","Alessandro Bencivenni, Gianluca Bomprezzi",3268233.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",Birute Kapustinskaite,131500.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",Kevin Smith,36585.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4c) Writer column is converted to numeric data

In [60]:
# Pick first writer for each film
writers = data["writer"]

li = []
all_writers = writers.unique()

for i in range(len(writers)):
    # For each row, make all writers lowercase 
    temp = writers[i].lower()
    
    # Split writers by the commas and obtain only first writer
    temp = temp.split(",", 1)[0]
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in writers column into a list
    li.append("".join(temp))

In [61]:
# Create writer dataframe
writer_cat = pd.DataFrame(li, columns=['writers'])
writer_cat

Unnamed: 0,writers
0,pía rey
1,vidhu vinod chopra
2,michael a. nickles
3,lawrence block
4,rick jaffa
...,...
9226,john bunyan
9227,lokesh kanagaraj
9228,rajesh k narayan
9229,ahmet faik akinci


In [62]:
# Count vectorize - value is 1 if the row contains the writer
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
writers_cv = cv.fit_transform(writer_cat["writers"]).toarray()
writers_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [63]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", writers_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 ... 0 0 0]

Columns Corresponding to above count vector is :
 ['a. karunakaran', 'a.a. milne', 'a.b. shawky', 'a.c. mughil', 'a.j. edwards', 'a.k. sajan', 'a.k. waters', 'a.l. vijay', 'a.n. balakrishnan', 'a.r. murugadoss', 'a.r. vikhyath', 'a.s. ravi kumar chowdary', 'a.t. white', 'aaron brooks', 'aaron drane', 'aaron fisher', 'aaron guzikowski', 'aaron harvey', 'aaron kandell', 'aaron katz', 'aaron mirtes', 'aaron nee', 'aaron re', 'aaron schimberg', 'aaron sorkin', 'aaron woodley', 'abbas dalal', 'abby johnson', 'abby kohn', 'abd al malik', 'abdel rahim kamal', 'abdelhamid bouchnak', 'abderrahmane sissako', 'abdurrahman öner', 'abe forsythe', 'abel ferrara', 'abel vang', 'abhay chopra', 'abhayakumar', 'abhijeet shirish deshpande', 'abhijit mahesh', 'abhijit panse', 'abhilash n. chandran', 'abhilash s. nair', 'abhimanyu mukherjee', 'abhiruchi chand', 'abhishek banerjee', 'abhishek chaubey', 'abhishek kapoor', 'abhishek sharma', 'abi mor

In [64]:
# Create writer dataframe, now numeric
df7 = pd.DataFrame(writers_cv)
df7

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7369,7370,7371,7372,7373,7374,7375,7376,7377,7378
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Rename columns to writer names
writer_names = cv.get_feature_names()
df7.columns = writer_names

# Add suffix '_w' to differentiate between directors, writers and actors
df7 = df7.add_suffix('_w')

In [66]:
# Append vectorised writers with rest of X_test data
X_test = X_test.join(df7)
X_test

Unnamed: 0,index,primaryTitle,numVotes,actors,writer,box_office,action,adventure,animation,biography,...,özcan deniz_w,özge aras_w,özge efendioglu_w,özgür akbas_w,özgür bakar_w,özgür sevimli_w,özhan eren_w,øystein dolmen_w,ümit köreken_w,ümit ünal_w
0,1183,Kenau,1459,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...","Marnie Blok, Karin van Holst Pellekaan",581801.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...","George Gatins, George Gatins",203277636.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9220,Shubharathri,210,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...","Vyasan K.P., Vyasan K.P.",105932.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,Draft Day,54806,"Chris Berman, Dave Donaldson, Patrick St. Espr...","Scott Rothman, Rajiv Joseph",29824199.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8495,The King's Musketeers,894,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...","Nicola Baldoni, Giovanni Veronesi",5727134.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...","Abhay Chopra, Shreyas Jain",2520487.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,7293,Natale da chef,245,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...","Alessandro Bencivenni, Gianluca Bomprezzi",3268233.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",Birute Kapustinskaite,131500.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",Kevin Smith,36585.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Drop original writer column
X_test = X_test.drop(['writer'], axis=1)
X_test

Unnamed: 0,index,primaryTitle,numVotes,actors,box_office,action,adventure,animation,biography,comedy,...,özcan deniz_w,özge aras_w,özge efendioglu_w,özgür akbas_w,özgür bakar_w,özgür sevimli_w,özhan eren_w,øystein dolmen_w,ümit köreken_w,ümit ünal_w
0,1183,Kenau,1459,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...",581801.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...",203277636.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9220,Shubharathri,210,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...",105932.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,Draft Day,54806,"Chris Berman, Dave Donaldson, Patrick St. Espr...",29824199.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8495,The King's Musketeers,894,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...",5727134.0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...",2520487.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,7293,Natale da chef,245,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...",3268233.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",131500.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",36585.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### 4d) Actors column is converted to numeric data

In [68]:
# Pick first 3 actors for each film
actors = data["actors"]

li = []
all_actors = actors.unique()

for i in range(len(actors)):
    # For each row, make all actors lowercase 
    temp = actors[i].lower()
    
    # Extract the first three actors from the string in each row
    temp = temp.split(",")[0:2] 
    
    # Join the three actors by commas 
    temp = ','.join(temp)
    temp = [lemmatizer.lemmatize(word) for word in temp]
    
    # Merge all entries in actors column into a list
    li.append("".join(temp))

In [69]:
# Create actors dataframe
actors_cat = pd.DataFrame(li, columns=['actors'])
actors_cat

Unnamed: 0,actors
0,"luis alarcón, patricia rivadeneira"
1,"amitabh bachchan, farhan akhtar"
2,"john goodman, jeff foxworthy"
3,"liam neeson, maurice compte"
4,"chris pratt, bryce dallas howard"
...,...
9226,"david thorpe, john rhys-davies"
9227,"karthi, narain"
9228,"nandu anand, roshan ullas"
9229,"ahmet faik akinci, belma mamati"


In [70]:
# Count vectorize - value is 1 if the row contains the actor
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
actors_cv = cv.fit_transform(actors_cat["actors"]).toarray()
actors_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [71]:
# Display contents of count vector
print("\nNote: First row of above count vector: ", actors_cv[0])
print("\nColumns Corresponding to above count vector is :\n",cv.get_feature_names())


Note: First row of above count vector:  [0 0 0 ... 0 0 0]

Columns Corresponding to above count vector is :
 [' 5gang', ' aadhi', ' aaron altaras', ' aaron chow', ' aaron costa ganis', ' aaron eckhart', ' aaron kissiov', ' aaron kwok', ' aaron paul', ' aarshi banerjee', ' aayush sharma', ' abbey hoes', ' abbey lee', ' abdoulaye diallo', ' abel jafri', ' abhay deol', ' abhishek bachchan', ' abhishek bharate', ' abigail breslin', ' abigail eames', ' abigail hardingham', ' abigél szõke', ' abra', ' abram rooney', ' achim barremstrein', ' ada condeescu', ' adah sharma', ' adam b. shapiro', ' adam bartley', ' adam bousdoukos', ' adam brody', ' adam brudnicki', ' adam devine', ' adam driver', ' adam green', ' adam gutniak', ' adam horovitz', ' adam nee', ' adam pally', ' adam pearson', ' adam scott', ' adam woronowicz', ' addison timlin', ' adeeja rochele anderson', ' adeel husain', ' adelaide clemens', ' adelina pestritu', ' ademola adedoyin', ' aden young', ' adil hussain', ' adil koukouh

In [72]:
# Create actors dataframe, now numeric
df8 = pd.DataFrame(actors_cv)
df8

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13102,13103,13104,13105,13106,13107,13108,13109,13110,13111
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
# Rename columns to actor names
actor_names = cv.get_feature_names()
df8.columns = actor_names

# Add suffix '_a' to differentiate between directors, writers and actors
df8 = df8.add_suffix('_a')

In [74]:
# Append vectorised genres with rest of X_test data
X_test = X_test.join(df8)
X_test

Unnamed: 0,index,primaryTitle,numVotes,actors,box_office,action,adventure,animation,biography,comedy,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,1183,Kenau,1459,"Barry Atsma, Sallie Harmsen, Lisa Smit, Monic ...",581801.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,"Aaron Paul, Dominic Cooper, Imogen Poots, Kid ...",203277636.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9220,Shubharathri,210,"Dileep, Anu Sithara, Siddique, Suraj Venjaramo...",105932.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,Draft Day,54806,"Chris Berman, Dave Donaldson, Patrick St. Espr...",29824199.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8495,The King's Musketeers,894,"Pierfrancesco Favino, Valerio Mastandrea, Rocc...",5727134.0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,"Sidharth Malhotra, Sonakshi Sinha, Akshaye Kha...",2520487.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,7293,Natale da chef,245,"Massimo Boldi, Dario Bandiera, Rocío Muñoz, Pa...",3268233.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,"Markas Eimontas, Jelena Kirejeva, Dalia Michel...",131500.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,"Lily-Rose Depp, Harley Quinn Smith, Adam Brody...",36585.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# Drop original actors column
X_test = X_test.drop(['actors'], axis=1)
X_test

Unnamed: 0,index,primaryTitle,numVotes,box_office,action,adventure,animation,biography,comedy,crime,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,1183,Kenau,1459,581801.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1038,Need for Speed,163050,203277636.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9220,Shubharathri,210,105932.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,Draft Day,54806,29824199.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8495,The King's Musketeers,894,5727134.0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,6853,Ittefaq,10356,2520487.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,7293,Natale da chef,245,3268233.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1844,7493,Sasha Was Here,317,131500.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1845,2919,Yoga Hosers,12357,36585.0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
# Drop index column so that the new index starts from 0
X_test = X_test.drop(['index'], axis=1)
X_test

Unnamed: 0,primaryTitle,numVotes,box_office,action,adventure,animation,biography,comedy,crime,drama,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,Kenau,1459,581801.0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Need for Speed,163050,203277636.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Shubharathri,210,105932.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Draft Day,54806,29824199.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,The King's Musketeers,894,5727134.0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,Ittefaq,10356,2520487.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,Natale da chef,245,3268233.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,Sasha Was Here,317,131500.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1845,Yoga Hosers,12357,36585.0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# Drop the categorical primaryTitle column
X_test = X_test.drop(['primaryTitle'], axis=1)
X_test

Unnamed: 0,numVotes,box_office,action,adventure,animation,biography,comedy,crime,drama,family,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,1459,581801.0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,163050,203277636.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,210,105932.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,54806,29824199.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,894,5727134.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,10356,2520487.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1843,245,3268233.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,317,131500.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1845,12357,36585.0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4e) All columns are scaled

In [78]:
# Scale columns so they are all normalised correctly for the algorithms
scaler = MinMaxScaler()
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
X_test_scaled

Unnamed: 0,numVotes,box_office,action,adventure,animation,biography,comedy,crime,drama,family,...,ólafur darri ólafsson_a,óscar jaenada_a,ömer acar_a,önder k. açikbas_a,öner erkan_a,öykü celik_a,özcan deniz_a,özge gürel_a,ümit bülent dinçer_a,ümit kantarcilar_a
0,0.000895,0.000401,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.107323,0.140189,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000072,0.000073,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.036031,0.020568,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000523,0.003950,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842,0.006755,0.001738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1843,0.000096,0.002254,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1844,0.000143,0.000091,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1845,0.008073,0.000025,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Save X_test data to csv file
X_test.to_csv("xtest.csv", index=False)