In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from joblib import dump

## Import Dataset

### Features
For each title (unique `tconst`,) we are interested in the following features:
<!-- - `region` categorical, one per title. -->
<!-- - `language` categorical, one per title. -->
- `startYear` discrete integer.
- `runtimeMinutes` discrete integer.
- `genres` categorical, multiple per title.
- The average rating of the director/directors.
- The average rating of the writer/writers.
- The average rating of the top three actors.

We want to predict:
- `averageRating` continuous between $0$ and $10$.


In [4]:
!wget -O /tmp/titles-with-ratings.pkl https://github.com/jacobthebanana/McGill-AI-Stereotyper/releases/download/1.0/titles-with-ratings.pkl
dataset = pd.read_pickle("/tmp/titles-with-ratings.pkl")
display(dataset)
dataset = dataset[["tconst", "startYear", "runtimeMinutes", "genres", "directorAverage", "writerAverage", "principalAverage", "averageRating", "numVotes"]]

--2020-03-25 19:18:20--  https://github.com/jacobthebanana/McGill-AI-Stereotyper/releases/download/1.0/titles-with-ratings.pkl
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/236877689/8ac6c700-6eaa-11ea-933c-8352973b278f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200325%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200325T191820Z&X-Amz-Expires=300&X-Amz-Signature=dce5533f475c888d7883cd4f8a343c1a9ee29034887778bf759398a2b4f6d50e&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dtitles-with-ratings.pkl&response-content-type=application%2Foctet-stream [following]
--2020-03-25 19:18:20--  https://github-production-release-asset-2e65be.s3.amazonaws.com/236877689/8ac6c700-6eaa-11ea-933c-8352973b278f?X-Amz-Algorithm=AWS4-HMAC-SH

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes,principals,directorAverage,writerAverage,principalAverage
0,tt0000001,1894.0,1.0,"Documentary,Short",nm0005690,\N,5.6,1590,"nm1588970,nm0005690,nm0374658,",5.606510,5.606510,5.267911
1,tt0000002,1892.0,5.0,"Animation,Short",nm0721526,\N,6.1,192,"nm0721526,nm1335271,",6.419985,6.419985,6.421509
2,tt0000003,1892.0,4.0,"Animation,Comedy,Romance",nm0721526,\N,6.5,1257,"nm0721526,nm5442194,nm1335271,nm5442200,",6.419985,6.419985,6.460755
3,tt0000004,1892.0,12.0,"Animation,Short",nm0721526,\N,6.2,119,"nm0721526,nm1335271,",6.419985,6.419985,6.421509
4,tt0000005,1893.0,1.0,"Comedy,Short",nm0005690,\N,6.1,2018,"nm0443482,nm0443482,nm0653042,nm0653042,nm0005...",5.606510,5.606510,5.365407
...,...,...,...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"Drama,Short",nm3219235,nm3219235,7.2,15,"nm9424215,nm9873304,nm7395788,nm7571612,nm3219...",6.622179,6.622179,6.427478
1023082,tt9916576,2019.0,85.0,Reality-TV,\N,\N,6.4,10,"nm3939894,nm3282613,nm1700240,nm2277838,nm2353...",0.000000,0.000000,0.000000
1023083,tt9916578,2019.0,,"Adventure,Biography,Comedy",nm0373673,"nm1485603,nm1485604,nm1866876,nm0909144",8.5,16,"nm1876061,nm0104787,nm0006522,nm0358200,nm0372...",6.834996,6.179873,6.932343
1023084,tt9916720,2019.0,10.0,"Comedy,Horror,Mystery",nm10538600,\N,5.5,47,"nm10678584,nm10538601,nm10538600,nm10538603,nm...",3.971053,3.971053,5.653593


## Display Dataset

In [5]:
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directorAverage,writerAverage,principalAverage,averageRating,numVotes
0,tt0000001,1894.0,1.0,"Documentary,Short",5.606510,5.606510,5.267911,5.6,1590
1,tt0000002,1892.0,5.0,"Animation,Short",6.419985,6.419985,6.421509,6.1,192
2,tt0000003,1892.0,4.0,"Animation,Comedy,Romance",6.419985,6.419985,6.460755,6.5,1257
3,tt0000004,1892.0,12.0,"Animation,Short",6.419985,6.419985,6.421509,6.2,119
4,tt0000005,1893.0,1.0,"Comedy,Short",5.606510,5.606510,5.365407,6.1,2018
...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"Drama,Short",6.622179,6.622179,6.427478,7.2,15
1023082,tt9916576,2019.0,85.0,Reality-TV,0.000000,0.000000,0.000000,6.4,10
1023083,tt9916578,2019.0,,"Adventure,Biography,Comedy",6.834996,6.179873,6.932343,8.5,16
1023084,tt9916720,2019.0,10.0,"Comedy,Horror,Mystery",3.971053,3.971053,5.653593,5.5,47


### Webform Generator

In [6]:
for name in dataset.columns.values:
  if name[:2] != "is":
    print("<label for=\"{}\">{} (enter a number)</label><br>".format(name, name))
    print("<input type=\"text\" id=\"{}\" name=\"{}\"><br>".format(name, name))
  else:
    print("<input type=\"checkbox\" id=\"{}\" name=\"{}\">".format(name, name))
    print("<label for=\"{}\">{}</label><br>".format(name, name))

<label for="tconst">tconst (enter a number)</label><br>
<input type="text" id="tconst" name="tconst"><br>
<label for="startYear">startYear (enter a number)</label><br>
<input type="text" id="startYear" name="startYear"><br>
<label for="runtimeMinutes">runtimeMinutes (enter a number)</label><br>
<input type="text" id="runtimeMinutes" name="runtimeMinutes"><br>
<label for="genres">genres (enter a number)</label><br>
<input type="text" id="genres" name="genres"><br>
<label for="directorAverage">directorAverage (enter a number)</label><br>
<input type="text" id="directorAverage" name="directorAverage"><br>
<label for="writerAverage">writerAverage (enter a number)</label><br>
<input type="text" id="writerAverage" name="writerAverage"><br>
<label for="principalAverage">principalAverage (enter a number)</label><br>
<input type="text" id="principalAverage" name="principalAverage"><br>
<label for="averageRating">averageRating (enter a number)</label><br>
<input type="text" id="averageRating" na

### Webapp Defaults


In [23]:
print("{", end="")
i = 1
for name in dataset.columns.values:
  if i%5 == 0:
      print()
  i += 1
  print(name)
  if name[:2] != "is":
    print("\"{}\": 0".format(name), end=", ")  # Default to zero for categorical.
  elif name in ["tconst", "genres", "averageRating", "numVotes", "tconst"]:
    continue  # We aren't supposed to look at these information.
  else:
    print("\"{}\": -1".format(name), end=", ") # All other fields are required

print("}")

{tconst
"tconst": 0, startYear
"startYear": 0, runtimeMinutes
"runtimeMinutes": 0, directorAverage
"directorAverage": 0, 
writerAverage
"writerAverage": 0, principalAverage
"principalAverage": 0, averageRating
"averageRating": 0, numVotes
"numVotes": 0, isAction
"isAction": -1, 
isAdult
"isAdult": -1, isAdventure
"isAdventure": -1, isAnimation
"isAnimation": -1, isBiography
"isBiography": -1, isComedy
"isComedy": -1, 
isCrime
"isCrime": -1, isDocumentary
"isDocumentary": -1, isDrama
"isDrama": -1, isFamily
"isFamily": -1, isFantasy
"isFantasy": -1, 
isFilm-Noir
"isFilm-Noir": -1, isGame-Show
"isGame-Show": -1, isHistory
"isHistory": -1, isHorror
"isHorror": -1, isMusic
"isMusic": -1, 
isMusical
"isMusical": -1, isMystery
"isMystery": -1, isNews
"isNews": -1, isReality-TV
"isReality-TV": -1, isRomance
"isRomance": -1, 
isSci-Fi
"isSci-Fi": -1, isShort
"isShort": -1, isSport
"isSport": -1, isTalk-Show
"isTalk-Show": -1, isThriller
"isThriller": -1, 
isWar
"isWar": -1, isWestern
"isWester

## More pre-processing

### Multi-Label-Binarizer Encoding
We want to encode "genres" with binary vectors. Since these features have inequal lengths, one-hot might not be the best option. This [StackOverflow Answer](https://stackoverflow.com/questions/42391165/how-to-one-hot-encode-variant-length-features) suggested the MLB encoder, which apparently is designed for encoding features with variant lengths.

Note that these are currently comma-separated strings. We shall split them before feeding them into `sklearn.preprocessing`.




In [0]:
def split_sharky_csstr(csstr):
  if type(csstr) == type(str()):
    return csstr.split(",")
  else:
    return list()

dataset["genres"] = dataset["genres"].apply(split_sharky_csstr)

In [9]:
mlb = MultiLabelBinarizer()
mlb.fit(list(dataset["genres"]))
mlb.classes_

dataset = dataset.join(pd.DataFrame(mlb.transform(dataset["genres"]), columns=[("is" + category) for category in mlb.classes_]))
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directorAverage,writerAverage,principalAverage,averageRating,numVotes,isAction,isAdult,isAdventure,isAnimation,isBiography,isComedy,isCrime,isDocumentary,isDrama,isFamily,isFantasy,isFilm-Noir,isGame-Show,isHistory,isHorror,isMusic,isMusical,isMystery,isNews,isReality-TV,isRomance,isSci-Fi,isShort,isSport,isTalk-Show,isThriller,isWar,isWestern,is\N
0,tt0000001,1894.0,1.0,"[Documentary, Short]",5.606510,5.606510,5.267911,5.6,1590,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,tt0000002,1892.0,5.0,"[Animation, Short]",6.419985,6.419985,6.421509,6.1,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,tt0000003,1892.0,4.0,"[Animation, Comedy, Romance]",6.419985,6.419985,6.460755,6.5,1257,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,tt0000004,1892.0,12.0,"[Animation, Short]",6.419985,6.419985,6.421509,6.2,119,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,tt0000005,1893.0,1.0,"[Comedy, Short]",5.606510,5.606510,5.365407,6.1,2018,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"[Drama, Short]",6.622179,6.622179,6.427478,7.2,15,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1023082,tt9916576,2019.0,85.0,[Reality-TV],0.000000,0.000000,0.000000,6.4,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1023083,tt9916578,2019.0,,"[Adventure, Biography, Comedy]",6.834996,6.179873,6.932343,8.5,16,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023084,tt9916720,2019.0,10.0,"[Comedy, Horror, Mystery]",3.971053,3.971053,5.653593,5.5,47,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


### Screen dataset
We discard the following entries:
- Entries where `runtimeMinutes` is not available.
- Entries where `directorAverage` is zero.
- Entries without a `genre`.


In [0]:
dataset["directorAverage"] = dataset["directorAverage"].replace(0.0, np.nan)
dataset["averageRating"] = dataset["averageRating"].replace(0.0, np.nan)

dataset["is\\N"] = dataset["is\\N"].replace(1, np.nan)
dataset = dataset.dropna(how="any")

dataset = dataset.drop(columns=["is\\N", "genres"])

### Categorize Ratings

In [11]:
dataset["averageRating"].describe()

count    662366.000000
mean          6.848763
std           1.348421
min           1.000000
25%           6.100000
50%           7.000000
75%           7.800000
max          10.000000
Name: averageRating, dtype: float64

In [0]:
X = dataset.drop(columns=["averageRating", "numVotes", "tconst"], inplace=False)
# X = dataset[["directorAverage", "writerAverage", "principalAverage"]]
# y_avg_rating = [[element] for element in pd.cut(dataset["averageRating"], [0.5 * i for i in range(0, 21)])]
y_avg_rating = [[element] for element in pd.cut(dataset["averageRating"], [0, 7.000000, 10])]

enc_y_avg = OneHotEncoder(handle_unknown="error")
enc_y_avg.fit(y_avg_rating)

y_avg_rating = enc_y_avg.transform(y_avg_rating).toarray()

We want to know the relative size of the two categories. For example, consider the undesirable case where one category has 90% of all samples, while the other has only 10%. In that case, a 90% accuracy won't be that impressive- the model could simply predict everything to be in that 90% category.

Credits to the McGill AI Society for the following handy code snippet.

In [13]:
for i in range(2):
  print('class {}:'.format(i), 
        100 * len([0 for value in y_avg_rating if value[i] == 1])/len(y_avg_rating))

class 0: 50.43978706636512
class 1: 49.56021293363488


### Categorize Number of Votes

The procedure is quite similar as that for the average ratings.

In [14]:
dataset["numVotes"].describe()

count    6.623660e+05
mean     1.454649e+03
std      1.955575e+04
min      5.000000e+00
25%      1.100000e+01
50%      3.300000e+01
75%      1.500000e+02
max      2.198923e+06
Name: numVotes, dtype: float64

In [0]:
X = dataset.drop(columns=["averageRating", "numVotes", "tconst"], inplace=False)
y_num_votes = [[element] for element in pd.cut(dataset["numVotes"], [0, dataset["numVotes"].mean(), dataset["numVotes"].max()])]

enc_y_num = OneHotEncoder(handle_unknown="error")
enc_y_num.fit(y_num_votes)

y_num_votes = enc_y_num.transform(y_num_votes).toarray()

In [16]:
for i in range(2):
  print('class {}:'.format(i), 
        100 * len([0 for value in y_num_votes if value[i] == 1])/len(y_avg_rating))

class 0: 93.86381547362032
class 1: 6.136184526379675


## Display Dataset

In [17]:
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,directorAverage,writerAverage,principalAverage,averageRating,numVotes,isAction,isAdult,isAdventure,isAnimation,isBiography,isComedy,isCrime,isDocumentary,isDrama,isFamily,isFantasy,isFilm-Noir,isGame-Show,isHistory,isHorror,isMusic,isMusical,isMystery,isNews,isReality-TV,isRomance,isSci-Fi,isShort,isSport,isTalk-Show,isThriller,isWar,isWestern
0,tt0000001,1894.0,1.0,5.606510,5.606510,5.267911,5.6,1590,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,tt0000002,1892.0,5.0,6.419985,6.419985,6.421509,6.1,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,tt0000003,1892.0,4.0,6.419985,6.419985,6.460755,6.5,1257,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,tt0000004,1892.0,12.0,6.419985,6.419985,6.421509,6.2,119,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,tt0000005,1893.0,1.0,5.606510,5.606510,5.365407,6.1,2018,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023074,tt9916204,2019.0,42.0,6.843421,7.899625,7.138049,8.2,169,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023077,tt9916348,2019.0,67.0,2.726936,2.726936,5.527753,8.9,14,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023078,tt9916380,2019.0,43.0,7.066473,7.958657,8.743559,9.2,101,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023081,tt9916544,2019.0,12.0,6.622179,6.622179,6.427478,7.2,15,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Webform Generator

In [18]:
for name in dataset.columns.values:
  if name[:2] != "is":
    print("<label for=\"{}\">{} (enter a number)</label><br>".format(name, name))
    print("<input type=\"text\" id=\"{}\" name=\"{}\"><br>".format(name, name))
  else:
    print("<input type=\"checkbox\" id=\"{}\" name=\"{}\">".format(name, name))
    print("<label for=\"{}\">{}</label><br>".format(name, name))

<label for="tconst">tconst (enter a number)</label><br>
<input type="text" id="tconst" name="tconst"><br>
<label for="startYear">startYear (enter a number)</label><br>
<input type="text" id="startYear" name="startYear"><br>
<label for="runtimeMinutes">runtimeMinutes (enter a number)</label><br>
<input type="text" id="runtimeMinutes" name="runtimeMinutes"><br>
<label for="directorAverage">directorAverage (enter a number)</label><br>
<input type="text" id="directorAverage" name="directorAverage"><br>
<label for="writerAverage">writerAverage (enter a number)</label><br>
<input type="text" id="writerAverage" name="writerAverage"><br>
<label for="principalAverage">principalAverage (enter a number)</label><br>
<input type="text" id="principalAverage" name="principalAverage"><br>
<label for="averageRating">averageRating (enter a number)</label><br>
<input type="text" id="averageRating" name="averageRating"><br>
<label for="numVotes">numVotes (enter a number)</label><br>
<input type="text" id=

### Webapp Defaults


In [19]:
print("{", end="")
i = 1
for name in dataset.columns.values:
  if i%5 == 0:
      print()
  i += 1

  if name[:2] == "is":
    print("\"{}\": 0".format(name), end=", ")  # Default to zero for categorical.
  elif name in ["tconst", "genres", "averageRating", "numVotes"]:
    continue  # We aren't supposed to look at these information.
  else:
    print("\"{}\": -1".format(name), end=", ") # All other fields are required

print("}")


{"startYear": -1, "runtimeMinutes": -1, "directorAverage": -1, 
"writerAverage": -1, "principalAverage": -1, "isAction": 0, 
"isAdult": 0, "isAdventure": 0, "isAnimation": 0, "isBiography": 0, "isComedy": 0, 
"isCrime": 0, "isDocumentary": 0, "isDrama": 0, "isFamily": 0, "isFantasy": 0, 
"isFilm-Noir": 0, "isGame-Show": 0, "isHistory": 0, "isHorror": 0, "isMusic": 0, 
"isMusical": 0, "isMystery": 0, "isNews": 0, "isReality-TV": 0, "isRomance": 0, 
"isSci-Fi": 0, "isShort": 0, "isSport": 0, "isTalk-Show": 0, "isThriller": 0, 
"isWar": 0, "isWestern": 0, }


## Regression

### Average Rating

Take a look at the dataset.

We will be running a random forest autoclassifier on this dataset.

Input:
- startYear,
- runtimeMinutes, 
- directorAverage, 
- writerAverage, 
- principalAverage,
- binary-encoded categories.

Output:
- averageRating

In [20]:
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,directorAverage,writerAverage,principalAverage,averageRating,numVotes,isAction,isAdult,isAdventure,isAnimation,isBiography,isComedy,isCrime,isDocumentary,isDrama,isFamily,isFantasy,isFilm-Noir,isGame-Show,isHistory,isHorror,isMusic,isMusical,isMystery,isNews,isReality-TV,isRomance,isSci-Fi,isShort,isSport,isTalk-Show,isThriller,isWar,isWestern
0,tt0000001,1894.0,1.0,5.606510,5.606510,5.267911,5.6,1590,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,tt0000002,1892.0,5.0,6.419985,6.419985,6.421509,6.1,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,tt0000003,1892.0,4.0,6.419985,6.419985,6.460755,6.5,1257,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,tt0000004,1892.0,12.0,6.419985,6.419985,6.421509,6.2,119,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,tt0000005,1893.0,1.0,5.606510,5.606510,5.365407,6.1,2018,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023074,tt9916204,2019.0,42.0,6.843421,7.899625,7.138049,8.2,169,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023077,tt9916348,2019.0,67.0,2.726936,2.726936,5.527753,8.9,14,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023078,tt9916380,2019.0,43.0,7.066473,7.958657,8.743559,9.2,101,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023081,tt9916544,2019.0,12.0,6.622179,6.622179,6.427478,7.2,15,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [21]:
X_train, X_test, y_rating_train, y_rating_test = train_test_split(X, y_avg_rating, test_size=0.3, random_state=42)

clf_avg_rating_forest_10_trees_50_50 = RandomForestClassifier(verbose=2, n_estimators=10, n_jobs=-1)
clf_avg_rating_forest_10_trees_50_50.fit(X_train, y_rating_train)

print(clf_avg_rating_forest_10_trees_50_50.score(X_train, y_rating_train))
print(clf_avg_rating_forest_10_trees_50_50.score(X_test, y_rating_test))

dump(clf_avg_rating_forest_10_trees_50_50, "clf_avg_rating_forest_10_trees_50_50.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 10building tree 2 of 10
building tree 3 of 10

building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.9s finished


0.9772805700778163


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.4s finished


0.7587187358462081


['clf_avg_rating_forest_10_trees_50_50.joblib']

## Making inferences

In [22]:
display(X_test[:1])
clf_avg_rating_forest_10_trees_50_50.predict(X_test[:1])

Unnamed: 0,startYear,runtimeMinutes,directorAverage,writerAverage,principalAverage,isAction,isAdult,isAdventure,isAnimation,isBiography,isComedy,isCrime,isDocumentary,isDrama,isFamily,isFantasy,isFilm-Noir,isGame-Show,isHistory,isHorror,isMusic,isMusical,isMystery,isNews,isReality-TV,isRomance,isSci-Fi,isShort,isSport,isTalk-Show,isThriller,isWar,isWestern
905860,2016.0,23.0,7.198157,6.026935,6.717315,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


array([[0., 1.]])