In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Import Dataset

### Features
For each title (unique `tconst`,) we are interested in the following features:
<!-- - `region` categorical, one per title. -->
<!-- - `language` categorical, one per title. -->
- `startYear` discrete integer.
- `runtimeMinutes` discrete integer.
- `genres` categorical, multiple per title.
- The average rating of the director/directors.
- The average rating of the writer/writers.
- The average rating of the top three actors.

We want to predict:
- `averageRating` continuous between $0$ and $10$.


In [2]:
!ls -lh "/content/drive/My Drive/Colab Notebooks/Dataset/2020-03-16-titles-with-ratings.pkl"
dataset = pd.read_pickle("/content/drive/My Drive/Colab Notebooks/Dataset/2020-03-16-titles-with-ratings.pkl")
display(dataset)
dataset = dataset[["tconst", "startYear", "runtimeMinutes", "genres", "directorAverage", "writerAverage", "principalAverage", "averageRating", "numVotes"]]

-rw------- 1 root root 212M Mar 16 14:13 '/content/drive/My Drive/Colab Notebooks/Dataset/2020-03-16-titles-with-ratings.pkl'


Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes,principals,directorAverage,writerAverage,principalAverage
0,tt0000001,1894.0,1.0,"Documentary,Short",nm0005690,\N,5.6,1590,"nm1588970,nm0005690,nm0374658,",5.606510,5.606510,5.267911
1,tt0000002,1892.0,5.0,"Animation,Short",nm0721526,\N,6.1,192,"nm0721526,nm1335271,",6.419985,6.419985,6.421509
2,tt0000003,1892.0,4.0,"Animation,Comedy,Romance",nm0721526,\N,6.5,1257,"nm0721526,nm5442194,nm1335271,nm5442200,",6.419985,6.419985,6.460755
3,tt0000004,1892.0,12.0,"Animation,Short",nm0721526,\N,6.2,119,"nm0721526,nm1335271,",6.419985,6.419985,6.421509
4,tt0000005,1893.0,1.0,"Comedy,Short",nm0005690,\N,6.1,2018,"nm0443482,nm0443482,nm0653042,nm0653042,nm0005...",5.606510,5.606510,5.365407
...,...,...,...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"Drama,Short",nm3219235,nm3219235,7.2,15,"nm9424215,nm9873304,nm7395788,nm7571612,nm3219...",6.622179,6.622179,6.427478
1023082,tt9916576,2019.0,85.0,Reality-TV,\N,\N,6.4,10,"nm3939894,nm3282613,nm1700240,nm2277838,nm2353...",0.000000,0.000000,0.000000
1023083,tt9916578,2019.0,,"Adventure,Biography,Comedy",nm0373673,"nm1485603,nm1485604,nm1866876,nm0909144",8.5,16,"nm1876061,nm0104787,nm0006522,nm0358200,nm0372...",6.834996,6.179873,6.932343
1023084,tt9916720,2019.0,10.0,"Comedy,Horror,Mystery",nm10538600,\N,5.5,47,"nm10678584,nm10538601,nm10538600,nm10538603,nm...",3.971053,3.971053,5.653593


## Display Dataset

In [3]:
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directorAverage,writerAverage,principalAverage,averageRating,numVotes
0,tt0000001,1894.0,1.0,"Documentary,Short",5.606510,5.606510,5.267911,5.6,1590
1,tt0000002,1892.0,5.0,"Animation,Short",6.419985,6.419985,6.421509,6.1,192
2,tt0000003,1892.0,4.0,"Animation,Comedy,Romance",6.419985,6.419985,6.460755,6.5,1257
3,tt0000004,1892.0,12.0,"Animation,Short",6.419985,6.419985,6.421509,6.2,119
4,tt0000005,1893.0,1.0,"Comedy,Short",5.606510,5.606510,5.365407,6.1,2018
...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"Drama,Short",6.622179,6.622179,6.427478,7.2,15
1023082,tt9916576,2019.0,85.0,Reality-TV,0.000000,0.000000,0.000000,6.4,10
1023083,tt9916578,2019.0,,"Adventure,Biography,Comedy",6.834996,6.179873,6.932343,8.5,16
1023084,tt9916720,2019.0,10.0,"Comedy,Horror,Mystery",3.971053,3.971053,5.653593,5.5,47


## More pre-processing

### Multi-Label-Binarizer Encoding
We want to encode "genres" with binary vectors. Since these features have inequal lengths, one-hot might not be the best option. This [StackOverflow Answer](https://stackoverflow.com/questions/42391165/how-to-one-hot-encode-variant-length-features) suggested the MLB encoder, which apparently is designed for encoding features with variant lengths.

Note that these are currently comma-separated strings. We shall split them before feeding them into `sklearn.preprocessing`.




In [0]:
def split_sharky_csstr(csstr):
  if type(csstr) == type(str()):
    return csstr.split(",")
  else:
    return list()

dataset["genres"] = dataset["genres"].apply(split_sharky_csstr)

In [5]:
mlb = MultiLabelBinarizer()
mlb.fit(list(dataset["genres"]))
mlb.classes_

dataset = dataset.join(pd.DataFrame(mlb.transform(dataset["genres"]), columns=[("is" + category) for category in mlb.classes_]))
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,directorAverage,writerAverage,principalAverage,averageRating,numVotes,isAction,isAdult,isAdventure,isAnimation,isBiography,isComedy,isCrime,isDocumentary,isDrama,isFamily,isFantasy,isFilm-Noir,isGame-Show,isHistory,isHorror,isMusic,isMusical,isMystery,isNews,isReality-TV,isRomance,isSci-Fi,isShort,isSport,isTalk-Show,isThriller,isWar,isWestern,is\N
0,tt0000001,1894.0,1.0,"[Documentary, Short]",5.606510,5.606510,5.267911,5.6,1590,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,tt0000002,1892.0,5.0,"[Animation, Short]",6.419985,6.419985,6.421509,6.1,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,tt0000003,1892.0,4.0,"[Animation, Comedy, Romance]",6.419985,6.419985,6.460755,6.5,1257,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,tt0000004,1892.0,12.0,"[Animation, Short]",6.419985,6.419985,6.421509,6.2,119,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,tt0000005,1893.0,1.0,"[Comedy, Short]",5.606510,5.606510,5.365407,6.1,2018,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023081,tt9916544,2019.0,12.0,"[Drama, Short]",6.622179,6.622179,6.427478,7.2,15,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1023082,tt9916576,2019.0,85.0,[Reality-TV],0.000000,0.000000,0.000000,6.4,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1023083,tt9916578,2019.0,,"[Adventure, Biography, Comedy]",6.834996,6.179873,6.932343,8.5,16,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023084,tt9916720,2019.0,10.0,"[Comedy, Horror, Mystery]",3.971053,3.971053,5.653593,5.5,47,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


### Screen dataset
We discard the following entries:
- Entries where `runtimeMinutes` is not available.
- Entries where `directorAverage` is zero.
- Entries without a `genre`.


In [0]:
dataset["directorAverage"] = dataset["directorAverage"].replace(0.0, np.nan)
dataset["averageRating"] = dataset["averageRating"].replace(0.0, np.nan)

dataset["is\\N"] = dataset["is\\N"].replace(1, np.nan)
dataset = dataset.dropna(how="any")

dataset = dataset.drop(columns=["is\\N", "genres"])

## Regression

Take a look at the dataset.

In [7]:
display(dataset)

Unnamed: 0,tconst,startYear,runtimeMinutes,directorAverage,writerAverage,principalAverage,averageRating,numVotes,isAction,isAdult,isAdventure,isAnimation,isBiography,isComedy,isCrime,isDocumentary,isDrama,isFamily,isFantasy,isFilm-Noir,isGame-Show,isHistory,isHorror,isMusic,isMusical,isMystery,isNews,isReality-TV,isRomance,isSci-Fi,isShort,isSport,isTalk-Show,isThriller,isWar,isWestern
0,tt0000001,1894.0,1.0,5.606510,5.606510,5.267911,5.6,1590,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,tt0000002,1892.0,5.0,6.419985,6.419985,6.421509,6.1,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,tt0000003,1892.0,4.0,6.419985,6.419985,6.460755,6.5,1257,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,tt0000004,1892.0,12.0,6.419985,6.419985,6.421509,6.2,119,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,tt0000005,1893.0,1.0,5.606510,5.606510,5.365407,6.1,2018,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023074,tt9916204,2019.0,42.0,6.843421,7.899625,7.138049,8.2,169,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023077,tt9916348,2019.0,67.0,2.726936,2.726936,5.527753,8.9,14,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023078,tt9916380,2019.0,43.0,7.066473,7.958657,8.743559,9.2,101,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023081,tt9916544,2019.0,12.0,6.622179,6.622179,6.427478,7.2,15,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Support Vector Machine

We need to convert the average ratings into intervals and then one-hot before fitting the SVM.



In [0]:
# X = dataset.drop(columns=["averageRating", "numVotes", "tconst"], inplace=False).copy()
# X["startYear"] = X["startYear"].astype(int)
# y_int_avg_rating = [int(rating) for rating in dataset["averageRating"]]

In [0]:
# clf = svm.SVC(verbose=1)
# clf.fit(X, y_int_avg_rating)

### Random Forest

In [0]:
X = dataset.drop(columns=["averageRating", "numVotes", "tconst"], inplace=False)
y_avg_rating = [[element] for element in pd.cut(dataset["averageRating"], [0.5 * i for i in range(0, 21)])]

enc_y_avg = OneHotEncoder(handle_unknown="error")
enc_y_avg.fit(y_avg_rating)

y_avg_rating = enc_y_avg.transform(y_avg_rating).toarray()

In [0]:
X_train, X_test, y_rating_train, y_rating_test = train_test_split(X, y_avg_rating, test_size=0.3, random_state=42)

clf_avg_rating_forest_50 = RandomForestClassifier(verbose=2, n_estimators=50)
clf_avg_rating_forest_50.fit(X, y_avg_rating)

## Compute Accuracy

In [0]:
def predict_accuracy(predictor):
  y_ratings_test_predictions = predictor.predict(X_test)
  print("Accuracy (exact hit) score: {}".format(accuracy_score(y_rating_test, y_ratings_test_predictions)))
  
  diff = 0
  for rating_expected, rating_predicted in zip(y_rating_test, y_ratings_test_predictions):
    diff += abs(np.argmax(rating_expected) - np.argmax(rating_predicted))

  print("Mean difference in rating is {}".format(((diff/len(y_rating_test))/2)))

In [15]:
predict_accuracy(clf_avg_rating_forest_50)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   24.4s finished


Accuracy (exact hit) score: 0.9611443812591214
Mean difference in rating is 0.1692365759146495
