In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the ratings table from the CSV file in the current working directory.
netflix_rating = pd.read_csv(filepath_or_buffer="netflix_ratings_data.csv")

In [3]:
# Build the sorted list of distinct genre labels found in the `genres` column.
# Each cell holds several labels separated by commas; the pattern also swallows
# any whitespace around each comma.
all_genres = (
    netflix_rating['genres']
    .str.split("\\s*,\\s*")
    .explode(ignore_index=True)   # one row per individual label
    .drop_duplicates()
    .sort_values()
)
genres = all_genres.tolist()

# Persist the label list (comma is to_csv's default separator; no index column).
all_genres.to_csv('netflix_genres_data.csv', index=False)

In [4]:
# Working frame with just the title and its genres, where `genres` is parsed
# from a comma-separated string into a list of labels (whitespace around the
# commas is discarded by the pattern).
netflix_genre = netflix_rating[['title', 'genres']].assign(
    genres=lambda frame: frame['genres'].str.split("\\s*,\\s*")
)
netflix_genre

Unnamed: 0,title,genres
0,dick johnson is dead,"[Biography, Documentary, Drama]"
1,ganglands,"[Action, Crime, Drama]"
2,jailbirds new orleans,"[Documentary, Reality-TV]"
3,midnight mass,"[Drama, Fantasy, Horror]"
4,my little pony: a new generation,"[Adventure, Animation, Comedy]"
...,...,...
6415,zindagi gulzar hai,[Romance]
6416,zodiac,"[Crime, Drama, Mystery]"
6417,zombieland,"[Action, Comedy, Horror]"
6418,zoom,"[Action, Adventure, Comedy]"


In [5]:
#Reference: https://www.analyticsvidhya.com/blog/2019/04/predicting-movie-genres-nlp-multi-label-classification/?fbclid=IwAR3L_2ZH4mMH3TDtU3Xo70o5I5oVvPu25god32CR3IilRz_1MZOzOvzkmbs
# Multi-hot encode the per-title genre lists: one 0/1 column per genre label.
mlb = MultiLabelBinarizer()
binarized_df = mlb.fit_transform(netflix_genre['genres'])
genre_mlb = mlb  # same fitted binarizer; name kept for any later cells that use it

# Reuse the source index so each feature row keeps the label of the title it
# was derived from, instead of a fresh 0..n-1 range.
movie_genres_mlb = pd.DataFrame(
    binarized_df,
    columns=mlb.classes_,
    index=netflix_genre.index,
)

# `\N` shows up as a genre label in this data (presumably the dataset's marker
# for "no genre" -- TODO confirm). It is not a real genre, so drop its column;
# errors='ignore' keeps the cell runnable on data where the marker never occurs.
movie_genres_mlb = movie_genres_mlb.drop(columns=['\\N'], errors='ignore')

# The genre indicators are the model features; the `ranking` column is the target.
X = movie_genres_mlb
y = netflix_rating['ranking']

movie_genres_mlb

Unnamed: 0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6415,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6416,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6417,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6418,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Hold out 20% of the rows for validation; the split is seeded so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=9)

# Candidate classifiers, each behind the same StandardScaler preprocessing step.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=1))
# The forest is randomized (bootstrap samples, feature sub-sampling), so it is
# seeded as well; without random_state its scores change on every run.
rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=10, max_depth=8, min_samples_leaf=8, random_state=9),
)
nb = make_pipeline(StandardScaler(), GaussianNB())

models = [['KNN', knn], ['Random Forest', rf], ['Naive Bayes', nb]]

# Fit each model on the training split and report validation scores.
# "EM" in the printed line is accuracy_score (fraction of exactly matching
# labels); F1 is averaged over classes weighted by their support.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    print(f"name: {name}, EM: {accuracy_score(y_valid, y_pred)}, F1: {f1_score(y_valid, y_pred, average='weighted')}")

name: KNN, EM: 0.4602803738317757, F1: 0.47605450528562787
name: Random Forest, EM: 0.6355140186915887, F1: 0.5283753964819489
name: Naive Bayes, EM: 0.1043613707165109, F1: 0.11786656261716841
