In [6]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

In [7]:
import re
import string
import nltk
from collections import defaultdict

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources in the same order as before; the final call is kept
# as a bare expression so its return value is still what the cell displays.
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>


False

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay

In [9]:
# loading data files

# has clean review text
dataset = pd.read_json("dramainfo_revclean.json")

# tfidf
tfidf_rev = np.load("tfidfvec.npy")

# word2vec
w2v_rev = np.load("w2featvec.npy")

# The feature matrices live in separate files from the labels, and every split
# below pairs them with a column of `dataset` row-by-row, so fail early (with a
# readable message) if the artifacts are out of sync.
assert len(tfidf_rev) == len(w2v_rev) == len(dataset), (
    f"row count mismatch: tfidf={len(tfidf_rev)}, w2v={len(w2v_rev)}, dataset={len(dataset)}"
)

In [10]:
# show the column labels of the loaded frame (metadata, genre flags, clean_reviews)
dataset.columns

Index(['country', 'crew', 'ep_duration', 'episodes', 'genres', 'id',
       'main_cast', 'num_ratings', 'num_watchers', 'orig_network', 'rating',
       'reviews', 'synopsis', 'tags', 'title', 'year', 'romance',
       'sitcom_comedy', 'comedy', 'war_historical', 'political_drama',
       'thriller', 'friendship', 'melodrama_romance', 'drama', 'action',
       'historical', 'youth_school', 'fantasy_supernatural_horror', 'mystery',
       'life', 'family', 'clean_reviews'],
      dtype='object')

### start with genre = romance

In [14]:
# 70/30 hold-out split of the TF-IDF features against the romance label
Xtfidf_train, Xtfidf_test, ytfidf_train, ytfidf_test = train_test_split(
    tfidf_rev,
    dataset["romance"],
    test_size=0.3,
    random_state=18,
)

In [15]:
# same 70/30 split (same seed) for the word2vec features against the romance label
Xw2v_train, Xw2v_test, yw2v_train, yw2v_test = train_test_split(
    w2v_rev,
    dataset["romance"],
    test_size=0.3,
    random_state=18,
)

In [16]:
# trying different classifiers
# adapted from:
## https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [18]:
# Candidate classifiers as (label, estimator) pairs; the label is what gets
# written to the "model" column of the results frame.
# The random forest is seeded so that its scores are reproducible: unseeded,
# its numbers drift from one execution to the next even on an identical split.
models = [("logreg", LogisticRegression(max_iter=200)),
          ("rf", RandomForestClassifier(random_state=18)),
          ("gnb", GaussianNB())]

In [19]:
# Metric names handed to cross_validate; each one comes back as a "test_<name>" column.
scoring = ["accuracy", "precision", "recall", "f1", "roc_auc"]

In [20]:
def try_models(X_train,y_train,X_test,y_test,models=models,scoring=scoring,cv=None):
    """Cross-validate every model on the training split and report on the hold-out split.

    For each ``(name, model)`` pair the function
      1. runs ``cross_validate`` on ``(X_train, y_train)`` with the given scorers,
      2. fits the model on the full training split,
      3. prints a ``classification_report`` for its predictions on ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and binary labels.
    X_test, y_test : hold-out features and labels used only for the printed report.
    models : iterable of ``(name, estimator)`` pairs.
    scoring : scorer names forwarded to ``cross_validate``.
    cv : cross-validation splitter, optional. Defaults to a shuffled 5-fold
        ``KFold`` with ``random_state=1`` (the splitter this function always
        used); pass e.g. a ``StratifiedKFold`` for imbalanced labels.

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) holding the ``cross_validate`` output plus a
        ``model`` column with the model's name.

    Notes
    -----
    The estimator objects in ``models`` are fitted in place (step 2), so they
    are left trained on the last ``X_train`` this function saw.
    """
    target_names = ["is_not_genre",
                    "is_genre"]

    # Build the splitter once: with a fixed integer seed it yields the same
    # folds on every split() call, so all models are scored on identical folds.
    if cv is None:
        cv = model_selection.KFold(n_splits=5,
                                   shuffle=True,
                                   random_state=1)

    result_dfs = []

    for name, model in models:
        # metrics for each fold (cross_validate works on clones of `model`)
        cv_results = model_selection.cross_validate(model,
                                                    X_train,y_train,
                                                    cv=cv,
                                                    scoring=scoring)

        # refit on the whole training split for the hold-out report
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # save each fold result to the list
        model_df = pd.DataFrame(cv_results)
        model_df['model'] = name
        result_dfs.append(model_df)

        print(name)
        print(classification_report(y_test, y_pred,
                                    target_names=target_names))

    # single df of all results
    return pd.concat(result_dfs, ignore_index=True)

In [108]:
# romance, TF-IDF features: CV metrics per fold + printed hold-out reports
tfidf_results = try_models(
    X_train=Xtfidf_train,
    y_train=ytfidf_train,
    X_test=Xtfidf_test,
    y_test=ytfidf_test,
    models=models,
)

logreg
              precision    recall  f1-score   support

is_not_genre       0.84      0.55      0.67       444
    is_genre       0.83      0.95      0.88       987

    accuracy                           0.83      1431
   macro avg       0.83      0.75      0.78      1431
weighted avg       0.83      0.83      0.82      1431

rf
              precision    recall  f1-score   support

is_not_genre       0.90      0.25      0.39       444
    is_genre       0.75      0.99      0.85       987

    accuracy                           0.76      1431
   macro avg       0.82      0.62      0.62      1431
weighted avg       0.79      0.76      0.71      1431

gnb
              precision    recall  f1-score   support

is_not_genre       0.57      0.32      0.41       444
    is_genre       0.74      0.89      0.81       987

    accuracy                           0.71      1431
   macro avg       0.66      0.60      0.61      1431
weighted avg       0.69      0.71      0.69      1431



In [109]:
# Mean and standard deviation of each CV metric across folds, per model.
# String names are used instead of np.mean/np.std: pandas already maps those
# callables to its own mean/std here and warns that this mapping will go away,
# so the strings keep the numbers unchanged and stay valid on newer versions.
tfidf_results.groupby("model")[["test_accuracy", "test_precision", "test_recall",
                                "test_f1", "test_roc_auc"]].agg(["mean", "std"])

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_precision,test_precision,test_recall,test_recall,test_f1,test_f1,test_roc_auc,test_roc_auc
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
gnb,0.690231,0.017021,0.727299,0.014993,0.873319,0.009296,0.793582,0.010628,0.584693,0.031364
logreg,0.824744,0.008954,0.820774,0.010036,0.950285,0.007482,0.880775,0.008014,0.90574,0.006586
rf,0.747149,0.020366,0.735005,0.019695,0.984207,0.008986,0.841416,0.013559,0.861579,0.021222


In [110]:
# romance, word2vec features: CV metrics per fold + printed hold-out reports
w2v_results = try_models(
    X_train=Xw2v_train,
    y_train=yw2v_train,
    X_test=Xw2v_test,
    y_test=yw2v_test,
    models=models,
)

logreg
              precision    recall  f1-score   support

is_not_genre       0.62      0.02      0.04       444
    is_genre       0.69      0.99      0.82       987

    accuracy                           0.69      1431
   macro avg       0.65      0.51      0.43      1431
weighted avg       0.67      0.69      0.57      1431

rf
              precision    recall  f1-score   support

is_not_genre       0.45      0.22      0.30       444
    is_genre       0.71      0.88      0.79       987

    accuracy                           0.67      1431
   macro avg       0.58      0.55      0.54      1431
weighted avg       0.63      0.67      0.63      1431

gnb
              precision    recall  f1-score   support

is_not_genre       0.44      0.36      0.40       444
    is_genre       0.73      0.79      0.76       987

    accuracy                           0.66      1431
   macro avg       0.59      0.58      0.58      1431
weighted avg       0.64      0.66      0.65      1431



In [111]:
# Mean and standard deviation of each CV metric across folds, per model.
# String names are used instead of np.mean/np.std: pandas already maps those
# callables to its own mean/std here and warns that this mapping will go away,
# so the strings keep the numbers unchanged and stay valid on newer versions.
w2v_results.groupby("model")[["test_accuracy", "test_precision", "test_recall",
                              "test_f1", "test_roc_auc"]].agg(["mean", "std"])

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_precision,test_precision,test_recall,test_recall,test_f1,test_f1,test_roc_auc,test_roc_auc
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
gnb,0.653385,0.019428,0.727817,0.015276,0.785233,0.020271,0.755351,0.015366,0.637239,0.018981
logreg,0.686338,0.017608,0.687096,0.016556,0.991643,0.004206,0.811656,0.011957,0.652463,0.01483
rf,0.692629,0.023956,0.727245,0.019564,0.879358,0.028457,0.795836,0.017215,0.693891,0.030787


In [112]:
# share of rows flagged as romance (baseline for the accuracy numbers above)
dataset["romance"].mean()

0.6842105263157895

### trying again with genre = action

to see if I need to upsample/downsample and perhaps combine more genres

In [11]:
# share of rows flagged as action
dataset["action"].mean()

0.10589222059131893

In [12]:
# 70/30 hold-out split of the TF-IDF features, now against the action label
Xtfidf_train, Xtfidf_test, ytfidf_train, ytfidf_test = train_test_split(
    tfidf_rev,
    dataset["action"],
    test_size=0.3,
    random_state=18,
)

In [13]:
# same 70/30 split (same seed) for the word2vec features against the action label
Xw2v_train, Xw2v_test, yw2v_train, yw2v_test = train_test_split(
    w2v_rev,
    dataset["action"],
    test_size=0.3,
    random_state=18,
)

In [None]:
# action, TF-IDF features: CV metrics per fold + printed hold-out reports
tfidf_results2 = try_models(
    X_train=Xtfidf_train,
    y_train=ytfidf_train,
    X_test=Xtfidf_test,
    y_test=ytfidf_test,
    models=models,
)

  _warn_prf(average, modifier, msg_start, len(result))


logreg
              precision    recall  f1-score   support

is_not_genre       0.89      1.00      0.94      1276
    is_genre       0.75      0.02      0.04       155

    accuracy                           0.89      1431
   macro avg       0.82      0.51      0.49      1431
weighted avg       0.88      0.89      0.85      1431



In [99]:
# okay, so I definitely need to upsample/downsample
# and possibly delete/combine more genres <--- did this
## which means I might have to run the painful w2v averaging function again ughughughguhg

### ._.

In [100]:
# Share of rows carrying each genre flag, most common first.
# Selected by label rather than by position (previously iloc[:, 16:-1]): the
# genre flags are the contiguous columns from "romance" through "family", and a
# label slice keeps working if columns are added before or after them.
dataset.loc[:, "romance":"family"].mean().sort_values(ascending=False)

melodrama_romance              0.695324
romance                        0.684211
political_drama                0.525477
drama                          0.513315
sitcom_comedy                  0.369469
comedy                         0.368421
mystery                        0.153282
youth_school                   0.150346
fantasy_supernatural_horror    0.146152
family                         0.143007
war_historical                 0.132103
historical                     0.130216
friendship                     0.114280
life                           0.114070
action                         0.105892
thriller                       0.102118
dtype: float64

### trying with stratified kfold, genres = romance, action

In [21]:
def try_models2(X_train,y_train,X_test,y_test,models=models,scoring=scoring):
    """Like ``try_models`` but with stratified 5-fold cross-validation.

    For each ``(name, model)`` pair the function cross-validates on
    ``(X_train, y_train)`` using a shuffled ``StratifiedKFold`` (so every fold
    keeps roughly the class ratio of ``y_train``), refits the model on the full
    training split, and prints a ``classification_report`` for ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and binary labels.
    X_test, y_test : hold-out features and labels used only for the printed report.
    models : iterable of ``(name, estimator)`` pairs.
    scoring : scorer names forwarded to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) holding the ``cross_validate`` output plus a
        ``model`` column with the model's name.

    Notes
    -----
    The estimator objects in ``models` are fitted in place, so they are left
    trained on the last ``X_train`` this function saw.
    """
    target_names = ["is_not_genre",
                    "is_genre"]

    # Build the splitter once: with a fixed integer seed it yields the same
    # folds on every split() call, so all models are scored on identical folds.
    kfold = model_selection.StratifiedKFold(n_splits=5,
                                            shuffle=True,
                                            random_state=1)

    result_dfs = []

    for name, model in models:
        # metrics for each fold
        # NOTE: no `groups=` here. StratifiedKFold stratifies on y itself and
        # ignores groups; the previously passed np.sort(y_train) was also no
        # longer aligned with the rows of X_train.
        cv_results = model_selection.cross_validate(model,
                                                    X_train,y_train,
                                                    cv=kfold,
                                                    scoring=scoring)

        # refit on the whole training split for the hold-out report
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # save each fold result to the list
        model_df = pd.DataFrame(cv_results)
        model_df['model'] = name
        result_dfs.append(model_df)

        print(name)
        print(classification_report(y_test, y_pred,
                                    target_names=target_names))

    # single df of all results
    return pd.concat(result_dfs, ignore_index=True)

In [22]:
# Rebuild the romance hold-out split right here: further up the notebook the
# Xtfidf_*/ytfidf_* names are last assigned from dataset.action, so a
# top-to-bottom run would otherwise evaluate "rom2" on the action label.
Xtfidf_train, Xtfidf_test, ytfidf_train, ytfidf_test = train_test_split(tfidf_rev,
                                                                        dataset.romance,
                                                                        test_size=0.3,
                                                                        random_state=18)

tfidf_results_rom2 = try_models2(Xtfidf_train,ytfidf_train,Xtfidf_test,ytfidf_test,models)

logreg
              precision    recall  f1-score   support

is_not_genre       0.84      0.55      0.67       444
    is_genre       0.83      0.95      0.88       987

    accuracy                           0.83      1431
   macro avg       0.83      0.75      0.78      1431
weighted avg       0.83      0.83      0.82      1431

rf
              precision    recall  f1-score   support

is_not_genre       0.89      0.30      0.45       444
    is_genre       0.76      0.98      0.86       987

    accuracy                           0.77      1431
   macro avg       0.82      0.64      0.65      1431
weighted avg       0.80      0.77      0.73      1431

gnb
              precision    recall  f1-score   support

is_not_genre       0.57      0.32      0.41       444
    is_genre       0.74      0.89      0.81       987

    accuracy                           0.71      1431
   macro avg       0.66      0.60      0.61      1431
weighted avg       0.69      0.71      0.69      1431



In [23]:
# Mean and standard deviation of each CV metric across folds, per model.
# String names are used instead of np.mean/np.std: pandas already maps those
# callables to its own mean/std here and warns that this mapping will go away,
# so the strings keep the numbers unchanged and stay valid on newer versions.
tfidf_results_rom2.groupby("model")[["test_accuracy", "test_precision", "test_recall",
                                     "test_f1", "test_roc_auc"]].agg(["mean", "std"])

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_precision,test_precision,test_recall,test_recall,test_f1,test_f1,test_roc_auc,test_roc_auc
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
gnb,0.690829,0.00833,0.728565,0.006812,0.871259,0.010581,0.793507,0.0056,0.587678,0.012158
logreg,0.825344,0.008061,0.82079,0.006196,0.951668,0.008382,0.881378,0.005563,0.907531,0.008578
rf,0.754645,0.008228,0.738988,0.006825,0.989898,0.005503,0.846213,0.004479,0.870064,0.012829


In [24]:
# romance with stratified k-fold (above) vs. the earlier plain k-fold run:
# mostly the same, with some differences for the random forest