In [2]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
# imblearn is installed in the Python 3.9 environment,
# but Jupyter Lab is usually launched from the Python 3.8 environment.
## If the version printed below is 3.8, relaunch using: python3 -m notebook
from platform import python_version
print(python_version())

3.9.1


In [6]:
# Load the three input files used throughout this notebook.

# Drama metadata with the cleaned review text. Its label columns
# (e.g. `action`) are used as classification targets in later cells.
dataset = pd.read_json("dramainfo_revclean.json")

# Precomputed TF-IDF feature array.
# NOTE(review): presumably one row per row of `dataset`, in the same order -- confirm.
tfidf_rev = np.load("tfidfvec.npy")

# Precomputed word2vec feature array (same row-alignment assumption as above).
w2v_rev = np.load("w2featvec.npy")

In [57]:
# Label-balance overview. The bare triple-quoted strings in this cell are
# no-op expressions used as notes; only the final line produces output.

# no need for adjustment (positive fraction is close to 0.5)
'''
political_drama
drama
'''

# undersample 1, oversample 0 (class 1 is the majority for these labels)
'''
melodrama_romance
romance
'''

# undersample 0, oversample 1 (class 1 is the minority for these labels)
'''
sitcom_comedy
comedy
mystery
youth_school
fantasy_supernatural_horror
family
war_historical
historical
friendship
life
action
thriller
'''

# Column means for positions 16 through the second-to-last column, largest first.
# NOTE(review): assumes these columns are 0/1 genre indicators, so each mean is
# the fraction of positive rows -- confirm against the dataset schema.
dataset.iloc[:,16:-1].mean().sort_values(ascending=False)

melodrama_romance              0.695324
romance                        0.684211
political_drama                0.525477
drama                          0.513315
sitcom_comedy                  0.369469
comedy                         0.368421
mystery                        0.153282
youth_school                   0.150346
fantasy_supernatural_horror    0.146152
family                         0.143007
war_historical                 0.132103
historical                     0.130216
friendship                     0.114280
life                           0.114070
action                         0.105892
thriller                       0.102118
dtype: float64

In [7]:
import imblearn

In [8]:
# !!!!!!!!!!

In [9]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline

In [184]:
# Hold out 30% of the rows as a test set for the TF-IDF features, with the
# `action` label as target. The fixed seed keeps the split repeatable.
Xtfidf_train, Xtfidf_test, ytfidf_train, ytfidf_test = train_test_split(
    tfidf_rev,
    dataset.action,
    test_size=0.3,
    random_state=18,
)

In [187]:
# Count the training labels per class; these counts are used below to derive
# the list of resampling ratios.
target_counts = Counter(ytfidf_train)
print(target_counts)

Counter({0: 2988, 1: 350})


In [16]:
def over_under_strat(minority, majority, target=0.8, step=0.1):
    """Build a ladder of (oversample_ratio, undersample_ratio) pairs.

    The first oversampling ratio is the current minority/majority ratio
    (rounded to 2 decimals) plus ``step``; further ratios increase by ``step``
    while they stay strictly below ``target``. The undersampling ratio is
    always ``target``, and the final pair is ``(target, target)``.

    Each ratio is computed as ``base + k * step`` and rounded, rather than by
    repeatedly adding ``step``. Repeated addition accumulates floating-point
    error (e.g. 0.42000000000000004) and can produce a spurious
    0.7999999999999999 entry just before the final ``target`` pair.

    Parameters
    ----------
    minority : int
        Number of samples in the minority class.
    majority : int
        Number of samples in the majority class; must be positive.
    target : float, default 0.8
        Final minority/majority ratio, used as the undersampling ratio of
        every pair.
    step : float, default 0.1
        Increment between successive oversampling ratios; must be positive.

    Returns
    -------
    list of tuple(float, float)
        Pairs of plain Python floats. If the data is already at or above
        ``target``, the list contains only ``(target, target)``.

    Raises
    ------
    ValueError
        If ``majority`` or ``step`` is not positive.
    """
    if majority <= 0:
        raise ValueError("majority must be a positive sample count")
    if step <= 0:
        raise ValueError("step must be positive")

    base_ratio = float(np.round(minority / majority, 2))

    strat_lst = []
    k = 1
    while True:
        # Rounding to 10 decimals removes binary float noise without
        # restricting `step` to two decimal places.
        over_ratio = round(base_ratio + k * step, 10)
        if over_ratio >= target:
            break
        strat_lst.append((over_ratio, target))
        k += 1

    strat_lst.append((target, target))

    return strat_lst

In [236]:
# Class 1 is passed as the minority count and class 0 as the majority count
# (350 vs 2988 in the Counter printed above).
strat_lst = over_under_strat(target_counts[1],target_counts[0])
strat_lst

[(0.22, 0.8),
 (0.32, 0.8),
 (0.42000000000000004, 0.8),
 (0.52, 0.8),
 (0.62, 0.8),
 (0.72, 0.8),
 (0.8, 0.8)]

In [18]:
# Candidate classifiers as (short name, estimator) pairs. The short name is
# reused as the pipeline step name and as the "model" column in the results.
# The random forest is seeded so repeated runs give the same scores; the other
# two estimators are deterministic with these settings.
models = [("logreg", LogisticRegression(max_iter=200)),
          ("rf", RandomForestClassifier(random_state=18)),
          ("gnb", GaussianNB())]

# Metrics computed for every cross-validation fold; each appears in the
# results as a "test_<metric>" column.
scoring = ["accuracy",
           "balanced_accuracy",
           "precision",
           "recall",
           "f1",
           "roc_auc"]

In [19]:
def run_pipeline(steps, X_train, y_train, X_test, scoring=scoring):
    """Cross-validate a resampling pipeline, then predict on held-out data.

    Parameters
    ----------
    steps : list of (name, transformer/estimator) tuples
        Passed directly to the imblearn pipeline constructor.
    X_train, y_train
        Training features and labels, used both for cross-validation and for
        the final fit.
    X_test
        Held-out features to predict after the final fit.
    scoring : list of str
        Metric names forwarded to ``cross_validate``; defaults to the
        module-level ``scoring`` list as it was when this function was defined.

    Returns
    -------
    (scores, y_pred)
        ``scores`` is the dict returned by ``cross_validate`` (one entry per
        fold for timings and each metric); ``y_pred`` holds the predictions
        for ``X_test`` from the pipeline fitted on all of the training data.
    """
    # 5 folds repeated twice gives 10 score rows per call.
    splitter = model_selection.RepeatedStratifiedKFold(
        n_splits=5,
        n_repeats=2,
        random_state=1,
    )
    estimator = imbPipeline(steps=steps)

    cv_scores = model_selection.cross_validate(
        estimator,
        X_train,
        y_train,
        scoring=scoring,
        cv=splitter,
        n_jobs=-1,
    )

    # Final fit on the whole training set, then predict the held-out rows.
    test_predictions = estimator.fit(X_train, y_train).predict(X_test)

    return cv_scores, test_predictions

In [295]:
# Run every (model, oversampling ratio) combination on the TF-IDF features.
# result_dfs collects one DataFrame of per-fold CV scores per combination;
# pred_list collects the matching held-out predictions in the same order
# (models outermost, strategies innermost).
result_dfs = []
pred_list = []

for name, model in models:
    for over_strat, under_strat in strat_lst:
        print("Running ... {:<9}{:<.2}".format(name,over_strat))
        # Both samplers draw rows at random; seeding them makes the resampled
        # training sets, and therefore the scores, reproducible across runs.
        steps = [("over", RandomOverSampler(sampling_strategy=over_strat,
                                            random_state=18)),
                 ("under",RandomUnderSampler(sampling_strategy=under_strat,
                                             random_state=18)),
                 (name, model)]
        result,pred = run_pipeline(steps,
                                   Xtfidf_train,ytfidf_train,
                                   Xtfidf_test)
        model_df = pd.DataFrame(result)
        # Tag each fold's scores so they can be grouped after concatenation.
        model_df["model"] = name
        model_df["oversample_strat"] = over_strat
        result_dfs.append(model_df)

        pred_list.append(pred)

Running ... logreg   0.22
Running ... logreg   0.32
Running ... logreg   0.42
Running ... logreg   0.52
Running ... logreg   0.62
Running ... logreg   0.72
Running ... logreg   0.8
Running ... rf       0.22
Running ... rf       0.32
Running ... rf       0.42


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running ... rf       0.52


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running ... rf       0.62


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running ... rf       0.72


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running ... rf       0.8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running ... gnb      0.22
Running ... gnb      0.32
Running ... gnb      0.42
Running ... gnb      0.52
Running ... gnb      0.62
Running ... gnb      0.72
Running ... gnb      0.8


In [296]:
# Stack the per-combination CV score frames into one long table with a fresh index.
results = pd.concat(result_dfs, ignore_index=True)

In [318]:
# Mean and standard deviation of every CV metric per (model, oversampling ratio).
# String aliases are used instead of np.mean / np.std: pandas deprecates NumPy
# callables in agg(), and "std" explicitly means the pandas sample standard
# deviation (ddof=1) rather than NumPy's population default.
results_grp = results.groupby(["model","oversample_strat"]).agg(["mean","std"])
results_grp

Unnamed: 0_level_0,Unnamed: 1_level_0,fit_time,fit_time,score_time,score_time,test_accuracy,test_accuracy,test_balanced_accuracy,test_balanced_accuracy,test_precision,test_precision,test_recall,test_recall,test_f1,test_f1,test_roc_auc,test_roc_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,oversample_strat,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
gnb,0.22,21.110729,9.441018,2.20939,0.525522,0.789097,0.010631,0.580762,0.018094,0.192508,0.017216,0.317143,0.043016,0.239173,0.023783,0.580762,0.018094
gnb,0.32,27.945038,12.150217,2.13188,0.59328,0.82984,0.009449,0.558744,0.021848,0.204089,0.032902,0.215714,0.045897,0.209154,0.037595,0.558744,0.021848
gnb,0.42,33.452614,15.091641,2.034965,0.565789,0.847811,0.00854,0.54482,0.018407,0.207493,0.034742,0.161429,0.040434,0.1807,0.037818,0.54482,0.018407
gnb,0.52,36.129575,15.864311,2.278499,0.661459,0.855899,0.006999,0.536094,0.018616,0.204951,0.046745,0.131429,0.038568,0.159493,0.041602,0.536094,0.018616
gnb,0.62,45.158232,20.0269,2.015513,0.564236,0.86249,0.006291,0.5341,0.021761,0.21288,0.058775,0.118571,0.04367,0.151847,0.050387,0.5341,0.021761
gnb,0.72,77.374633,36.267202,1.803522,0.576432,0.865485,0.006226,0.531359,0.018272,0.214806,0.055674,0.108571,0.037008,0.143701,0.044396,0.531359,0.018272
gnb,0.8,79.853122,36.810557,2.407952,0.870287,0.867131,0.00838,0.528495,0.017941,0.21408,0.065945,0.1,0.034339,0.135771,0.044587,0.528495,0.017941
logreg,0.22,20.2248,8.147673,0.24394,0.099017,0.895745,0.00723,0.695827,0.030387,0.508276,0.042541,0.442857,0.07063,0.468905,0.038974,0.87128,0.014713
logreg,0.32,27.162181,10.396669,0.313554,0.17124,0.899339,0.007791,0.697203,0.024554,0.529085,0.047589,0.441429,0.057321,0.477943,0.033603,0.869043,0.016043
logreg,0.42,35.822748,11.990034,0.348154,0.212228,0.899337,0.008403,0.699094,0.03052,0.524928,0.047033,0.445714,0.063817,0.480055,0.049381,0.873836,0.015243


In [335]:
# Print a held-out classification report for every TF-IDF run.
# pred_list was filled with models as the outer loop and strategies as the
# inner loop, so the flat index splits into (model index, strategy index).
# The strategy count comes from strat_lst rather than a hard-coded 7, so the
# labels stay correct if the strategy list changes length.
n_strats = len(strat_lst)
for i, y_pred in enumerate(pred_list):
    model_idx, strat_idx = divmod(i, n_strats)
    name = models[model_idx][0]
    oversamp = strat_lst[strat_idx][0]

    print("\n{:<9}{:<.2}".format(name,oversamp))
    print(classification_report(ytfidf_test, y_pred))


logreg   0.22
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      1276
           1       0.47      0.48      0.47       155

    accuracy                           0.89      1431
   macro avg       0.70      0.71      0.71      1431
weighted avg       0.89      0.89      0.89      1431


logreg   0.32
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1276
           1       0.50      0.48      0.49       155

    accuracy                           0.89      1431
   macro avg       0.72      0.71      0.71      1431
weighted avg       0.89      0.89      0.89      1431


logreg   0.42
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1276
           1       0.47      0.46      0.47       155

    accuracy                           0.89      1431
   macro avg       0.70      0.70      0.70      1431
weighted avg       0.89      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [336]:
# Logistic regression appears to perform best of the three models.
# The minority class should be oversampled to a ratio of at least about 0.4.

In [10]:
# now for w2v

In [11]:
# Hold out 30% of the rows as a test set for the word2vec features, with the
# `action` label as target. The fixed seed keeps the split repeatable.
Xw2v_train, Xw2v_test, yw2v_train, yw2v_test = train_test_split(
    w2v_rev,
    dataset.action,
    test_size=0.3,
    random_state=18,
)

In [14]:
# Count the word2vec training labels per class; used below to derive the
# list of resampling ratios.
target_counts2 = Counter(yw2v_train)
print(target_counts2)

Counter({0: 2988, 1: 350})


In [17]:
# Class 1 is passed as the minority count and class 0 as the majority count
# (350 vs 2988 in the Counter printed above).
strat_lst2 = over_under_strat(target_counts2[1],target_counts2[0])
strat_lst2

[(0.22, 0.8),
 (0.32, 0.8),
 (0.42000000000000004, 0.8),
 (0.52, 0.8),
 (0.62, 0.8),
 (0.72, 0.8),
 (0.8, 0.8)]

In [20]:
# Run every (model, oversampling ratio) combination on the word2vec features.
# result_dfs2 collects one DataFrame of per-fold CV scores per combination;
# pred_list2 collects the matching held-out predictions in the same order
# (models outermost, strategies innermost).
result_dfs2 = []
pred_list2 = []

for name, model in models:
    for over_strat, under_strat in strat_lst2:
        print("Running ... {:<9}{:<.2}".format(name,over_strat))
        # Both samplers draw rows at random; seeding them makes the resampled
        # training sets, and therefore the scores, reproducible across runs.
        steps = [("over", RandomOverSampler(sampling_strategy=over_strat,
                                            random_state=18)),
                 ("under",RandomUnderSampler(sampling_strategy=under_strat,
                                             random_state=18)),
                 (name, model)]
        result,pred = run_pipeline(steps,
                                   Xw2v_train,yw2v_train,
                                   Xw2v_test)
        model_df = pd.DataFrame(result)
        # Tag each fold's scores so they can be grouped after concatenation.
        model_df["model"] = name
        model_df["oversample_strat"] = over_strat
        result_dfs2.append(model_df)

        pred_list2.append(pred)

Running ... logreg   0.22
Running ... logreg   0.32
Running ... logreg   0.42
Running ... logreg   0.52
Running ... logreg   0.62
Running ... logreg   0.72
Running ... logreg   0.8
Running ... rf       0.22
Running ... rf       0.32
Running ... rf       0.42
Running ... rf       0.52
Running ... rf       0.62
Running ... rf       0.72


  _warn_prf(average, modifier, msg_start, len(result))


Running ... rf       0.8


  _warn_prf(average, modifier, msg_start, len(result))


Running ... gnb      0.22
Running ... gnb      0.32
Running ... gnb      0.42
Running ... gnb      0.52
Running ... gnb      0.62
Running ... gnb      0.72
Running ... gnb      0.8


In [21]:
# Stack the per-combination word2vec CV score frames into one long table with a fresh index.
results2 = pd.concat(result_dfs2, ignore_index=True)

In [22]:
# Mean and standard deviation of every CV metric per (model, oversampling ratio).
# String aliases are used instead of np.mean / np.std: pandas deprecates NumPy
# callables in agg(), and "std" explicitly means the pandas sample standard
# deviation (ddof=1) rather than NumPy's population default.
results_grp2 = results2.groupby(["model","oversample_strat"]).agg(["mean","std"])
results_grp2

Unnamed: 0_level_0,Unnamed: 1_level_0,fit_time,fit_time,score_time,score_time,test_accuracy,test_accuracy,test_balanced_accuracy,test_balanced_accuracy,test_precision,test_precision,test_recall,test_recall,test_f1,test_f1,test_roc_auc,test_roc_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,oversample_strat,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
gnb,0.22,0.006684,0.001502,0.005805,0.00244,0.489068,0.127958,0.518489,0.026849,0.113318,0.010184,0.555714,0.145133,0.185688,0.013093,0.536148,0.045899
gnb,0.32,0.010896,0.003072,0.008961,0.004188,0.390204,0.071543,0.510562,0.025109,0.108483,0.008449,0.662857,0.080306,0.185992,0.012151,0.545694,0.047742
gnb,0.42,0.010231,0.004272,0.006271,0.001987,0.400682,0.079361,0.517044,0.041681,0.110831,0.013154,0.664286,0.092152,0.189389,0.0202,0.533354,0.051477
gnb,0.52,0.011836,0.003489,0.007639,0.002551,0.415069,0.051363,0.527604,0.048198,0.113676,0.014604,0.67,0.085303,0.194188,0.024052,0.551037,0.055866
gnb,0.62,0.01275,0.005084,0.007163,0.002667,0.444431,0.096333,0.537698,0.035075,0.118627,0.014798,0.655714,0.104426,0.199698,0.019737,0.556431,0.048026
gnb,0.72,0.013662,0.004214,0.006118,0.001581,0.40428,0.066049,0.526622,0.043564,0.113727,0.014766,0.681429,0.077386,0.194497,0.022381,0.545073,0.053974
gnb,0.8,0.012003,0.002696,0.00586,0.001809,0.429597,0.078433,0.538871,0.040723,0.118197,0.015216,0.677143,0.094329,0.200406,0.021647,0.55537,0.049118
logreg,0.22,0.02486,0.007985,0.005538,0.002836,0.849758,0.012463,0.514376,0.013278,0.147352,0.038417,0.09,0.033026,0.10997,0.033015,0.597543,0.029759
logreg,0.32,0.027563,0.006714,0.003403,0.00013,0.817859,0.014262,0.521154,0.021818,0.139992,0.038498,0.145714,0.050305,0.1418,0.04251,0.601043,0.025349
logreg,0.42,0.033913,0.011346,0.003464,0.000181,0.814111,0.011198,0.522213,0.015896,0.140492,0.026727,0.152857,0.039297,0.145789,0.03152,0.599422,0.038135


In [23]:
# Print a held-out classification report for every word2vec run.
# pred_list2 was filled with models as the outer loop and strategies as the
# inner loop, so the flat index splits into (model index, strategy index).
# The strategy count comes from strat_lst2 rather than a hard-coded 7, so the
# labels stay correct if the strategy list changes length.
n_strats2 = len(strat_lst2)
for i, y_pred in enumerate(pred_list2):
    model_idx, strat_idx = divmod(i, n_strats2)
    name = models[model_idx][0]
    oversamp = strat_lst2[strat_idx][0]

    print("\n{:<9}{:<.2}".format(name,oversamp))
    print(classification_report(yw2v_test, y_pred))


logreg   0.22
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1276
           1       0.15      0.12      0.14       155

    accuracy                           0.83      1431
   macro avg       0.53      0.52      0.52      1431
weighted avg       0.82      0.83      0.82      1431


logreg   0.32
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1276
           1       0.16      0.15      0.15       155

    accuracy                           0.82      1431
   macro avg       0.53      0.53      0.53      1431
weighted avg       0.82      0.82      0.82      1431


logreg   0.42
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      1276
           1       0.16      0.17      0.17       155

    accuracy                           0.81      1431
   macro avg       0.53      0.53      0.53      1431
weighted avg       0.82      

In [24]:
# The best combination so far appears to be:
## features: TF-IDF
## model: logistic regression
## oversampling the minority class to a ratio of 0.4-0.6