In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

In [10]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection

In [314]:
import imblearn
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

In [7]:
# making sure it's 3.9
from platform import python_version
print(python_version())

3.9.1


In [8]:
# based on previous testing, decided to use:
## tfidf
## logistic regression

In [23]:
# loading data files

# has clean review text
dataset = pd.read_json("dramainfo_revclean.json")

# tfidf
tfidf_rev = np.load("tfidfvec.npy")
tfidf_vocab = np.load("tfidfvocab.npy")

In [245]:
# make a df of the tfidf features
tfidf_df = pd.DataFrame(tfidf_rev, columns=tfidf_vocab)

In [363]:
# names of all genre columns
bat_targets = list(dataset.iloc[:,-17:-1].columns)
print(bat_targets)

['romance', 'sitcom_comedy', 'comedy', 'war_historical', 'political_drama', 'thriller', 'friendship', 'melodrama_romance', 'drama', 'action', 'historical', 'youth_school', 'fantasy_supernatural_horror', 'mystery', 'life', 'family']


In [337]:
def round_nearest(x, a):
    return np.ceil(x/a)*a

# return ratios for over and undersampling
# oversample strat between 0.4,0.6
# undersample strat 0.8 for a ~10 percent pt diff
def over_under_strat(minority,majority):
    # min and max ratios for oversampling
    minr, maxr = (0.4,0.6)
    
    # rounded up to nearest 0.05
    # keep at 2 digit decimal
    orig_ratio = np.round(round_nearest(minority/majority,0.05),2)
    
    if (orig_ratio >= minr) and (orig_ratio <= maxr):
        minr = orig_ratio
    elif orig_ratio < minr:
        pass
    else: return None
    
            
    # get the num points for 0.05 intervals
    pnum = int(np.round((maxr-minr)*10*2+1))

    overs = np.linspace(minr,maxr,pnum)
    
    # overstrat, understrat
    # 0.8 for under will result in
    # ~10 percentage point difference
    return [(num, 0.8) for num in overs]

# if ratio is <0.4, do oversample w strat 0.5
# else only downsample
def single_strat(minority,majority):
    minr, maxr = (0.4,0.6)
    
    orig_ratio = minority/majority
    
    if orig_ratio >= minr and orig_ratio <= maxr:
        return (None,0.8)
    elif orig_ratio > maxr:
        return (None,0.8)
    else: return (0.5,0.8)

In [366]:
def samp_strats(dataset=dataset, bat_targets=bat_targets):
    # dict[genre] = (over strat, under strat)
    samp_dict = dict()

    for gen in bat_targets:
        counts = Counter(dataset[gen])   
        is_frac = counts[1]/(counts[0]+counts[1])

        if is_frac >= 0.45 and is_frac <= 0.55:
            samp_dict[gen]= (None,None)

        else:
            samp_dict[gen] = single_strat(min(counts[0],counts[1]),
                                          max(counts[0],counts[1]))
            #samp_dict[gen]=over_under_strat(min(counts[0],counts[1],
            #                                max(counts[0],counts[1]))

    return samp_dict

In [367]:
samp_dict = samp_strats()
samp_dict

{'romance': (None, 0.8),
 'sitcom_comedy': (None, 0.8),
 'comedy': (None, 0.8),
 'war_historical': (0.5, 0.8),
 'political_drama': (None, None),
 'thriller': (0.5, 0.8),
 'friendship': (0.5, 0.8),
 'melodrama_romance': (None, 0.8),
 'drama': (None, None),
 'action': (0.5, 0.8),
 'historical': (0.5, 0.8),
 'youth_school': (0.5, 0.8),
 'fantasy_supernatural_horror': (0.5, 0.8),
 'mystery': (0.5, 0.8),
 'life': (0.5, 0.8),
 'family': (0.5, 0.8)}

In [368]:
gen = bat_targets[0]

ov_strat,un_strat = samp_dict[gen]

X_train, X_test, y_train, y_test = train_test_split(tfidf_rev,
                                                    dataset[gen],
                                                    test_size=0.3,
                                                    random_state=18)

# resampling
if ov_strat != None and un_strat != None:
    over = RandomOverSampler(sampling_strategy=ov_strat)
    under = RandomUnderSampler(sampling_strategy=un_strat)
    X_train,y_train = over.fit_resample(X_train,y_train)
    X_train,y_train = under.fit_resample(X_train,y_train)
elif ov_strat == None and un_strat != None:
    under = RandomUnderSampler(sampling_strategy=un_strat)
    X_train,y_train = under.fit_resample(X_train,y_train)
else: pass

In [362]:
# pipeline, grid search for best parameters

In [357]:
pipeline = Pipeline([("logreg", LogisticRegression())])

params_grid = [{"logreg__penalty": ["l1"],
                "logreg__C": np.logspace(-2, 2, 5),
                "logreg__solver": ["lbfgs"]
               },
               {"logreg__penalty": ["l1","l2"],
                "logreg__C": np.logspace(-2, 2, 5),
                "logreg__solver": ["saga"],
                "logreg__random_state": [1]
               }]

In [358]:
clf = model_selection.GridSearchCV(pipeline, param_grid=params_grid,
                                   cv=5, verbose=10, n_jobs=-1)

In [359]:
clf.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


25 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, s

GridSearchCV(cv=5, estimator=Pipeline(steps=[('logreg', LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'logreg__C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'logreg__penalty': ['l1'],
                          'logreg__solver': ['lbfgs']},
                         {'logreg__C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'logreg__penalty': ['l1', 'l2'],
                          'logreg__random_state': [1],
                          'logreg__solver': ['saga']}],
             verbose=10)

[CV 1/5; 1/15] START logreg__C=0.01, logreg__penalty=l1, logreg__solver=lbfgs...
[CV 1/5; 1/15] END logreg__C=0.01, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   3.2s
[CV 5/5; 3/15] START logreg__C=1.0, logreg__penalty=l1, logreg__solver=lbfgs....
[CV 5/5; 3/15] END logreg__C=1.0, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   1.4s
[CV 3/5; 5/15] START logreg__C=100.0, logreg__penalty=l1, logreg__solver=lbfgs..
[CV 3/5; 5/15] END logreg__C=100.0, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   1.3s
[CV 1/5; 7/15] START logreg__C=0.01, logreg__penalty=l2, logreg__random_state=1, logreg__solver=saga
[CV 1/5; 7/15] END logreg__C=0.01, logreg__penalty=l2, logreg__random_state=1, logreg__solver=saga;, score=0.554 total time=  31.7s
[CV 4/5; 8/15] START logreg__C=0.1, logreg__penalty=l1, logreg__random_state=1, logreg__solver=saga
[CV 4/5; 8/15] END logreg__C=0.1, logreg__penalty=l1, logreg__random_state=1, logreg__solver=saga;, sco

[CV 2/5; 2/15] START logreg__C=0.1, logreg__penalty=l1, logreg__solver=lbfgs....
[CV 2/5; 2/15] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   2.6s
[CV 5/5; 2/15] START logreg__C=0.1, logreg__penalty=l1, logreg__solver=lbfgs....
[CV 5/5; 2/15] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   1.4s
[CV 2/5; 4/15] START logreg__C=10.0, logreg__penalty=l1, logreg__solver=lbfgs...
[CV 2/5; 4/15] END logreg__C=10.0, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   1.3s
[CV 5/5; 5/15] START logreg__C=100.0, logreg__penalty=l1, logreg__solver=lbfgs..
[CV 5/5; 5/15] END logreg__C=100.0, logreg__penalty=l1, logreg__solver=lbfgs;, score=nan total time=   1.3s
[CV 3/5; 7/15] START logreg__C=0.01, logreg__penalty=l2, logreg__random_state=1, logreg__solver=saga
[CV 3/5; 7/15] END logreg__C=0.01, logreg__penalty=l2, logreg__random_state=1, logreg__solver=saga;, score=0.556 total time=  31.2s
[CV 1/5; 9/15] S

In [365]:
clf.best_params_

{'logreg__C': 10.0,
 'logreg__penalty': 'l2',
 'logreg__random_state': 1,
 'logreg__solver': 'saga'}

In [361]:
clf.best_score_

0.8308816434655227