In [3]:
import random
import os
from tqdm import tqdm
from typing import *
from collections import defaultdict
from math import log2
import string
from nltk.stem import PorterStemmer
import numpy as np

from sklearn.linear_model import LogisticRegression

from predeal_dataset import *

DICTIONARY_SIZE = 1000
BIGRAM_DICT_SIZE = 1200
TRIGRAM_DICT_SIZE = 1400

puncts = string.punctuation

stemmer = PorterStemmer()

tokenizer_cache = dict()
def tokenize(sentence:str,use_stemmer:bool=True,cache=tokenizer_cache)->List[str]:
    sentence = sentence.replace("(","( ").replace("[","[ ").replace("{","{ ")
    if cache is not None:
        if (sentence,use_stemmer) in cache:
            return cache[(sentence,use_stemmer)]
    res = list()
    tmp = sentence.split()
    for word in tmp:
        if len(word)==0:
            continue
        if word[-1] in puncts:
            p = word[-1]
            word = word[:-1]
            if len(word)>0:
                word = word.lower()
                if use_stemmer:
                    word = stemmer.stem(word)
                res.append(word)
            res.append(p)
        else:
            word = word.lower()
            if use_stemmer:
                word = stemmer.stem(word)
            res.append(word)
    if cache is not None:
        cache[(sentence,use_stemmer)] = res
    return res


tokenize("I love eatting bananas!")

['i', 'love', 'eat', 'banana', '!']

In [22]:
def log_text_to_file(text:str)->None:
    with open("./output/log.txt","a") as fout:
        fout.write(text+"\n")

In [2]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
spoiler_dataset = sample_sub_spoiler_set(SUBSET_SENTENCE_CNT)

In [20]:
word_cnt = defaultdict(int)
bigram_word_cnt = defaultdict(int)
trigram_word_cnt = defaultdict(int)

for datum in tqdm(spoiler_dataset):
    sentence = datum['review_sentence']
    words = tokenize(sentence)
    for word in words:
        word_cnt[word]+=1
    if len(words)>=2:
        for (word1,word2) in zip(words[:-1],words[1:]):
            bigram_word_cnt[(word1,word2)]+=1
    if len(words)>=3:
        for (word1,word2,word3) in zip(words[:-2],words[1:-1],words[2:]):
            trigram_word_cnt[(word1,word2,word3)]+=1

100%|██████████| 50000/50000 [01:53<00:00, 440.62it/s]


In [7]:
word_with_freq = list(word_cnt.items())
word_with_freq.sort(key=lambda tup:tup[1],reverse=True)

In [8]:
dictionary = word_with_freq[:DICTIONARY_SIZE]
dictionary = list(map(lambda tup:tup[0],dictionary))
word2id = {word:i for i,word in enumerate(dictionary)}

In [36]:
def len_only_feature(datum):
    return [len(datum['review_sentence'])]

In [9]:
def features(datum):
    res = [0]*DICTIONARY_SIZE
    words = tokenize(datum['review_sentence'])
    for word in words:
        if not word in dictionary:
            continue
        res[word2id[word]]+=1
    res.append(len(words)+1)
    return res

In [32]:
bigram_word_with_freq = list()
bigram_word_with_freq.extend(word_cnt.items())
bigram_word_with_freq.extend(bigram_word_cnt.items())
bigram_word_with_freq.sort(key=lambda tup:tup[1],reverse=True)
bigram_dictionary = bigram_word_with_freq[:BIGRAM_DICT_SIZE]
bigram_word2id = {word:i for i,word in enumerate(bigram_dictionary)}
def bigram_features(datum):
    res = [0]*DICTIONARY_SIZE
    words = tokenize(datum['review_sentence'])
    for word in words:
        if not word in dictionary:
            continue
        res[word2id[word]]+=1
    if len(words)>=2:
        for bi_word in zip(words[:-1],words[1:]):
            if not bi_word in dictionary:
                continue
            res[word2id[bi_word]]+=1
    res.append(len(words)+1)
    return res

In [34]:
trigram_word_with_freq = list()
trigram_word_with_freq.extend(word_cnt.items())
trigram_word_with_freq.extend(bigram_word_cnt.items())
trigram_word_with_freq.extend(trigram_word_cnt.items())
trigram_word_with_freq.sort(key=lambda tup:tup[1],reverse=True)
trigram_dictionary = trigram_word_with_freq[:TRIGRAM_DICT_SIZE]
trigram_word2id = {word:i for i,word in enumerate(trigram_dictionary)}
def trigram_features(datum):
    res = [0]*DICTIONARY_SIZE
    words = tokenize(datum['review_sentence'])
    for word in words:
        if not word in dictionary:
            continue
        res[word2id[word]]+=1
    if len(words)>=2:
        for bi_word in zip(words[:-1],words[1:]):
            if not bi_word in dictionary:
                continue
            res[word2id[bi_word]]+=1
    if len(words)>=3:
        for tri_word in zip(words[:-2],words[1:-1],words[2:]):
            if not tri_word in dictionary:
                continue
            res[word2id[tri_word]]+=1
    res.append(len(words)+1)
    return res

In [13]:
def getXsAndYs(spoiler_dataset:List[dict],feature_func)->Tuple[np.ndarray,np.ndarray]:
    resX = list()
    resY = list()
    for datum in tqdm(spoiler_dataset):
        resX.append(feature_func(datum))
        resY.append(datum['label'])
    return np.array(resX,dtype=float),np.array(resY)

In [11]:
random.seed(42)
random.shuffle(spoiler_dataset)
trainset,validset,testset = spoiler_dataset[:TRAIN_SET_CNT],spoiler_dataset[TRAIN_SET_CNT:-TEST_SET_CNT],spoiler_dataset[-TEST_SET_CNT:]

In [14]:
trainX,trainY = getXsAndYs(trainset,features)
validX,validY = getXsAndYs(validset,features)
testX,testY = getXsAndYs(testset,features)

100%|██████████| 40000/40000 [01:37<00:00, 411.14it/s]
100%|██████████| 5000/5000 [00:12<00:00, 411.26it/s]
100%|██████████| 5000/5000 [00:12<00:00, 388.41it/s]


In [15]:
class_weights = dict(zip(np.unique(trainY), len(trainY) / (len(np.unique(trainY)) * np.bincount(trainY))))
model = LogisticRegression(penalty='l2', C=1.0, class_weight=class_weights)
model.fit(trainX,trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
def get_best_ths_with_ber(pred_prop_with_label): 
    pred_prop_with_label.sort(reverse=True)
    valid_set_pos_cnt = sum(tup[1] for tup in pred_prop_with_label)
    valid_set_neg_cnt = len(pred_prop_with_label)-valid_set_pos_cnt
    best_ths = 1.0
    best_ber = 0.5
    curr_false_positive = 0
    curr_false_negative = valid_set_pos_cnt
    for (prob,label) in pred_prop_with_label:
        ths = prob-0.00001
        if label==1:
            curr_false_negative-=1
        else:
            curr_false_positive+=1
        ber = 0.5*(curr_false_negative/valid_set_pos_cnt+curr_false_positive/valid_set_neg_cnt)
        if ber<best_ber:
            best_ber = ber
            best_ths = ths
    return best_ths,best_ber


(0.3703233614800921, 0.2989233643714776)

In [17]:
validYPred = model.predict(validX)

In [18]:
def get_performance_info(y_actual,y_predict):
    y_actual = np.array(y_actual)
    y_predict = np.array(y_predict)
    y_actual = y_actual.reshape((-1,))
    y_predict = y_predict.reshape((-1,))
    TP = np.sum((y_actual == 1) & (y_predict == 1))
    FP = np.sum((y_actual == 0) & (y_predict == 1))
    TN = np.sum((y_actual == 0) & (y_predict == 0))
    FN = np.sum((y_actual == 1) & (y_predict == 0))
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (TN + FP)
    FNR = FN / (TP + FN)
    BER = 1 - (0.5 * (TPR + TNR))
    accu = np.sum(y_actual==y_predict)/len(y_actual)
    return accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER

In [19]:
get_performance_info(validY,validYPred)

(0.7742,
 204,
 997,
 3667,
 132,
 0.6071428571428571,
 0.21376500857632932,
 0.7862349914236707,
 0.39285714285714285,
 0.3033110757167361)

In [29]:
def pipeline(feature_func,description:str):
    def getXsAndYs(spoiler_dataset:List[dict])->Tuple[np.ndarray,np.ndarray]:
        resX = list()
        resY = list()
        for datum in tqdm(spoiler_dataset):
            resX.append(feature_func(datum))
            resY.append(datum['label'])
        return np.array(resX,dtype=float),np.array(resY)
    random.seed(42)
    random.shuffle(spoiler_dataset)
    print("start processing dataset")
    trainset,validset,testset = spoiler_dataset[:TRAIN_SET_CNT],spoiler_dataset[TRAIN_SET_CNT:-TEST_SET_CNT],spoiler_dataset[-TEST_SET_CNT:]
    trainX,trainY = getXsAndYs(trainset)
    validX,validY = getXsAndYs(validset)
    testX,testY = getXsAndYs(testset)
    print("process dataset finished")
    best_model,ths,best_ber,best_c = None,0.5,0.5,0.0
    for c in tqdm([0.1,0.15,0.2,0.25,0.35,0.5,0.7,1.0,1.4,2.0,2.8,4.0]):
        class_weights = dict(zip(np.unique(trainY), len(trainY) / (len(np.unique(trainY)) * np.bincount(trainY))))
        model = LogisticRegression(penalty='l2', C=c, class_weight=class_weights)
        model.fit(trainX,trainY)
        best_ths,ber = get_best_ths_with_ber(list(zip(map(lambda tup:tup[1],model.predict_proba(validX)),validY)))
        if ber<best_ber:
            best_ber = ber
            best_model=model
            ths = best_ths
            best_c = c
    prob_testset = best_model.predict_proba(testX)
    pred_testset = list(int(prob[1]>ths) for prob in prob_testset)
    accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER = get_performance_info(testY,pred_testset)
    msg = "%s , best_l2:%f\n    %7s%7s%7s%7s%7s%7s%7s\n     %.4f %.4f %.4f %.4f %.4f %.4f %.4f "%(description,best_c,"accu","ber","tpr","fpr","tnr","fnr","ths",accu,BER,TPR,FPR,TNR,FNR,ths)
    print(msg)
    log_text_to_file(msg)

In [30]:
pipeline(features,"1-gram bow model")

start processing dataset


  0%|          | 0/40000 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [01:37<00:00, 409.34it/s]
100%|██████████| 5000/5000 [00:12<00:00, 412.95it/s]
100%|██████████| 5000/5000 [00:12<00:00, 407.52it/s]


process dataset finished


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

1-gram bow model , best_l2:2.000000
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.6966 0.2845 0.7370 0.3061 0.6939 0.2630 0.4173 





In [33]:
pipeline(bigram_features,"bi-gram bow model")

start processing dataset


100%|██████████| 40000/40000 [03:08<00:00, 212.65it/s]
100%|██████████| 5000/5000 [00:23<00:00, 213.02it/s]
100%|██████████| 5000/5000 [00:23<00:00, 209.00it/s]


process dataset finished


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

bi-gram bow model , best_l2:0.700000
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.7818 0.2976 0.6111 0.2064 0.7936 0.3889 0.5093 





In [35]:
pipeline(trigram_features,"tri-gram bow model")

start processing dataset


100%|██████████| 40000/40000 [04:39<00:00, 143.28it/s]
100%|██████████| 5000/5000 [00:34<00:00, 143.70it/s]
100%|██████████| 5000/5000 [00:34<00:00, 145.25it/s]


process dataset finished


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

tri-gram bow model , best_l2:4.000000
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.7258 0.2874 0.6973 0.2721 0.7279 0.3027 0.4361 





In [37]:
pipeline(len_only_feature,"length only baseline")

start processing dataset


100%|██████████| 40000/40000 [00:00<00:00, 231831.97it/s]
100%|██████████| 5000/5000 [00:00<00:00, 1002558.56it/s]
100%|██████████| 5000/5000 [00:00<00:00, 1253602.73it/s]


process dataset finished


100%|██████████| 8/8 [00:00<00:00, 17.44it/s]

length only baseline , best_l2:0.350000
       accu    ber    tpr    fpr    tnr    fnr    ths
     0.5350 0.3628 0.7540 0.4796 0.5204 0.2460 0.4227 



