# Models: Bag of Words - Evaluation

In [None]:
import numpy as np
import os
import pandas as pd
import re
import time

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neural_network import MLPClassifier

from pprint import pprint

## Set input data folder

In [None]:
# Folder with the input CSV files read by the cells below.
data_in = '../data/03_preprocessed_01_non_letters_to_empty_space'

# Show the folder contents in sorted order as a quick sanity check.
input_files = os.listdir(data_in)
input_files.sort()
pprint(input_files)

In [None]:
# Folder for everything this notebook writes; later cells also resolve the
# preprocessing cache relative to it (`{data_out}/../data.csv`).
data_out = '../data/04_models_bag_of_words'
# Optional sanity check, left disabled (presumably because the folder may not
# exist yet on a first run -- confirm before re-enabling):
# pprint(sorted(os.listdir(data_out)))

## Read data into data frames

In [None]:
# Load the clickbait headlines: a header-less CSV whose single column is
# named `_headlines` here.
clck_path = f'{data_in}/clck.csv'
clck = pd.read_csv(clck_path, header=None)
clck.columns = ['_headlines']

# Every row of this file is a positive example -> label column is all True.
clck_labels = np.ones(len(clck), dtype=bool)
clck['clickbait'] = pd.Series(clck_labels)

# Preview every 5000th row.
clck[::5000]

In [None]:
# Load the regular news headlines: a header-less CSV whose single column is
# named `_headlines` here.
news_path = f'{data_in}/news.csv'
news = pd.read_csv(news_path, header=None)
news.columns = ['_headlines']

# Every row of this file is a negative example -> label column is all False.
news_labels = np.zeros(len(news), dtype=bool)
news['clickbait'] = pd.Series(news_labels)

# Preview every 5000th row.
news[::5000]

### Merge data frames

In [None]:
# Stack clickbait rows on top of news rows and renumber the rows 0..n-1.
frames = [clck, news]
df = pd.concat(frames, ignore_index=True)

# Preview every 5000th row of the merged frame.
df[::5000]

## Preprocessing

In [None]:
# Build (or reload) one preprocessed headline column per combination of the
# optional steps: regex clean-up (_re), stop-word removal (_sw),
# lemmatization (_lm) and stemming (_sm).

# The cache path below is resolved *through* `data_out` (`{data_out}/..`), so
# the folder has to exist: otherwise an existing cache is not found and the
# final `to_csv` fails only after all the preprocessing work has been done.
os.makedirs(data_out, exist_ok=True)

if os.path.exists(f'{data_out}/../data.csv'):
    # Reuse the cached result of a previous run.
    df = pd.read_csv(f'{data_out}/../data.csv', index_col='Unnamed: 0')
    # NOTE(review): presumably headlines that became empty are read back as
    # NaN, hence the placeholder. A fresh run keeps them as '' instead, so the
    # first run and cached runs may differ slightly -- confirm if it matters.
    df = df.fillna('<nan>')

else:
    # Compiled once; raw string avoids the invalid-escape warning of '\s+'.
    whitespace = re.compile(r'\s+')

    for regex in [None]:
        for stop_words in [None, set(stopwords.words('english'))]:
            for lemmatizer in [None, WordNetLemmatizer()]:
                for stemmer in [None, PorterStemmer()]:

                    # Column suffix naming the active preprocessing steps.
                    settings  = ''
                    settings += '' if regex is None else '_re'
                    settings += '' if stop_words is None else '_sw'
                    settings += '' if lemmatizer is None else '_lm'
                    settings += '' if stemmer is None else '_sm'

                    print(f'headlines{settings:20s} ...', end=' ')

                    t = time.time()
                    headlines = list()

                    # Iterate the values directly instead of looking each row
                    # up by label inside a range(len(df)) loop.
                    for text in df['_headlines']:
                        text = text.lower().strip()

                        # replace unnecessary patterns -- this must happen on
                        # the string, i.e. before it is split into a word list
                        # (regex.sub on a list raises TypeError)
                        if regex is not None:
                            text = regex.sub(' ', text).strip()

                        # split the headline into words
                        text = whitespace.split(text)

                        # remove stop-words
                        if stop_words is not None:
                            text = [w for w in text if w not in stop_words]

                        # lemmatization
                        if lemmatizer is not None:
                            text = [lemmatizer.lemmatize(w) for w in text]

                        # stemming
                        if stemmer is not None:
                            text = [stemmer.stem(w) for w in text]

                        headlines.append(' '.join(text))

                    # Headlines were collected in row order, so attach them
                    # using the frame's own index.
                    df[f'headlines{settings}'] = pd.Series(headlines,
                                                           index=df.index)

                    print(f'Done in {int(np.ceil(time.time() - t))} seconds!')

    # Cache the result so later runs can skip the preprocessing.
    df.to_csv(f'{data_out}/../data.csv')

## 10-fold cross validation

In [None]:
# Shared splitter for every experiment below: 10 shuffled folds with a fixed
# random_state, so that repeated kf.split(df) calls produce the same folds.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=24101989,
)

## Classification

### Functions

In [None]:
def init_dfs():
    """Create the empty result frames for one training session.

    Reads the module-level ``kf`` (cross-validation splitter) and ``df``
    (headline data frame).

    Returns
    -------
    acc_df : pd.DataFrame
        No columns yet; indexed by fold number 1..``kf.n_splits``.
    out_df : pd.DataFrame
        Indexed like ``df``; its column ``'i'`` holds, for every row, the
        number of the fold in which that row is part of the test set.
    tim_df : pd.DataFrame
        No columns yet; indexed by fold number 1..``kf.n_splits``.
    """
    fold_numbers = list(range(1, kf.n_splits + 1))

    acc_df = pd.DataFrame(index=fold_numbers)
    out_df = pd.DataFrame(columns=['i'], index=df.index)
    tim_df = pd.DataFrame(index=fold_numbers)

    fold_col = out_df.columns.get_loc('i')
    for fold, (_, test_index) in enumerate(kf.split(df), start=1):
        # kf.split yields *positional* row numbers, so write through .iloc in
        # a single indexing step. The former chained form
        # `out_df['i'][test_index] = i` assigns via an intermediate object
        # (SettingWithCopy; no effect under pandas copy-on-write) and treats
        # the positions as index labels.
        out_df.iloc[test_index, fold_col] = fold

    return acc_df, out_df, tim_df

In [None]:
def init_series():
    """Return NaN-filled result series for a single classifier run.

    Uses the module-level ``kf``, ``df``, ``acc_df``, ``out_df`` and
    ``tim_df``.

    Returns
    -------
    acc_s, out_s, tim_s : pd.Series
        ``acc_s`` and ``tim_s`` have ``kf.n_splits`` entries and share the
        index of ``acc_df`` / ``tim_df``; ``out_s`` has ``len(df)`` entries
        and shares the index of ``out_df``.
    """
    def _nan_series(length, index):
        # One NaN placeholder per expected result.
        return pd.Series(data=[np.nan] * length, index=index)

    n_folds = kf.n_splits

    acc_s = _nan_series(n_folds, acc_df.index)
    out_s = _nan_series(len(df), out_df.index)
    tim_s = _nan_series(n_folds, tim_df.index)

    return acc_s, out_s, tim_s

In [None]:
def fit_and_predict(cl, cl_name):
    """Cross-validate ``cl`` on bag-of-words features of one headline column.

    Besides its arguments the function reads these module-level names:
    ``kf`` (splitter), ``df`` (data), ``setting`` (suffix selecting the
    ``headlines{setting}`` column) and ``ft`` (passed to ``CountVectorizer``
    as ``max_features``); ``init_series`` additionally reads ``acc_df``,
    ``out_df`` and ``tim_df``.

    Parameters
    ----------
    cl : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. The same object is fitted again in every fold.
    cl_name : str
        Label used only in the per-fold progress line.

    Returns
    -------
    acc_s : pd.Series
        Accuracy per fold (fraction of correct predictions), index 1..n_splits.
    out_s : pd.Series
        Prediction for every row of ``df``, made in the fold where the row
        belongs to the test set.
    tim_s : pd.Series
        Seconds per fold (vectorizing + fitting + predicting), rounded to
        3 decimals.
    """
    acc_s, out_s, tim_s = init_series()

    for i, (train_index, test_index) in enumerate(kf.split(df), start=1):

        # initialize count vectorizer
        vectorizer = CountVectorizer(analyzer='word',
                                     max_features=ft)

        t = time.time()

        # kf.split yields positional row numbers -> select with .iloc.
        # Plain `series[indices]` is label based and only agrees with this
        # when the index happens to be exactly 0..n-1.
        train_x = df[f'headlines{setting}'].iloc[train_index]
        train_y = df['clickbait'].iloc[train_index]
        test_x  = df[f'headlines{setting}'].iloc[test_index]
        test_y  = df['clickbait'].iloc[test_index]

        # The vocabulary is learned from the training fold only and then
        # applied unchanged to the test fold.
        train_ft = vectorizer.fit_transform(train_x)
        train_ft = train_ft.toarray()

        test_ft = vectorizer.transform(test_x)
        test_ft = test_ft.toarray()

        # fit & predict
        cl = cl.fit(train_ft, train_y)
        results = pd.Series(cl.predict(test_ft),
                            index=test_y.index)

        # save results of the i-th split
        acc_s[i]            = np.sum(results == test_y) / len(test_y)
        out_s[test_y.index] = results
        tim_s[i]            = np.around(time.time() - t, decimals=3)
        print(f'{cl_name:30s} [{i:>2d}] | {(acc_s[i] * 100):>6.3f} % | {int(np.ceil(tim_s[i])):>4d} s')

    return acc_s, out_s, tim_s

In [None]:
def run(cl, cl_name):
    """Evaluate ``cl`` and file its results under the column ``cl_name``.

    Calls ``fit_and_predict`` and stores the returned accuracy, prediction
    and timing series as column ``cl_name`` of the module-level ``acc_df``,
    ``out_df`` and ``tim_df`` respectively, then prints an empty line.
    Returns ``None``.
    """
    fold_acc, fold_out, fold_tim = fit_and_predict(cl, cl_name)

    collected = (
        (acc_df, fold_acc),
        (out_df, fold_out),
        (tim_df, fold_tim),
    )
    for frame, series in collected:
        frame[cl_name] = series

    # Blank line separates the progress output of consecutive classifiers.
    print()

### Fit & Predict

#### Initial Training

In [None]:
# settings of interest
soi = [
       'headlines',
       'headlines_sw',
       'headlines_lm',
       'headlines_sm',
       'headlines_sw_lm',
       'headlines_sw_sm',
       'headlines_lm_sm',
       'headlines_sw_lm_sm',
      ]

acc_df, out_df, tim_df = init_dfs()

now = ''.join([f'{item:02d}' for item in time.localtime()[:6]])

for s in soi:
    setting = s[9:]
    
    for ft in [1000, 2500, 5000]:
        cycle_time = time.time()

        # Extra Trees
        for est in [100]:
            run(ExtraTreesClassifier(n_estimators=est, n_jobs=-1,
                                     verbose=True, bootstrap=True),
                f'{ft}f{setting}_ET_{est}est')

        # Logistic Regression
        for c in [1.00]:
            run(LogisticRegression(C=c, max_iter=1000,
                                   n_jobs=-1, verbose=True),
                f'{ft}f{setting}_LogReg_{c:.2f}C')

        # Naïve Bayes
        for alpha in [1.00]:
            run(BernoulliNB(alpha=alpha), 
                f'{ft}f{setting}_NB_{alpha:.2f}alpha')
            
        # Neural Network
        for max_iter in [1000]:
            run(MLPClassifier(max_iter=max_iter, verbose=True),
                f'{ft}f{setting}_NN_{max_iter}iter')
            
        # Random Forests
        for est in [100]:
            run(RandomForestClassifier(n_estimators=est, n_jobs=-1,
                                       verbose=True, bootstrap=True),
                f'{ft}f{setting}_RF_{est}e')
            
        print(f'\n========== {ft}f{setting} cycle finished in {int(np.ceil(time.time() - cycle_time))} seconds. ==========\n\n\n')

acc_df = acc_df.transpose()
tim_df = tim_df.transpose()

acc_df.to_csv(f'{data_out}/acc/{now}.csv')
out_df.to_csv(f'{data_out}/out/{now}.csv')
tim_df.to_csv(f'{data_out}/tim/{now}.csv')

#### Additional Training

In [None]:
# settings of interest
soi = [
       'headlines',
       'headlines_lm',
       'headlines_sm',
       'headlines_lm_sm',
      ]

acc_df, out_df, tim_df = init_dfs()

now = ''.join([f'{item:02d}' for item in time.localtime()[:6]])

for s in soi:
    setting = s[9:]
    
    for ft in [5000]:
        cycle_time = time.time()

        # Logistic Regression
        for c in [0.0001, 0.25, 0.50, 0.75, 1.00, 1.25, 1.50]:
            run(LogisticRegression(C=c, max_iter=1000,
                                   n_jobs=-1, verbose=True),
                f'{ft}f{setting}_LogReg_{c:.2f}C')

        # Naïve Bayes
        for alpha in [0.00, 0.25, 0.50, 0.75, 1.00, 1.25, 1.50]:
            run(BernoulliNB(alpha=alpha), 
                f'{ft}f{setting}_NB_{alpha:.2f}alpha')
            
        print(f'\n========== {ft}f{setting} cycle finished in {int(np.ceil(time.time() - cycle_time))} seconds. ==========\n\n\n')

acc_df = acc_df.transpose()
tim_df = tim_df.transpose()

acc_df.to_csv(f'{data_out}/acc/{now}.csv')
out_df.to_csv(f'{data_out}/out/{now}.csv')
tim_df.to_csv(f'{data_out}/tim/{now}.csv')