In [66]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

import transformers as ppb # pytorch-pretrained-bert
import torch

import pipeline as pp
import models as ml

import importlib
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\madsv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Row cap for this notebook: passed as `nrows` both when reading the prepared
# CSV and when running the cleaning pipeline.
BATCH_SIZE = 10_000

In [87]:
# Load at most BATCH_SIZE rows of the prepared input-column file, keeping every
# column as a string.
# NOTE(review): this file is written by Create_input_cols further down, so on a
# fresh checkout this cell only works after that cell has been run once.
data = pd.read_csv(
    "../datasets/big/cleaned_input_cols.csv",
    dtype=str,
    nrows=BATCH_SIZE,
)

In [85]:
importlib.reload(pp)

def Clean_data(file, new_file):
    """Run the text-cleaning pipeline over `file` and write the result to `new_file`.

    The steps are applied in order through `pp.apply_pipeline`:
    binary labels from 'type', then clean/tokenize/stopword-removal/stem/combine
    for 'content' and 'title', plus author and domain clean-up.

    Args:
        file: Path of the raw CSV handed to `pp.apply_pipeline`.
        new_file: Output path handed to `pp.apply_pipeline` as `new_file`.

    Returns:
        Whatever `pp.apply_pipeline` returns (its implementation is not visible
        in this notebook). Previously the result was discarded, so the
        assignment at the call site always received None.
    """
    stopwords_lst = stopwords.words('english')
    steps = [
        (pp.Binary_labels(), 'type', 'type_binary'),
        (pp.Clean_data(), 'content'),
        (pp.Tokenizer(), "content"),
        (pp.Remove_stopwords(stopwords_lst), "content"),
        (pp.Stem(), "content"),
        (pp.Combine_Content(), "content", "content_combined"),
        (pp.Clean_author(), "authors"),
        (pp.Clean_data(), 'title'),
        (pp.Tokenizer(), "title"),
        (pp.Remove_stopwords(stopwords_lst), "title"),
        (pp.Stem(), "title"),
        (pp.Combine_Content(), "title"),
        (pp.Clean_domain(), 'domain')
    ]
    # Only the first BATCH_SIZE rows are processed, matching the rest of the notebook.
    return pp.apply_pipeline(
        file,
        steps,
        new_file=new_file,
        progress_bar=True,
        nrows=BATCH_SIZE
    )

cleaned_data = Clean_data("../datasets/big/dataset.csv", "../datasets/big/cleaned.csv")

100%|██████████| 10000/10000 [00:00<00:00, 598946.71it/s]
100%|██████████| 10000/10000 [00:23<00:00, 433.40it/s]
100%|██████████| 10000/10000 [00:00<00:00, 23047.08it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1136.04it/s]
100%|██████████| 10000/10000 [01:05<00:00, 152.99it/s]
100%|██████████| 10000/10000 [00:00<00:00, 88913.05it/s]
100%|██████████| 10000/10000 [00:00<00:00, 156229.31it/s]
100%|██████████| 10000/10000 [00:00<00:00, 17453.70it/s]
100%|██████████| 10000/10000 [00:00<00:00, 244076.26it/s]
100%|██████████| 10000/10000 [00:00<00:00, 54293.23it/s]
100%|██████████| 10000/10000 [00:01<00:00, 6477.39it/s]
100%|██████████| 10000/10000 [00:00<00:00, 175660.73it/s]
100%|██████████| 10000/10000 [00:00<00:00, 156271.22it/s]


finish time: 102.85886311531067


In [56]:
  
def vectorize_content(data, col="content", new_col="count_vectorized"):
    """Add a column holding one TF-IDF row vector per document of `data[col]`.

    Unigram and bigram counts are computed with `CountVectorizer` and then
    re-weighted with `TfidfTransformer(smooth_idf=False)`. Both are fitted on
    the same rows they transform, so the vocabulary and IDF weights come from
    `data` itself.

    Args:
        data: DataFrame to read from; it is modified in place.
        col: Name of the text column to vectorize.
        new_col: Name of the column that receives the per-row vectors.

    Returns:
        None. The elapsed time is printed.
    """
    # TODO: read up on tf-idf (term frequency-inverse document frequency) for the report
    start_time = time()
    count_vectorizer = CountVectorizer(ngram_range=(1, 2))
    tf_idf_transformer = TfidfTransformer(smooth_idf=False)

    # fit_transform gives the same matrix as fit() followed by transform() on
    # the same input, but walks the corpus only once.
    documents = data[col].values
    term_counts = count_vectorizer.fit_transform(documents)
    tf_idf = tf_idf_transformer.fit_transform(term_counts)

    # Iterating the transformed matrix yields one row per document; each row is
    # stored as a single cell of the new column.
    data[new_col] = [row for row in tf_idf]

    end_time = time()
    print("Time elapsed for TF IDF transform: ,", end_time - start_time)

In [86]:
importlib.reload(pp)
def Create_input_cols(file, new_file):
    """Build the combined text columns, vectorize each, and save the frame.

    Reads `file` with every column as a string, joins 'content_combined' with
    authors / title / domain (and all three) via `pp.Join_str_columns`, adds a
    '<column>_vectorized' column for each text variant using
    `vectorize_content`, and writes the result to `new_file`.

    Args:
        file: Path of the cleaned CSV to read.
        new_file: Path of the CSV to write.

    Returns:
        The DataFrame that was written. Previously nothing was returned, so the
        assignment at the call site always received None.
    """
    cleaned_data_combined = pp.apply_pipeline_pd_tqdm(pd.read_csv(file, dtype=str), [
            (pp.Join_str_columns(["content_combined", "authors"]), None, "content_authors"),
            (pp.Join_str_columns(["content_combined", "title"]), None, "content_title"),
            (pp.Join_str_columns(["content_combined", "domain"]), None, "content_domain"),
            (pp.Join_str_columns(["content_combined", "domain", "authors", "title"]), None, "content_domain_authors_title")
        ])

    # Same columns, same order as before; each gets its own freshly fitted vectorizer.
    text_columns = [
        "content_combined",
        "content_authors",
        "content_title",
        "content_domain",
        "content_domain_authors_title",
    ]
    for text_col in text_columns:
        vectorize_content(cleaned_data_combined, col=text_col, new_col=f"{text_col}_vectorized")

    # NOTE(review): the *_vectorized cells hold sparse row objects, which to_csv
    # writes as text. Use the returned frame (not the CSV) if the vectors
    # themselves are needed downstream - confirm.
    cleaned_data_combined.to_csv(new_file, index=False)
    return cleaned_data_combined

cleaned_data = Create_input_cols("../datasets/big/cleaned.csv", "../datasets/big/cleaned_input_cols.csv")

100%|██████████| 10000/10000 [00:00<00:00, 28094.52it/s]
100%|██████████| 10000/10000 [00:00<00:00, 28761.42it/s]
100%|██████████| 10000/10000 [00:00<00:00, 27845.35it/s]
100%|██████████| 10000/10000 [00:00<00:00, 14129.00it/s]


Time elapsed for TF IDF transform: , 20.43290376663208
Time elapsed for TF IDF transform: , 18.50907826423645
Time elapsed for TF IDF transform: , 19.508578538894653
Time elapsed for TF IDF transform: , 18.717485904693604
Time elapsed for TF IDF transform: , 19.075685739517212


In [90]:
def split_csr_data(data, col="content", get_val=True, label_col="type"):
    """Split a frame into train / (validation) / test feature matrices and labels.

    Rows are selected by the "set" column: 0 is train, 1 is validation and 2 is
    test. The per-row sparse vectors stored in `col` are stacked vertically
    into one sparse matrix per split, and the labels are cast to int.

    Args:
        data: DataFrame with a "set" column, a label column and `col`.
        col: Column whose cells are per-row sparse vectors.
        get_val: When False the validation split is neither built nor returned,
            so a frame without validation rows can still be split.
        label_col: Column holding the labels; values must be castable to int.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test) when `get_val` is
        True, otherwise (X_train, X_test, y_train, y_test).

    Raises:
        ValueError: from `vstack` if a requested split contains no rows.
    """
    def _features_and_labels(set_id):
        # One split: stacked sparse features plus integer labels.
        part = data[data["set"] == set_id]
        return vstack(part[col].tolist()), part[label_col].astype(int)

    X_train, y_train = _features_and_labels(0)
    X_test, y_test = _features_and_labels(2)
    if not get_val:
        return X_train, X_test, y_train, y_test
    X_val, y_val = _features_and_labels(1)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [91]:
# Snapshot of the metrics gathered so far by try_models; refreshed after every
# model so partial results remain available if a later fit is interrupted.
backup = pd.DataFrame()
def try_models(models, X_train, X_test, y_train, y_test, name=None):
    """Fit each model, score it on train and test data, and tabulate the results.

    Args:
        models: Iterable of estimators exposing `fit` and `predict`.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        name: Label used for every row of the result. When None, each model is
            labelled with its own class name.

    Returns:
        DataFrame with one row per model and the columns name, train_acc,
        test_acc, precision, recall, f1 and time (fit time in seconds,
        formatted as a string with two decimals).
    """
    global backup
    metrics = []
    for model in models:
        start_time = time()
        model.fit(X_train, y_train)
        train_time = time() - start_time
        y_train_pred = model.predict(X_train)
        y_pred = model.predict(X_test)

        # Resolve the label per model. Assigning back to `name` would make every
        # model after the first inherit the first model's class name.
        model_name = type(model).__name__ if name is None else name
        metrics.append({
            "name": model_name,
            "train_acc": accuracy_score(y_train, y_train_pred),
            "test_acc": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1": f1_score(y_test, y_pred),
            "time": "{:.2f}".format(train_time)
        })
        backup = pd.DataFrame(metrics)
        print(f"{model_name} finished in {(time() - start_time):.2f} seconds")
    return pd.DataFrame(metrics)

In [92]:
class Test_baseline():
    """Accumulates the metrics of successive baseline runs in one DataFrame."""

    def __init__(self):
        # One row is appended per call to test_baseline.
        self.metrics = pd.DataFrame()

    def test_baseline(self, X_train, X_test, y_train, y_test, name=None, model=None):
        """Evaluate one model with `try_models` and append its metrics.

        Args:
            X_train, X_test, y_train, y_test: Data forwarded to `try_models`.
            name: Optional label for the result row, forwarded to `try_models`.
            model: Estimator to evaluate; a default `LogisticRegression()` is
                used when None.
        """
        # Identity check: `== None` would defer to the object's own __eq__.
        if model is None:
            model = LogisticRegression()
        metric = try_models([model], X_train, X_test, y_train, y_test, name=name)
        self.metrics = pd.concat([self.metrics, metric])

tests = Test_baseline()

In [None]:
# Run the default baseline on the train/test split (validation split skipped).
# NOTE(review): `cleaned_data_combined` is only a local variable inside
# Create_input_cols in the cells shown, and "count_vectorized" is not one of the
# columns that function creates (it adds "<name>_vectorized" columns) - confirm
# which frame and column are intended before running this cell.
tests.test_baseline(*split_csr_data(cleaned_data_combined, col="count_vectorized", get_val=False))