In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
import gzip
import pandas as pd
import numpy as np

In [2]:
# Global configuration shared by the cells below.

# Random seed passed as `random_state` to the train/validation split and to
# LogisticRegression so that repeated runs give the same partition and model.
SEED = 0

In [3]:
def _read_articles_tsv(path):
    """Read a gzip-compressed, header-less, tab-separated article dump.

    The stream is decoded explicitly as UTF-8 instead of relying on the
    platform's default text encoding, so non-ASCII characters in the articles
    are read the same way on every machine.
    NOTE(review): assumes the dumps are UTF-8 encoded -- confirm against the
    process that produced them.

    Parameters
    ----------
    path : str
        Location of the ``.tsv.gz`` file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the 11 columns labelled 0 through 10.
    """
    with gzip.open(path, mode='rt', encoding='utf-8') as f:
        return pd.read_csv(f, names=range(11), delimiter='\t')


# Opinion pieces and news (non-opinion) pieces live in separate dumps.
dfo = _read_articles_tsv('../articlesXXXXXXXX_wapo_all_opinion.tsv.gz')
dfno = _read_articles_tsv('../articlesXXXXXXXX_wapo_all_nopinion.tsv.gz')

In [4]:
import re

# Matches any run of whitespace; `\s` already covers newlines, so no separate
# newline alternative is needed.
new_lines = re.compile(r'\s+')

def clean(article):
    """Normalize one article's text, or return '' if it should be discarded.

    Parameters
    ----------
    article : object
        Raw cell value. Anything that is not a string (e.g. the NaN pandas
        uses for missing cells) is treated as "no article".

    Returns
    -------
    str
        The text with every run of whitespace collapsed to a single space, or
        the empty string when the value is not a string or is recognised as
        garbage.
    """
    # isinstance (rather than an exact type comparison) so that str subclasses
    # such as numpy.str_ are still treated as text instead of being dropped.
    if not isinstance(article, str):
        return ''

    # clean garbage: records containing this marker are discarded entirely
    if 'Today’s Headlines' in article:
        return ''

    return new_lines.sub(' ', article)

In [5]:
# Run `clean` over column 6 of each frame and keep only the rows that did not
# come back as the empty string (i.e. were not rejected by `clean`).
# NOTE(review): column 6 is presumably the article body -- confirm against the
# dump's schema.
opinion = dfo[6].map(clean).loc[lambda text: text != '']
nopinion = dfno[6].map(clean).loc[lambda text: text != '']

print('Opinion articles:', opinion.shape[0])
print('News (non-opinion) articles:', nopinion.shape[0])

Opinion articles: 18177
News (non-opinion) articles: 106008


In [6]:
# Binary targets: 1.0 marks an opinion article, 0.0 a news article.
opinion_labels = np.full(len(opinion), 1.0)
nopinion_labels = np.full(len(nopinion), 0.0)

# Stack texts and targets in the same order (opinion first, then news) so that
# position i of `articles` lines up with position i of `labels`.
articles = np.concatenate([opinion.to_numpy(), nopinion.to_numpy()])
labels = np.concatenate([opinion_labels, nopinion_labels])

In [7]:
# Number of articles drawn for the training set and, separately, for the
# validation set; articles not drawn for either are left unused.
n_samples = 10000

x_train, x_val, y_train, y_val = train_test_split(
    articles,
    labels,
    train_size=n_samples,
    test_size=n_samples,
    random_state=SEED,  # fixed seed -> same partition on every run
)


In [8]:
# Unigram TF-IDF features: English stop words removed, accents folded to
# ASCII, vocabulary capped at 40,000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=40000,
    analyzer='word', 
    stop_words='english', 
    ngram_range=(1, 1), 
    strip_accents='ascii')

# Learn the vocabulary and IDF weights from the training texts only and
# vectorize them in the same pass (avoids tokenizing the training corpus
# twice); the validation texts are then projected onto that fitted vocabulary.
#
# NOTE: this rebinds x_train / x_val from raw text to sparse TF-IDF matrices,
# so the cell is not idempotent -- re-run the train/validation split cell
# before re-running this one.
x_train = tfidf_vectorizer.fit_transform(x_train)
x_val = tfidf_vectorizer.transform(x_val)

In [9]:
# Baseline that ignores the features. The strategy is spelled out instead of
# relying on the library default, which differs between scikit-learn releases;
# 'prior' always predicts the most frequent training class.
model = DummyClassifier(strategy='prior')
model.fit(x_train, y_train)

y_pred = model.predict(x_val)

# F-1 is computed for the positive class (label 1, opinion).
print(f'F-1 Score: {metrics.f1_score(y_val, y_pred):.4f}')

F-1 Score: 0.0000


In [10]:
# Complement Naive Bayes trained on the TF-IDF features, scored on the
# validation split.
model = ComplementNB().fit(x_train, y_train)
y_pred = model.predict(x_val)

# Report each metric on its own line, formatted to four decimal places.
for label, score_fn in (
    ('Accuracy', metrics.accuracy_score),
    ('Precision', metrics.precision_score),
    ('Recall', metrics.recall_score),
    ('F-1 Score', metrics.f1_score),
):
    print(f'{label}: {score_fn(y_val, y_pred):.4f}')

Accuracy: 0.8553
Precision: 0.9412
Recall: 0.0425
F-1 Score: 0.0813


In [11]:
# Logistic regression (L-BFGS solver, seeded) trained on the TF-IDF features.
model = LogisticRegression(random_state=SEED, solver='lbfgs').fit(x_train, y_train)
y_pred = model.predict(x_val)

# Compute every validation metric first, then print them in a fixed order.
scores = {
    'Accuracy': metrics.accuracy_score(y_val, y_pred),
    'Precision': metrics.precision_score(y_val, y_pred),
    'Recall': metrics.recall_score(y_val, y_pred),
    'F-1 Score': metrics.f1_score(y_val, y_pred),
}
for name, value in scores.items():
    print(f'{name}: {value:.4f}')


Accuracy: 0.8885
Precision: 0.8368
Recall: 0.3232
F-1 Score: 0.4663
