In [89]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
import gzip
import pandas as pd
import numpy as np

In [81]:
with gzip.open('../articlesXXXXXXXX_wapo_all_opinion.tsv.gz', mode='rt') as f:
    dfo = pd.read_csv(f, names=range(11), delimiter='\t')

with gzip.open('../articlesXXXXXXXX_wapo_all_nopinion.tsv.gz', mode='rt') as f:
    dfno = pd.read_csv(f, names=range(11), delimiter='\t')

In [82]:
import re

regex = re.compile(r'(\s+)|(\n+)')

def clean(article):
    if type(article) != str:
        return ''
    return re.sub(regex, ' ', article)

In [103]:
opinion = dfo[6].map(clean)
nopinion = dfno[6].map(clean)

opinion = opinion[opinion != '']
nopinion = nopinion[nopinion != '']

opinion.shape, nopinion.shape

((21220,), (114940,))

In [106]:
opinion_labels = np.ones(opinion.shape)
nopinion_labels = np.zeros(nopinion.shape)

articles = np.concatenate((np.array(opinion), np.array(nopinion)))
labels = np.concatenate((opinion_labels, nopinion_labels))

In [108]:

n_samples = 10000

x_train, x_val, y_train, y_val = train_test_split(articles, labels, train_size=n_samples, test_size=n_samples, random_state=0)

tfidf_vectorizer = TfidfVectorizer(
    max_features=40000,
    analyzer='word', 
    stop_words='english', 
    ngram_range=(1, 1), 
    strip_accents='ascii')

tfidf_vectorizer.fit(x_train)

x_train = tfidf_vectorizer.transform(x_train)
x_val = tfidf_vectorizer.transform(x_val)



In [112]:
model = DummyClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_val)

metrics.f1_score(y_val, y_pred)


0.0

In [113]:
model = ComplementNB()
model.fit(x_train, y_train)

y_pred = model.predict(x_val)

metrics.f1_score(y_val, y_pred)


0.04987531172069825

In [116]:
model = LogisticRegression(solver='lbfgs', random_state=0)
model.fit(x_train, y_train)

y_pred = model.predict(x_val)

metrics.f1_score(y_val, y_pred)


0.38801571709233795