In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [2]:
# load the cleaned data

clean_reuters_data = pd.read_pickle('clean_reuters_data.pkl')

In [15]:
# TF-IDF vectorize news contents and binarize topics for training and testing sample

vectorizer = TfidfVectorizer()
mlb = MultiLabelBinarizer()

train_documents = clean_reuters_data[clean_reuters_data.lewis_split == 'TRAIN']['text'].values
test_documents = clean_reuters_data[clean_reuters_data.lewis_split == 'TEST']['text'].values
train_labels = clean_reuters_data[clean_reuters_data.lewis_split == 'TRAIN']['topics'].values
test_labels = clean_reuters_data[clean_reuters_data.lewis_split == 'TEST']['topics'].values

vectorized_train_documents = vectorizer.fit_transform(train_documents)
vectorized_test_documents = vectorizer.transform(test_documents)
binarized_train_labels = mlb.fit_transform(train_labels)
binarized_test_labels = mlb.transform(test_labels)

In [28]:
# SVM classification
model = OneVsRestClassifier(LinearSVC()).fit(vectorized_train_documents, binarized_train_labels)

y_pred = model.predict(vectorized_test_documents)
y_true = binarized_test_labels

f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
accuracy = accuracy_score(y_true, y_pred)

In [38]:
f1_micro_average, roc_auc, accuracy

(0.849429535593765, 0.8850380040772946, 0.7919536063791228)

In [43]:
# ML-kNN classification

model = KNeighborsClassifier().fit(vectorized_train_documents, binarized_train_labels)

y_pred = model.predict(vectorized_test_documents)
y_true = binarized_test_labels

f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
accuracy = accuracy_score(y_true, y_pred)

In [44]:
f1_micro_average, roc_auc, accuracy

(0.7882113183884621, 0.8657793080340668, 0.7375860819137369)

In [45]:
# Naive Bayes classification
model = OneVsRestClassifier(MultinomialNB()).fit(vectorized_train_documents, binarized_train_labels)

y_pred = model.predict(vectorized_test_documents)
y_true = binarized_test_labels

f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
accuracy = accuracy_score(y_true, y_pred)

In [46]:
f1_micro_average, roc_auc, accuracy

(0.4798404962339389, 0.6578701270242665, 0.39180862631388186)

In [47]:
# logistic regression classification

model = OneVsRestClassifier(LogisticRegression()).fit(vectorized_train_documents, binarized_train_labels)
y_pred = model.predict(vectorized_test_documents)
y_true = binarized_test_labels
f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
accuracy = accuracy_score(y_true, y_pred)

In [48]:
f1_micro_average, roc_auc, accuracy

(0.7366901281357155, 0.7974103494246034, 0.6629213483146067)

In [49]:
# random forest classification

model = RandomForestClassifier().fit(vectorized_train_documents, binarized_train_labels)
y_pred = model.predict(vectorized_test_documents)
y_true = binarized_test_labels
f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
accuracy = accuracy_score(y_true, y_pred)

In [50]:
f1_micro_average, roc_auc, accuracy

(0.6760509796461861, 0.7589582462570573, 0.604929322218195)