# Assignment <span style="color:red">option Four</span> - News Categorization using PyTorch

Download the dataset from https://www.kaggle.com/uciml/news-aggregator-dataset and develop a news classification or categorization model. The dataset contain only titles of a news item and some metadata. The categories of the news items include one of: –<span  style="color:red"> b</span> : business – <span  style="color:red">t</span> : science and technology – <span  style="color:red">e</span> : entertainment and –<span  style="color:red">m</span> : health.

1. Prepare training and test dataset: Split the data into training and test set (80% train and 20% test). Make sure they are balanced, otherwise if all <span  style="color:red">b</span> files are on training, your model fails to predict <span  style="color:red">t</span> files in test.
2. Binary classification: produce training data for each two categories, such as <span  style="color:red">b </span> and <span  style="color:red"> t</span>, <span  style="color:red">b</span> and <span  style="color:red"> m</span>, <span  style="color:red">e</span> and <span  style="color:red">t</span> and so on. Evaluate the performance and report which categories are easier for the models.
3. Adapt the Text Categorization PyTorch code (see above) and evaluate the performance of the system for these task
4. Use a pre-trained embeddings and compare your result. When you use pre-trained embeddings, you have to average the word embeddings of each tokens in ach document to get the unique representation of the document. DOC_EMBEDDING = (TOKEN1_EMBEDDING + ... + TOKENn_EMBEDDING). You can also use some of the <span  style="color:red">spacy/FLAIR </span>document embedding methods
5. Report the recall, precision, and F1 scores for both binary and multi-class classification.


# Task 1

1. Prepare training and test dataset: Split the data into training and test set (80% train and 20% test). Make sure they are balanced, otherwise if all <span  style="color:red">b</span> files are on training, your model fails to predict <span  style="color:red">t</span> files in test.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# read data
data = pd.read_csv("data/uci-news-aggregator.csv")
# remove unnecessary columns
frame = data[["TITLE", "CATEGORY"]]

TEST_SIZE = 0.2

# Division into training and test data. The stratify parameter causes the "Category" feature to be split equally
training_data, testing_data = train_test_split(
    frame, test_size=TEST_SIZE, random_state=0, stratify=data["CATEGORY"]
)

# print size of train and test set
print("Trainingsdaten: ", len(training_data))
print("Testdaten: ", len(testing_data))

Trainingsdaten:  337935
Testdaten:  84484


# Task 2

Binary classification: produce training data for each two categories, such as b and t, b
and m, e and t and so on. Evaluate the performance and report which categories are
easier for the models.


In [2]:
from itertools import combinations

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle


# Define the categories
categories = ["b", "t", "e", "m"]

# get all possible combinations
combinations_categories = list(combinations(categories, 2))

# print combinations
# for combination in possible_combinations:
#    print(combination)

# loop through each category combination
for category_pair in combinations_categories:
    category_1, category_2 = category_pair

    # only keep data of category pair
    filtered_training_data = training_data[
        (training_data["CATEGORY"] == category_1)
        | (training_data["CATEGORY"] == category_2)
    ]
    filtered_test_data = testing_data[
        (testing_data["CATEGORY"] == category_1)
        | (testing_data["CATEGORY"] == category_2)
    ]

    # Create a binary dataset for the current category pair
    cat_mapping = {category_1: 1, category_2: 0}
    filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
        "CATEGORY"
    ].map(cat_mapping)
    filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(
        cat_mapping
    )

    # print(filtered_training_data)

    # split the binary dataset into features (X) und labels (y)
    X_train = filtered_training_data["TITLE"]
    y_train = filtered_training_data["CATEGORY_IN_BINARY"]
    X_test = filtered_test_data["TITLE"]
    y_test = filtered_test_data["CATEGORY_IN_BINARY"]

    # vectorize the titles using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # save vectorizer
    with open(f'vectorizer/tasktwo_{category_1}_{category_2}.pkl', 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # train a Naive Bayes classifier
    classifier = MultinomialNB()
    classifier.fit(X_train_tfidf, y_train)



    # Save the trained model
    with open(f'models/tasktwo_{category_1}_{category_2}.pkl', 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # make predictions on the test set
    predictions = classifier.predict(X_test_tfidf)

    # evaluate performance
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # this report gives further information
    report = classification_report(y_test, predictions)

    # print results
    print("----------------------------------------------------------")
    print(
        f"Category Pair: {category_1} ({cat_mapping[category_1]}) vs {category_2} ({cat_mapping[category_2]})"
    )
    print("------------------PERFORMANCE-----------------------------")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1_score: {f1:.2f}")
    print("--------------------REPORT--------------------------------")
    print("Classification Report:\n", report)
    print("----------------------------------------------------------")
    print("\n\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(


----------------------------------------------------------
Category Pair: b (1) vs t (0)
------------------PERFORMANCE-----------------------------
Accuracy: 0.93
Precision: 0.93
Recall: 0.93
F1_score: 0.93
--------------------REPORT--------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.92     21669
           1       0.93      0.93      0.93     23193

    accuracy                           0.93     44862
   macro avg       0.93      0.93      0.93     44862
weighted avg       0.93      0.93      0.93     44862

----------------------------------------------------------





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(


----------------------------------------------------------
Category Pair: b (1) vs e (0)
------------------PERFORMANCE-----------------------------
Accuracy: 0.98
Precision: 0.98
Recall: 0.97
F1_score: 0.97
--------------------REPORT--------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     30494
           1       0.98      0.97      0.97     23193

    accuracy                           0.98     53687
   macro avg       0.98      0.98      0.98     53687
weighted avg       0.98      0.98      0.98     53687

----------------------------------------------------------





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(


----------------------------------------------------------
Category Pair: b (1) vs m (0)
------------------PERFORMANCE-----------------------------
Accuracy: 0.97
Precision: 0.97
Recall: 0.99
F1_score: 0.98
--------------------REPORT--------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95      9128
           1       0.97      0.99      0.98     23193

    accuracy                           0.97     32321
   macro avg       0.97      0.96      0.97     32321
weighted avg       0.97      0.97      0.97     32321

----------------------------------------------------------





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(


----------------------------------------------------------
Category Pair: t (1) vs e (0)
------------------PERFORMANCE-----------------------------
Accuracy: 0.98
Precision: 0.97
Recall: 0.97
F1_score: 0.97
--------------------REPORT--------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     30494
           1       0.97      0.97      0.97     21669

    accuracy                           0.98     52163
   macro avg       0.98      0.98      0.98     52163
weighted avg       0.98      0.98      0.98     52163

----------------------------------------------------------





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(


----------------------------------------------------------
Category Pair: t (1) vs m (0)
------------------PERFORMANCE-----------------------------
Accuracy: 0.98
Precision: 0.97
Recall: 0.99
F1_score: 0.98
--------------------REPORT--------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96      9128
           1       0.97      0.99      0.98     21669

    accuracy                           0.98     30797
   macro avg       0.98      0.97      0.97     30797
weighted avg       0.98      0.98      0.98     30797

----------------------------------------------------------





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_training_data["CATEGORY_IN_BINARY"] = filtered_training_data[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_data["CATEGORY_IN_BINARY"] = filtered_test_data["CATEGORY"].map(


----------------------------------------------------------
Category Pair: e (1) vs m (0)
------------------PERFORMANCE-----------------------------
Accuracy: 0.98
Precision: 0.98
Recall: 0.99
F1_score: 0.99
--------------------REPORT--------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95      9128
           1       0.98      0.99      0.99     30494

    accuracy                           0.98     39622
   macro avg       0.98      0.96      0.97     39622
weighted avg       0.98      0.98      0.98     39622

----------------------------------------------------------





# Task 3

Adapt the Text Categorization PyTorch code (see above) and evaluate the performance
of the system for these task


In [3]:
from collections import Counter
import numpy as np
import torch
import torch.nn as nn

# used code from lecture notebook but exchanged the data
vocab = Counter()
for text in training_data["TITLE"]:
    for word in text.split(" "):
        vocab[word.lower()] += 1

for text in testing_data["TITLE"]:
    for word in text.split(" "):
        vocab[word.lower()] += 1

total_words = len(vocab)


def get_word_2_index(vocab):
    word2index = {}
    for i, word in enumerate(vocab):
        word2index[word.lower()] = i
    return word2index


word2index = get_word_2_index(vocab)


def get_batch(df, i, batch_size):
    batches = []
    results = []

    # used iloc from pandas package because working with dataframe not array
    # extracting batch of data from dataframe
    texts = df["TITLE"].iloc[i * batch_size : i * batch_size + batch_size]
    categories = df["CATEGORY"].iloc[i * batch_size : i * batch_size + batch_size]

    for text in texts:
        layer = np.zeros(total_words, dtype=float)
        for word in text.split(" "):
            layer[word2index[word.lower()]] += 1
        batches.append(layer)

    # convert categories to numbers
    for category in categories:
        index_y = -1
        if category == "b":
            index_y = 0
        elif category == "t":
            index_y = 1
        elif category == "e":
            index_y = 2
        elif category == "m":
            index_y = 3
        results.append(index_y)

    return np.array(batches), np.array(results)


# Parameters
learning_rate = 0.05
num_epochs = (
    1  # changed epoch size so training is faster, you can increase it if you want
)
batch_size = 150
display_step = 1

# Network Parameters
hidden_size = 100  # 1st layer and 2nd layer number of feature
input_size = total_words  # Words in vocab
print(input_size)
print("--------------------")
num_classes = 4

# select gpu (cuda) as method for faster training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Verwendetes Gerät:", device)

if device.type == "cuda":
    torch.cuda.empty_cache()  # empty cache -> otherwise there were sometimes errors


class NewsNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NewsNN, self).__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out


# with "to()" you can easily switch between CPU and GPU without changing the rest of your code
# had some problems with it so we added it
news_net = NewsNN(input_size, hidden_size, num_classes).to(device)
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # This includes the Softmax loss function
optimizer = torch.optim.Adam(news_net.parameters(), lr=learning_rate)

# Train the Model
for epoch in range(num_epochs):
    # determine the number of min-batches based on the batch size and size of training data - exchanged the data
    total_batch = int(len(training_data) / batch_size)
    # Loop over all batches
    for i in range(total_batch):
        batch_x, batch_y = get_batch(training_data, i, batch_size)
        articles = torch.FloatTensor(batch_x).to(device)
        labels = torch.LongTensor(batch_y).to(device)
        # print("articles",articles)
        # print(batch_x, labels)
        # print("size labels",labels.size())

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = news_net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 4 == 0:
            print(
                "Epoch [%d/%d], Step [%d/%d], Loss: %.4f"
                % (
                    epoch + 1,
                    num_epochs,
                    i + 1,
                    len(training_data) / batch_size,
                    loss.data,
                )
            )

# Save the model
torch.save(news_net.state_dict(), 'models/taskthree.pth')

# show the different trained parameters
for name, param in news_net.named_parameters():
    if param.requires_grad:
        print("Name--->", name, "\nValues--->", param.data)

# set model to evaluation mode
news_net.eval()
total_test_batches = int(len(testing_data) / batch_size)

with torch.no_grad():
    # create empty result arrays
    all_predicted = []
    all_labels = []
    # iterate through each of the batches
    for i in range(total_test_batches):
        # get data of corresponding batch
        test_batch_x, test_batch_y = get_batch(testing_data, i, batch_size)
        test_articles = torch.FloatTensor(test_batch_x).to(device)
        test_labels = torch.LongTensor(test_batch_y).to(device)
        # get data into NN and get predicted labels
        test_outputs = news_net(test_articles)
        _, predicted = torch.max(test_outputs.data, 1)
        
        # we need .cpu() because we did .to(device) which was mainly gpu
        all_predicted.extend(predicted.cpu().numpy())
        all_labels.extend(test_labels.cpu().numpy())

# print / create classification report for the predicted data
print("-------------------------------")
print(
    classification_report(all_labels, all_predicted, target_names=["b", "t", "e", "m"])
)

135402
--------------------
Verwendetes Gerät: cpu
Epoch [1/1], Step [4/2252], Loss: 1.2026
Epoch [1/1], Step [8/2252], Loss: 0.9040
Epoch [1/1], Step [12/2252], Loss: 0.7454
Epoch [1/1], Step [16/2252], Loss: 0.8232
Epoch [1/1], Step [20/2252], Loss: 0.7347
Epoch [1/1], Step [24/2252], Loss: 0.5539
Epoch [1/1], Step [28/2252], Loss: 0.4534
Epoch [1/1], Step [32/2252], Loss: 0.6852
Epoch [1/1], Step [36/2252], Loss: 0.4110
Epoch [1/1], Step [40/2252], Loss: 0.5999
Epoch [1/1], Step [44/2252], Loss: 0.5751
Epoch [1/1], Step [48/2252], Loss: 0.5467
Epoch [1/1], Step [52/2252], Loss: 0.4947
Epoch [1/1], Step [56/2252], Loss: 0.4191
Epoch [1/1], Step [60/2252], Loss: 0.4290
Epoch [1/1], Step [64/2252], Loss: 0.4390
Epoch [1/1], Step [68/2252], Loss: 0.4488
Epoch [1/1], Step [72/2252], Loss: 0.4311
Epoch [1/1], Step [76/2252], Loss: 0.3283
Epoch [1/1], Step [80/2252], Loss: 0.5231
Epoch [1/1], Step [84/2252], Loss: 0.4287
Epoch [1/1], Step [88/2252], Loss: 0.5416
Epoch [1/1], Step [92/2252]

# Task 4

Use a pre-trained embeddings and compare your result. When you use pre-trained
embeddings, you have to average the word embeddings of each tokens in ach
document to get the unique representation of the document. DOC_EMBEDDING =
(TOKEN1_EMBEDDING + ... + TOKENn_EMBEDDING). You can also use some of the
spacy/FLAIR document embedding methods


In [8]:
import spacy
import torch.optim as optim
import torch
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
import torch.nn as nn

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


# read data
data = pd.read_csv("data/uci-news-aggregator.csv")
# remove unnecessary columns
frame = data[["TITLE", "CATEGORY"]]
# uncomment this cell if you have enough performance
frame = frame.head(10000)
TEST_SIZE = 0.2

# Division into training and test data. The stratify parameter causes the "Category" feature to be split equally
training_data, testing_data = train_test_split(
    frame, test_size=TEST_SIZE, random_state=0, stratify=frame["CATEGORY"]
)

# Set hyperparameters
input_size = (
    96  # spacy provides vectors with dimension of 96 thats why we need to set that size
)
num_classes = 4
hidden_dim = 100
num_epochs = 1
batch_size = 150
learning_rate = 0.01

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])


def average_embedding_token(text):
    doc = nlp(text)
    # use vector parameter of spacy
    embeddings = [token.vector for token in doc]
    # if token exists in spacy return value otherwise return zero array
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(input_size)


# print unique categories
print(training_data["CATEGORY"].unique())

# tokenize and apply spacy embeddings to dataset
training_data["SPACY_EMBEDDING"] = training_data["TITLE"].apply(average_embedding_token)
testing_data["SPACY_EMBEDDING"] = testing_data["TITLE"].apply(average_embedding_token)

# dictionary with name -> int class
mapping = {"b": 0, "t": 1, "e": 2, "m": 3}
# labels need to be converted to tensors for training model
train_conv_label = torch.tensor(training_data["CATEGORY"].map(mapping).to_numpy())
# embeddings need to be converted to tensors for training model
train_conv_features = torch.tensor(
    np.vstack(training_data["SPACY_EMBEDDING"].to_numpy())
)


class NewsNN(nn.Module):
    def _init_(self, input_size, hidden_size, num_classes):
        super(NewsNN, self)._init_()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out


# create model
news_nn = NewsNN(input_size, hidden_dim, num_classes)

# Loss and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_nn.parameters(), lr=learning_rate)


for epoch in range(num_epochs):
    for i in range(0, len(train_conv_features), batch_size):
        data_f = train_conv_features[i : i + batch_size]
        labels = train_conv_label[i : i + batch_size]

        optimizer.zero_grad()
        outputs = news_nn(data_f.float())
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        if ((i // batch_size) + 1) % 4 == 0:
            print(
                "Epoch [%d/%d], Step [%d/%d], Loss: %.4f"
                % (
                    epoch + 1,
                    num_epochs,
                    i // batch_size,
                    len(train_conv_features) // batch_size,
                    loss.data,
                )
            )

test_conv_features = torch.tensor(np.vstack(testing_data["SPACY_EMBEDDING"].to_numpy()))
test_conv_label = torch.tensor(testing_data["CATEGORY"].map(mapping).to_numpy())
# Evaluate on the test set
with torch.no_grad():
    output = news_nn(test_conv_features.float())
    _, test_pred = torch.max(output, 1)

test_pred = test_pred.numpy()
test_conv_label = test_conv_label.numpy()

# print classification report with metrics
print(
    f"Classification Report: {classification_report(test_conv_label, test_pred, target_names=mapping, zero_division=1)}"
)

['e' 't' 'm' 'b']




TypeError: NewsNN.__init__() takes 1 positional argument but 4 were given