### Classification
---------------------
Kilian Lüders & Bent Stohlmann

8.6.2023 (Submitted Version)

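Classical ML models (majority and rule-based baselines, logistic regression, SVC, XGBoost) and a simple feed-forward neural network are trained and evaluated here with stratified 5-fold cross-validation.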

Input: data/training_data.pkl

Output: data/performance_data.csv (metrics) and output/documentation_fold_dec_class.csv (fold documentation)

Make sure you have CUDA available; it is strongly recommended if you want to run all models.

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt 

import rule_based_classification as rbc

In [None]:
# If you want to run all models, you definitely want to have CUDA available (trust me).
# The expression below evaluates to True when PyTorch can see a CUDA device.

torch.cuda.is_available()

In [None]:
# Load the training data and binarise the label: 'prop' becomes 1 where it
# was greater than zero and 0 otherwise.
training_data = pd.read_pickle("data/training_data.pkl")
training_data['prop'] = training_data['prop'].gt(0).astype('int')

In [None]:
# Accumulator for the per-fold result dicts produced by every model run.
data_metrics = []

# function to report results
def report_result(y_pre, y_tes, model_name, feature_name, info_txt, class_type = "sent", fold_num=np.nan, loss=np.nan):
    """Compute binary classification metrics for one evaluation run.

    Prints a one-line summary containing the micro-averaged F1 score and
    returns all metrics as a flat dict (one row of the results table).

    Parameters
    ----------
    y_pre : array-like
        Predicted labels (0 or 1).
    y_tes : array-like
        True labels (0 or 1).
    model_name, feature_name, info_txt, class_type : str
        Identifiers stored unchanged in the result row.
    fold_num : int, optional
        Index of the cross-validation fold.
    loss : optional
        Loss value(s) of the run; stored unchanged.

    Returns
    -------
    dict
        Confusion-matrix counts, per-class and micro-averaged precision,
        recall and F-score, plus the identifiers passed in.
    """
    # Fix the label set explicitly so that the per-class arrays always have
    # two entries and the confusion matrix is always 2x2, even when one of
    # the classes does not occur in y_tes / y_pre.
    labels = [0, 1]

    precision_ind, recall_ind, fscore_ind, _ = precision_recall_fscore_support(
        y_tes, y_pre, labels=labels, average=None)
    precision_micro, recall_micro, fscore_micro, _ = precision_recall_fscore_support(
        y_tes, y_pre, labels=labels, average='micro')
    print("{} - {} - {} \t {} \t F1:\t{:.2f}".format(model_name,feature_name, info_txt, class_type, fscore_micro))

    tn, fp, fn, tp = confusion_matrix(y_tes, y_pre, labels=labels).ravel()

    result = {
        'model': model_name,
        'feature': feature_name,
        'info': info_txt,
        'type': class_type,
        'fold': fold_num,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'precision_ind_0' : precision_ind[0],
        'precision_ind_1' : precision_ind[1],
        'precision_micro': precision_micro,
        'recall_ind_0': recall_ind[0],
        'recall_ind_1': recall_ind[1],
        'recall_micro': recall_micro,
        'fscore_ind_0': fscore_ind[0],
        'fscore_ind_1': fscore_ind[1],
        # NOTE: the misspelled key is kept on purpose; it is the column name
        # written to the output file.
        'fscore_mirco': fscore_micro,
        'loss': loss
    }
    return result

In [None]:
# Documentation of the stratified k-fold split: size and number of positive
# labels of the training and test part of every fold.

data_y = training_data['prop']
# Placeholder features: only the length is needed to generate the split.
data_X = np.zeros(len(data_y))

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

doc_fold = list()

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices, so select with .iloc; plain
    # [] indexing is label-based and only agrees for a default 0..n-1 index.
    doc_fold.append({
        'fold': i,
        'train_n': len(train_index),
        'train_prop': data_y.iloc[train_index].sum(),
        'test_n': len(test_index),
        'test_prop': data_y.iloc[test_index].sum()
    })

pd.DataFrame(doc_fold).to_csv("output/documentation_fold_dec_class.csv")

### Majority Model

In [None]:
# Baseline: always predict class 0.
# NOTE(review): this assumes class 0 is the majority class; it is not
# derived from the training fold. Confirm against the label distribution.
data_y = training_data['prop']
data_X = np.zeros(len(data_y))

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # Positional indices from the split -> positional selection with .iloc.
    y_test = data_y.iloc[test_index]

    y_predict = [0] * len(y_test)

    data_metrics.append(report_result(y_predict, y_test, "majority", "", "", "dec", i))

### Rule Based

In [None]:
# Rule-based classifier: no training, the rules are applied to each test row.
data_y = training_data['prop']
data_X = training_data['text']

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # Select labels positionally, exactly like the rows below, so that labels
    # and predictions refer to the same records for any DataFrame index.
    y_test = data_y.iloc[test_index]

    test_rows = training_data.iloc[test_index,:]
    y_predict = test_rows.apply(lambda row: rbc.check_entscheidung(row.entscheidung, row.text), axis=1).apply(lambda x: int(x))

    data_metrics.append(report_result(y_predict, y_test, "rule based", "", "", "dec", i))

## LogisticRegression

In [None]:
# 1. Count Vectorizer with token features

data_X = training_data['X_token']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # The vocabulary is fitted on the training fold only.
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)

    lr = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
    lr.fit(x_train, y_train)

    y_predict = lr.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "LR", "cv", "token", "dec", i))

In [None]:
# 2. Count Vectorizer with lemma features

data_X = training_data['X_lemma']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # The vocabulary is fitted on the training fold only.
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)

    lr = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
    lr.fit(x_train, y_train)

    y_predict = lr.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "LR", "cv", "lemma", "dec", i))

In [None]:
# 3. Tfidf Vectorizer with token features
data_X = training_data['X_token']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # Vocabulary and idf weights are fitted on the training fold only.
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    lr = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
    lr.fit(x_train, y_train)

    y_predict = lr.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "LR", "tfidf", "token", "dec", i))

In [None]:
# 4. Tfidf Vectorizer with lemma features
data_X = training_data['X_lemma']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # Vocabulary and idf weights are fitted on the training fold only.
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    lr = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
    lr.fit(x_train, y_train)

    y_predict = lr.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "LR", "tfidf", "lemma", "dec", i))

In [None]:
# 5. WE features
# Stack the per-row vectors of the 'X_we' column into one 2-D array.
data_X = np.vstack(np.ravel(training_data['X_we']))
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # data_X is a NumPy array (positional by nature); the label Series must be
    # selected with .iloc to match the positional indices of the split.
    x_train, x_test = data_X[train_index], data_X[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    lr = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
    lr.fit(x_train, y_train)

    y_predict = lr.predict(x_test)
    data_metrics.append(report_result(y_predict, y_test, "LR", "WE", "", "dec", i))

### SVC

In [None]:
# 1. Count Vectorizer with token features

data_X = training_data['X_token']
data_y = training_data['prop']

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # The vocabulary is fitted on the training fold only.
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)

    svc = SVC()
    svc.fit(x_train, y_train)

    y_predict = svc.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "SVC", "cv", "token", "dec", i))

In [None]:
# 2. Count Vectorizer with lemma features

data_X = training_data['X_lemma']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # The vocabulary is fitted on the training fold only.
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)

    svc = SVC()
    svc.fit(x_train, y_train)

    y_predict = svc.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "SVC", "cv", "lemma", "dec", i))

In [None]:
# 3. Tfidf Vectorizer with token features
data_X = training_data['X_token']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # Vocabulary and idf weights are fitted on the training fold only.
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    svc = SVC()
    svc.fit(x_train, y_train)

    y_predict = svc.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "SVC", "tfidf", "token", "dec", i))

In [None]:
# 4. Tfidf Vectorizer with lemma features
data_X = training_data['X_lemma']
data_y = training_data['prop']

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # Vocabulary and idf weights are fitted on the training fold only.
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    svc = SVC()
    svc.fit(x_train, y_train)

    y_predict = svc.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "SVC", "tfidf", "lemma", "dec", i))

In [None]:
# 5. WE features
# Stack the per-row vectors of the 'X_we' column into one 2-D array.
data_X = np.vstack(np.ravel(training_data['X_we']))
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # data_X is a NumPy array (positional by nature); the label Series must be
    # selected with .iloc to match the positional indices of the split.
    x_train, x_test = data_X[train_index], data_X[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    svc = SVC()
    svc.fit(x_train, y_train)

    y_predict = svc.predict(x_test)

    # Tagged "dec" like every other run in this notebook (was "sent"), so the
    # rows are not separated from the rest when filtering on 'type'.
    data_metrics.append(report_result(y_predict, y_test, "SVC", "WE", "", "dec", i))

## XGBoost

In [None]:
# 1. Count Vectorizer with token features

data_X = training_data['X_token']
data_y = training_data['prop']

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # The vocabulary is fitted on the training fold only.
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)

    xgb_cl = xgb.XGBClassifier()
    xgb_cl.fit(x_train, y_train)

    y_predict = xgb_cl.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "XGBoost", "cv", "token", "dec", i))

In [None]:
# 2. Count Vectorizer with lemma features

data_X = training_data['X_lemma']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # The vocabulary is fitted on the training fold only.
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)

    xgb_cl = xgb.XGBClassifier()
    xgb_cl.fit(x_train, y_train)

    y_predict = xgb_cl.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "XGBoost", "cv", "lemma", "dec", i))

In [None]:
# 3. Tfidf Vectorizer with token features
data_X = training_data['X_token']
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # Vocabulary and idf weights are fitted on the training fold only.
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    xgb_cl = xgb.XGBClassifier()
    xgb_cl.fit(x_train, y_train)

    y_predict = xgb_cl.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "XGBoost", "tfidf", "token", "dec", i))

In [None]:
# 4. Tfidf Vectorizer with lemma features
data_X = training_data['X_lemma']
data_y = training_data['prop']

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices; .iloc selects by position, whereas
    # plain [] is label-based and only agrees for a default 0..n-1 index.
    x_train, x_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    # Vocabulary and idf weights are fitted on the training fold only.
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    x_test = tfidf.transform(x_test)

    xgb_cl = xgb.XGBClassifier()
    xgb_cl.fit(x_train, y_train)

    y_predict = xgb_cl.predict(x_test)

    data_metrics.append(report_result(y_predict, y_test, "XGBoost", "tfidf", "lemma", "dec", i))

In [None]:
# 5. WE features
# Stack the per-row vectors of the 'X_we' column into one 2-D array.
data_X = np.vstack(np.ravel(training_data['X_we']))
data_y = training_data['prop']


kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # data_X is a NumPy array (positional by nature); the label Series must be
    # selected with .iloc to match the positional indices of the split.
    x_train, x_test = data_X[train_index], data_X[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

    xgb_cl = xgb.XGBClassifier()
    xgb_cl.fit(x_train, y_train)

    y_predict = xgb_cl.predict(x_test)

    # Tagged "dec" like every other run in this notebook (was "sent"), so the
    # rows are not separated from the rest when filtering on 'type'.
    data_metrics.append(report_result(y_predict, y_test, "XGBoost", "WE", "", "dec", i))

### NN

In [None]:
class Data(Dataset):
    """Dataset holding a feature array and a label array as float32 tensors.

    Items are returned as ``(features, label)`` pairs and are moved to the
    CUDA device on access when one is available.
    """

    def __init__(self, X, y):
        # Cast to float32 first, then wrap the arrays as tensors.
        features = X.astype(np.float32)
        targets = y.astype(np.float32)
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)
        # Number of samples = size of the first feature dimension.
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        sample, label = self.X[index], self.y[index]
        if not torch.cuda.is_available():
            return sample, label
        return sample.to('cuda'), label.to('cuda')

    def __len__(self):
        return self.len
   
# Number of samples per mini-batch.
batch_size = 64

class NeuralNetwork(nn.Module):
    """Feed-forward network: linear -> ReLU -> linear -> sigmoid."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Hidden layer; its weights are re-initialised with Kaiming-uniform
        # initialisation for a ReLU non-linearity.
        self.layer_1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="relu")
        # Output layer keeps the default nn.Linear initialisation.
        self.layer_2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        hidden = torch.relu(self.layer_1(x))
        # Sigmoid squashes the output into (0, 1).
        return torch.sigmoid(self.layer_2(hidden))

In [None]:
# 1. Count Vectorizer with token features
data_X = training_data['X_token']
data_y = training_data['prop'].to_numpy()

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices: use .iloc on the Series (data_y is
    # already a NumPy array and therefore positional).
    X_train, X_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    # Vocabulary is fitted on the training fold only; Data needs dense arrays.
    cv = CountVectorizer()
    X_train = cv.fit_transform(X_train).toarray()
    X_test = cv.transform(X_test).toarray()

    train_data = Data(X_train, y_train)
    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    test_data = Data(X_test, y_test)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

    input_dim = X_train.shape[1]
    hidden_dim = 1500
    output_dim = 1

    model = NeuralNetwork(input_dim, hidden_dim, output_dim)
    if torch.cuda.is_available():
        model.to('cuda', non_blocking=True)
    print(model)

    learning_rate = 0.1
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    num_epochs = 1500
    loss_values = []  # one entry per training batch

    for epoch in range(num_epochs):
        print("Fold: {} \t Epoche : {} von {}".format(str(i+1), epoch, num_epochs), end='\r')
        for X, y in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X)
            # Targets need the same (batch, 1) shape as the model output.
            loss = loss_fn(pred, y.unsqueeze(-1))
            loss_values.append(loss.item())
            loss.backward()
            optimizer.step()

    # Evaluate on the held-out fold. The true labels are collected together
    # with the predictions because the test loader shuffles the samples.
    y_pred = []
    y_true = []

    with torch.no_grad():
        for X, y in test_dataloader:
            # Blocking copy to the host: a non_blocking device-to-host copy
            # could be read before the transfer has finished.
            outputs = model(X).cpu().numpy()
            y_pred.extend(np.where(outputs < 0.5, 0, 1).ravel().tolist())
            y_true.extend(y.cpu().numpy().astype(int).tolist())

    # Report the network's own predictions (previously the stale `y_predict`
    # of an earlier model was passed here).
    data_metrics.append(report_result(y_pred, y_true, "NN", "cv", "token", "dec", i, loss_values))

In [None]:
# Plot the per-batch training loss of every NN / cv / token run.
metrics_df = pd.DataFrame(data_metrics)
metrics_df = metrics_df[(metrics_df['model'] == "NN") & (metrics_df['feature'] == "cv") & (metrics_df['info'] == "token")]

for loss in metrics_df['loss']:
    # The x-axis is built per run: folds may differ in size and therefore in
    # the number of logged batches, so one shared axis could mismatch.
    step = np.linspace(0, num_epochs, len(loss))
    fig, ax = plt.subplots(figsize=(4,3))
    plt.plot(step, np.array(loss))
    plt.title("Step-wise Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

In [None]:
# 2. Count Vectorizer with lemma features
data_X = training_data['X_lemma']
data_y = training_data['prop'].to_numpy()

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices: use .iloc on the Series (data_y is
    # already a NumPy array and therefore positional).
    X_train, X_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    # Vocabulary is fitted on the training fold only; Data needs dense arrays.
    cv = CountVectorizer()
    X_train = cv.fit_transform(X_train).toarray()
    X_test = cv.transform(X_test).toarray()

    train_data = Data(X_train, y_train)
    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    test_data = Data(X_test, y_test)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

    input_dim = X_train.shape[1]
    hidden_dim = 1500
    output_dim = 1

    model = NeuralNetwork(input_dim, hidden_dim, output_dim)
    if torch.cuda.is_available():
        model.to('cuda', non_blocking=True)
    print(model)

    learning_rate = 0.1
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    num_epochs = 1500
    loss_values = []  # one entry per training batch

    for epoch in range(num_epochs):
        print("Fold: {} \t Epoche : {} von {}".format(str(i+1), epoch, num_epochs), end='\r')
        for X, y in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X)
            # Targets need the same (batch, 1) shape as the model output.
            loss = loss_fn(pred, y.unsqueeze(-1))
            loss_values.append(loss.item())
            loss.backward()
            optimizer.step()

    # Evaluate on the held-out fold. The true labels are collected together
    # with the predictions because the test loader shuffles the samples.
    y_pred = []
    y_true = []

    with torch.no_grad():
        for X, y in test_dataloader:
            # Blocking copy to the host: a non_blocking device-to-host copy
            # could be read before the transfer has finished.
            outputs = model(X).cpu().numpy()
            y_pred.extend(np.where(outputs < 0.5, 0, 1).ravel().tolist())
            y_true.extend(y.cpu().numpy().astype(int).tolist())

    # Report the network's own predictions (previously the stale `y_predict`
    # of an earlier model was passed here).
    data_metrics.append(report_result(y_pred, y_true, "NN", "cv", "lemma", "dec", i, loss_values))

In [None]:
# Plot the per-batch training loss of every NN / cv / lemma run.
metrics_df = pd.DataFrame(data_metrics)
metrics_df = metrics_df[(metrics_df['model'] == "NN") & (metrics_df['feature'] == "cv") & (metrics_df['info'] == "lemma")]

for loss in metrics_df['loss']:
    # The x-axis is built per run: folds may differ in size and therefore in
    # the number of logged batches, so one shared axis could mismatch.
    step = np.linspace(0, num_epochs, len(loss))
    fig, ax = plt.subplots(figsize=(4,3))
    plt.plot(step, np.array(loss))
    plt.title("Step-wise Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

In [None]:
# 3. tfidf Vectorizer with token features
data_X = training_data['X_token']
data_y = training_data['prop'].to_numpy()

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices: use .iloc on the Series (data_y is
    # already a NumPy array and therefore positional).
    X_train, X_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    # Vocabulary and idf weights are fitted on the training fold only; Data
    # needs dense arrays.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train).toarray()
    X_test = tfidf.transform(X_test).toarray()

    train_data = Data(X_train, y_train)
    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    test_data = Data(X_test, y_test)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

    input_dim = X_train.shape[1]
    hidden_dim = 1500
    output_dim = 1

    model = NeuralNetwork(input_dim, hidden_dim, output_dim)
    if torch.cuda.is_available():
        model.to('cuda', non_blocking=True)
    print(model)

    learning_rate = 0.1
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    num_epochs = 1500
    loss_values = []  # one entry per training batch

    for epoch in range(num_epochs):
        print("Fold: {} \t Epoche : {} von {}".format(str(i+1), epoch, num_epochs), end='\r')
        for X, y in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X)
            # Targets need the same (batch, 1) shape as the model output.
            loss = loss_fn(pred, y.unsqueeze(-1))
            loss_values.append(loss.item())
            loss.backward()
            optimizer.step()

    # Evaluate on the held-out fold. The true labels are collected together
    # with the predictions because the test loader shuffles the samples.
    y_pred = []
    y_true = []

    with torch.no_grad():
        for X, y in test_dataloader:
            # Blocking copy to the host: a non_blocking device-to-host copy
            # could be read before the transfer has finished.
            outputs = model(X).cpu().numpy()
            y_pred.extend(np.where(outputs < 0.5, 0, 1).ravel().tolist())
            y_true.extend(y.cpu().numpy().astype(int).tolist())

    # Report the network's own predictions (previously the stale `y_predict`
    # of an earlier model was passed here).
    data_metrics.append(report_result(y_pred, y_true, "NN", "tfidf", "token", "dec", i, loss_values))

In [None]:
# Plot the per-batch training loss of every NN / tfidf / token run.
metrics_df = pd.DataFrame(data_metrics)
metrics_df = metrics_df[(metrics_df['model'] == "NN") & (metrics_df['feature'] == "tfidf") & (metrics_df['info'] == "token")]

for loss in metrics_df['loss']:
    # The x-axis is built per run: folds may differ in size and therefore in
    # the number of logged batches, so one shared axis could mismatch.
    step = np.linspace(0, num_epochs, len(loss))
    fig, ax = plt.subplots(figsize=(4,3))
    plt.plot(step, np.array(loss))
    plt.title("Step-wise Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

In [None]:
# 4. tfidf Vectorizer with lemma features
data_X = training_data['X_lemma']
data_y = training_data['prop'].to_numpy()

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # The split yields positional indices: use .iloc on the Series (data_y is
    # already a NumPy array and therefore positional).
    X_train, X_test = data_X.iloc[train_index], data_X.iloc[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    # Vocabulary and idf weights are fitted on the training fold only; Data
    # needs dense arrays.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train).toarray()
    X_test = tfidf.transform(X_test).toarray()

    train_data = Data(X_train, y_train)
    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    test_data = Data(X_test, y_test)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

    input_dim = X_train.shape[1]
    hidden_dim = 1500
    output_dim = 1

    model = NeuralNetwork(input_dim, hidden_dim, output_dim)
    if torch.cuda.is_available():
        model.to('cuda', non_blocking=True)
    print(model)

    learning_rate = 0.1
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    num_epochs = 1500
    loss_values = []  # one entry per training batch

    for epoch in range(num_epochs):
        print("Fold: {} \t Epoche : {} von {}".format(str(i+1), epoch, num_epochs), end='\r')
        for X, y in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X)
            # Targets need the same (batch, 1) shape as the model output.
            loss = loss_fn(pred, y.unsqueeze(-1))
            loss_values.append(loss.item())
            loss.backward()
            optimizer.step()

    # Evaluate on the held-out fold. The true labels are collected together
    # with the predictions because the test loader shuffles the samples.
    y_pred = []
    y_true = []

    with torch.no_grad():
        for X, y in test_dataloader:
            # Blocking copy to the host: a non_blocking device-to-host copy
            # could be read before the transfer has finished.
            outputs = model(X).cpu().numpy()
            y_pred.extend(np.where(outputs < 0.5, 0, 1).ravel().tolist())
            y_true.extend(y.cpu().numpy().astype(int).tolist())

    # Report the network's own predictions (previously the stale `y_predict`
    # of an earlier model was passed here).
    data_metrics.append(report_result(y_pred, y_true, "NN", "tfidf", "lemma", "dec", i, loss_values))

In [None]:
# Plot the per-batch training loss of every NN / tfidf / lemma run.
metrics_df = pd.DataFrame(data_metrics)
metrics_df = metrics_df[(metrics_df['model'] == "NN") & (metrics_df['feature'] == "tfidf") & (metrics_df['info'] == "lemma")]

for loss in metrics_df['loss']:
    # The x-axis is built per run: folds may differ in size and therefore in
    # the number of logged batches, so one shared axis could mismatch.
    step = np.linspace(0, num_epochs, len(loss))
    fig, ax = plt.subplots(figsize=(4,3))
    plt.plot(step, np.array(loss))
    plt.title("Step-wise Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

In [None]:
# 5. WE
# Stack the per-row vectors of the 'X_we' column into one 2-D array.
data_X = np.vstack(np.ravel(training_data.X_we))
data_y = training_data['prop'].to_numpy()

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data_X, data_y)):
    # Both inputs are NumPy arrays here, so positional indexing is correct.
    X_train, X_test = data_X[train_index], data_X[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    train_data = Data(X_train, y_train)
    train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    test_data = Data(X_test, y_test)
    test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

    input_dim = X_train.shape[1]
    hidden_dim = 100
    output_dim = 1

    model = NeuralNetwork(input_dim, hidden_dim, output_dim)
    if torch.cuda.is_available():
        model.to('cuda', non_blocking=True)
    print(model)

    learning_rate = 0.1
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    num_epochs = 5000
    loss_values = []  # one entry per training batch

    for epoch in range(num_epochs):
        print("Fold: {} \t Epoche : {} von {}".format(str(i+1), epoch, num_epochs), end='\r')
        for X, y in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X)
            # Targets need the same (batch, 1) shape as the model output.
            loss = loss_fn(pred, y.unsqueeze(-1))
            loss_values.append(loss.item())
            loss.backward()
            optimizer.step()

    # Evaluate on the held-out fold. The true labels are collected together
    # with the predictions because the test loader shuffles the samples.
    y_pred = []
    y_true = []

    with torch.no_grad():
        for X, y in test_dataloader:
            # Blocking copy to the host: a non_blocking device-to-host copy
            # could be read before the transfer has finished.
            outputs = model(X).cpu().numpy()
            y_pred.extend(np.where(outputs < 0.5, 0, 1).ravel().tolist())
            y_true.extend(y.cpu().numpy().astype(int).tolist())

    # Report the network's own predictions (previously the stale `y_predict`
    # of an earlier model was passed here).
    data_metrics.append(report_result(y_pred, y_true, "NN", "WE", "", "dec", i, loss_values))

In [None]:
# Plot the per-batch training loss of every NN / WE run.
metrics_df = pd.DataFrame(data_metrics)
metrics_df = metrics_df[(metrics_df['model'] == "NN") & (metrics_df['feature'] == "WE") & (metrics_df['info'] == "")]

for loss in metrics_df['loss']:
    # The x-axis is built per run: folds may differ in size and therefore in
    # the number of logged batches, so one shared axis could mismatch.
    step = np.linspace(0, num_epochs, len(loss))
    fig, ax = plt.subplots(figsize=(4,3))
    plt.plot(step, np.array(loss))
    plt.title("Step-wise Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

## Metrics

In [None]:
# Collect all recorded result rows into one table and preview the last rows.
output = pd.DataFrame(data_metrics)
output.tail()

In [None]:
# Write the collected metrics table to disk.
output.to_csv("data/performance_data.csv")