# Import Libraries

In [None]:
import pandas as pd
import numpy as np

import nltk

import itertools

from sentence_level_preprocess import *
from word_level_preprocess import *
from indexation import *
from featurize import *
from embeddings import Embeddings

from typing import Tuple, List

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator

In [None]:
# Alias used in the signatures below for a collection of row records.
numpy_records = List[Tuple]

def load_data(path: str) -> numpy_records:
    """Read a tab-separated file and return its rows as records.

    Args:
        path: Location of the TSV file to read.

    Returns:
        The rows of the file as produced by ``DataFrame.to_records``, with
        the DataFrame index left out.
    """
    return pd.read_csv(path, sep='\t').to_records(index=False)

def preprocess(data: numpy_records) -> numpy_records:
    """Normalise and tokenise the text of every ``(text, label)`` record.

    The steps run in a fixed order, each one over the whole data set before
    the next one starts. Labels are carried through unchanged.

    Args:
        data: Iterable of two-element ``(text, label)`` records.

    Returns:
        A list of ``(tokens, label)`` pairs in which ``tokens`` is a flat
        list of processed tokens.
    """
    # Steps that operate on the whole text of a record, in execution order.
    sentence_steps = (
        rm_linebreaks,                      # remove line breaks (e.g. "elec- tron" -> "electron")
        lambda sentence: sentence.lower(),  # lower case
        c2temp_2,                           # unify temperature expressions (e.g. '° c' -> '<temp>')
        nltk.tokenize.word_tokenize,        # text -> list of tokens
    )
    for step in sentence_steps:
        data = [(step(text), label) for text, label in data]

    # Steps that operate on each individual token, in execution order.
    token_steps = (
        put_int_together,      # recognize integer as '<int>' (e.g. '60' -> '<int>')
        put_decimal_together,  # recognize decimal as '<dec>' (e.g. '0.5' -> '<dec>')
        put_ratio_together,    # recognize ratio as '<ratio>' (e.g. '1:1' -> '<ratio>')
    )
    for step in token_steps:
        data = [([step(tok) for tok in tokens], label) for tokens, label in data]

    # Split on slashes (e.g. 'g/mol' -> ['g', '/', 'mol']) and flatten the
    # per-token pieces back into a single token list per record.
    data = [
        (list(itertools.chain.from_iterable([split_slash(tok) for tok in tokens])), label)
        for tokens, label in data
    ]

    # Unify temperature expressions at token level (e.g. '°c' -> '<temp>').
    return [([c2temp(tok) for tok in tokens], label) for tokens, label in data]

def index_words(data: numpy_records):
    """Run both the texts and the labels of ``data`` through ``word2idx``.

    Args:
        data: Sequence of records whose element 0 is the text (token list)
            and element 1 is the label.

    Returns:
        ``(data_idx, vocab_size, targets)`` where ``data_idx`` is a list of
        ``(indexed_text, indexed_label)`` pairs, ``vocab_size`` is the second
        value returned by ``word2idx`` for the texts, and ``targets`` is the
        list of indexed labels.
    """
    raw_texts = [record[0] for record in data]
    # Each label is wrapped in a one-element list so it can go through
    # word2idx in the same way as a token sequence.
    wrapped_labels = [[record[1]] for record in data]

    indexed_texts, vocab_size, _ = word2idx(raw_texts)
    indexed_labels, _, _ = word2idx(wrapped_labels)

    # Unwrap the one-element lists again.
    targets = [wrapped[0] for wrapped in indexed_labels]
    return (list(zip(indexed_texts, targets)), vocab_size, targets)

def BOW_featurize(data_idx, vocab_size):
    """Apply ``bow`` to the text part of every ``(text, label)`` pair.

    Args:
        data_idx: Iterable of ``(indexed_text, label)`` pairs.
        vocab_size: Passed through to ``bow`` as its second argument.

    Returns:
        A list of ``(bow(text, vocab_size), label)`` pairs in input order.
    """
    featurized = []
    for token_ids, label in data_idx:
        featurized.append((bow(token_ids, vocab_size), label))
    return featurized

def train_val_split(data: numpy_records) -> Tuple[List, List, List, List]:
    """Split ``(text, label)`` records into random train and validation parts.

    Args:
        data: Iterable of two-element ``(text, label)`` records.

    Returns:
        ``(X_train, X_val, y_train, y_val)``; 20% of the records are held
        out for validation (``test_size=0.2``).
    """
    X = [text for text, _ in data]
    y = [label for _, label in data]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    return (X_train, X_val, y_train, y_val)

def embed(embedding, train_data, val_data):
    """Transform the training and validation text with ``embedding``.

    Every entry of the ``text`` column is turned into one space-separated
    string (each element passed through ``str``) before it is handed to
    ``embedding.transform``.

    Args:
        embedding: Object exposing ``transform``.
        train_data: Object with a ``text`` column supporting ``apply``.
        val_data: Same layout as ``train_data``.

    Returns:
        ``(train_embedded, val_embedded)``.
    """
    def as_sentence(tokens):
        return ' '.join(map(str, tokens))

    train_embedded = embedding.transform(train_data.text.apply(as_sentence))
    val_embedded = embedding.transform(val_data.text.apply(as_sentence))
    return train_embedded, val_embedded

# def generate_embeddings(path: str, embed):
#     data_idx, vocab_size, targets = index_words(preprocess(load_data(path)))
#     X_total = pd.DataFrame(data_idx, columns=['text', 'classification'])
#     y_total = X_total.pop('classification')
#     X_train, X_val, y_train, y_val = train_test_split(X_total, y_total, test_size=0.2)
#     embeddings = Embeddings(X_train, X_val)
#     X_train, X_val = embed(embed, X_train, X_val)
#     return (X_train, X_val, y_train, y_val)

def generate_split(path):
    """Load, preprocess and index the file at ``path``, then split it randomly.

    Args:
        path: TSV file passed to ``load_data``.

    Returns:
        ``(X_train, X_val, y_train, y_val)``. The X parts are DataFrames with
        a single ``text`` column, the y parts are the matching
        ``classification`` Series; 20% of the rows go to validation.
    """
    records = preprocess(load_data(path))
    # Only the indexed (text, label) pairs are needed here.
    indexed_pairs, _, _ = index_words(records)

    frame = pd.DataFrame(indexed_pairs, columns=['text', 'classification'])
    # pop() removes the label column from the frame, leaving only 'text'.
    labels = frame.pop('classification')

    X_train, X_val, y_train, y_val = train_test_split(frame, labels, test_size=0.2)
    return (X_train, X_val, y_train, y_val)

def find_avg_performance(test_model: BaseEstimator, embedding_type, num_trial = 20,
                         path: str = '../data/train.tsv') -> None:
    """Print the mean and standard deviation of micro-F1 over random splits.

    Every trial draws a fresh split with ``generate_split``, embeds it with
    ``embed``, refits ``test_model`` on the training part and scores its
    predictions on the validation part.

    Args:
        test_model: Estimator exposing ``fit`` and ``predict``. It is refit
            in place on every trial, so after the call it is left fitted on
            the last split drawn here.
        embedding_type: Embedding object forwarded to ``embed``.
        num_trial: Number of random splits to evaluate.
        path: TSV file the splits are drawn from.
    """
    f1s = []

    for _ in range(num_trial):
        X_train_avg, X_val_avg, y_train_avg, y_val_avg = generate_split(path)
        X_train_avg, X_val_avg = embed(embedding_type, X_train_avg, X_val_avg)
        test_model.fit(X_train_avg, y_train_avg)
        y_pred = test_model.predict(X_val_avg)
        f1s.append(f1_score(y_val_avg, y_pred, average='micro'))

    print("mean:", np.mean(f1s))
    print("std :", np.std(f1s))

In [None]:
# Draw one random 80/20 train/validation split; these four globals are read
# by the cells below.
X_train, X_val, y_train, y_val = generate_split('../data/train.tsv')
# Embeddings object built from the split; its BOW()/TFIDF()/word2vec() methods
# are what the representation cells below call.
embeddings = Embeddings(X_train, X_val)

### BOW Data

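Uncomment exactly **one** of the following three cells (BOW, TF-IDF, or word2vec) to choose the word representation. The chosen cell defines `embed_type` and replaces `X_train`/`X_val` with their embedded form, both of which the classifier cells below rely on.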

In [None]:
# from sklearn.preprocessing import StandardScaler

# embed_type = embeddings.BOW()
# X_train, X_val = embed(embed_type, X_train, X_val)
# X_train = StandardScaler().fit_transform(X_train.todense())
# X_val = StandardScaler().fit_transform(X_val.todense())

### TF-IDF Data

In [None]:
# embed_type = embeddings.TFIDF()
# X_train, X_val = embed(embeddings.TFIDF(), X_train, X_val)

### word2vec Data

In [None]:
# embed_type = embeddings.word2vec()
# X_train, X_val = embed(embed_type, X_train, X_val)

# Learn Classifiers

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier. fit() returns the estimator itself, so the fitted
# model can be bound directly.
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)
# Score on the validation split (shown as the cell output).
model.score(X_val, y_val)

### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

import seaborn as sns

In [None]:
# Validation-set predictions of the baseline model and the resulting
# confusion matrix (scikit-learn convention: rows are true labels, columns
# are predicted labels).
y_pred = model.predict(X_val)
cm = confusion_matrix(y_val, y_pred)

In [None]:
# Raw counts of the confusion matrix, annotated per cell.
sns.heatmap(cm, annot=True)

In [None]:
# Row-normalised confusion matrix: every row is divided by its own total.
# keepdims=True keeps the row sums as a column vector, so the division
# broadcasts for any number of classes instead of assuming exactly 4.
sns.heatmap(cm / cm.sum(axis=1, keepdims=True), annot=True)

### Average performance

In [None]:
# Mean/std of micro-F1 for a fresh baseline model over repeated random splits
# (num_trial defaults to 20). Needs embed_type from one of the representation
# cells above.
find_avg_performance(LogisticRegression(max_iter=5000), embed_type)

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

In [None]:
# Search space for the logistic-regression searches.
# scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. C is drawn
# from [0.001, 1000.001].
param_grid_lr = {
    'C': uniform(0.001, 1000)
}

# (penalty name, validation score) pairs; each search cell appends one entry.
scores = []

# Randomised search over C for an L1-penalised model.
lr1_grid = RandomizedSearchCV(
    LogisticRegression(solver='saga', max_iter=2000, penalty='l1'), 
    param_distributions=param_grid_lr, 
    n_jobs=-1, 
    scoring='f1_micro'
)
lr1_grid.fit(X_train, y_train)
# Score the refit best model on the held-out validation split.
scores.append(('L1', lr1_grid.score(X_val, y_val)))

In [None]:
# Same randomised search over C, this time for an L2-penalised model.
lr2_grid = RandomizedSearchCV(
    LogisticRegression(solver='saga', max_iter=1000, penalty='l2'), 
    param_distributions=param_grid_lr, 
    n_jobs=-1, 
    scoring='f1_micro'
)
lr2_grid.fit(X_train, y_train)
# Score the refit best model on the held-out validation split.
scores.append(('L2', lr2_grid.score(X_val, y_val)))

In [None]:
# Elastic-net search: add l1_ratio to a *copy* of the shared C distribution,
# so param_grid_lr is left untouched for the L1/L2 searches if those cells
# are run again afterwards.
param_grid_lre = {**param_grid_lr, 'l1_ratio': uniform(0, 1)}
lre_grid = RandomizedSearchCV(
    LogisticRegression(solver='saga', max_iter=1000, penalty='elasticnet'),
    param_distributions=param_grid_lre,
    n_jobs=-1,
    scoring='f1_micro'
)
lre_grid.fit(X_train, y_train)
# Score the refit best model on the held-out validation split.
scores.append(('Elastic', lre_grid.score(X_val, y_val)))

In [None]:
# Report the validation score collected for each penalty type.
for pen, score in scores:
    print(f'{pen}: {score}')

In [None]:
# Repeated-split evaluation for the best estimator found by each search.
# NOTE: find_avg_performance calls fit() on the estimator it receives, so each
# best_estimator_ is refit in place and ends up fitted on the last random
# split drawn inside that call.
find_avg_performance(lr1_grid.best_estimator_, embed_type)
find_avg_performance(lr2_grid.best_estimator_, embed_type)
find_avg_performance(lre_grid.best_estimator_, embed_type)

In [None]:
# Confusion matrix for the tuned logistic regression. No `lr_grid` is defined
# anywhere above, so pick whichever of the three searches reached the best
# cross-validated score.
best_lr_grid = max((lr1_grid, lr2_grid, lre_grid), key=lambda grid: grid.best_score_)
y_pred = best_lr_grid.predict(X_val)
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True)

## SVM

In [None]:
from sklearn.svm import SVC

# Untuned SVC baseline. fit() returns the fitted estimator itself.
svm_model = SVC().fit(X_train, y_train)
# Score on the validation split (shown as the cell output).
svm_model.score(X_val, y_val)

In [None]:
# Exhaustive grid over C and gamma for the SVC (7 x 6 combinations).
param_grid_svm = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale']
}

# refit=True retrains the best combination, so the search object can be
# scored and used for prediction directly.
svm_grid = GridSearchCV(SVC(), param_grid_svm, refit=True, n_jobs=-1, scoring='f1_micro')
svm_grid.fit(X_train, y_train)
# Score of the refit best model on the validation split (cell output).
svm_grid.score(X_val, y_val)

In [None]:
# Confusion matrix of the tuned SVC on the validation split.
# This overwrites the y_pred and cm globals from the earlier cells.
y_pred = svm_grid.predict(X_val)
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True)

## Attention

In [None]:
from neural_network import RNN, NNTrainer
import torch

In [None]:
# Put each sample's label next to its features (axis=1). With the default
# axis=0 the labels would be stacked *below* the features as extra rows.
train_set = pd.concat((X_train, y_train), axis=1)
val_set = pd.concat((X_val, y_val), axis=1)

# NOTE(review): DataLoader fetches items from its dataset by index; confirm
# that these concatenated frames are the dataset format NNTrainer expects.
# Training batches are shuffled; validation batches keep their order.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=10, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=100, shuffle=False)

In [None]:


# Two networks with identical constructor arguments; the second one
# additionally gets set_embedding_weights() called on it.
# NOTE(review): num_words and word_types are not defined in any cell above,
# so they must be provided before this cell runs (TODO confirm their source).
nn_embed = RNN(4, 50, num_words, 100, False, word_types)
nn_w2v = RNN(4, 50, num_words, 100, False, word_types)
nn_w2v.set_embedding_weights()

# One trainer per network, sharing the optimizer class, loss and data loaders.
nn_trainer_e = NNTrainer(
    nn_embed,
    5,
    torch.optim.Adam,
    torch.nn.CrossEntropyLoss(),
    train_loader,
    val_loader
)
# This trainer wraps nn_w2v; passing nn_embed here would leave nn_w2v unused.
nn_trainer_w = NNTrainer(
    nn_w2v,
    5,
    torch.optim.Adam,
    torch.nn.CrossEntropyLoss(),
    train_loader,
    val_loader
)