# Import Libraries

In [8]:
import pandas as pd
import numpy as np

import nltk

from collections import Counter

# Load Data

In [2]:
# Read the tab-separated training file into a DataFrame.
train_path = '../data/train.tsv'
df = pd.read_csv(train_path, sep='\t')

# View the rows as a record array of (text, label) pairs, dropping the index,
# and peek at the first three.
train_data = df.to_records(index=False)
train_data[:3]

rec.array([('nano composites were prepared by dispersing barium titanate  BT  and/or oMMT clay particles in epoxy resin.', 'action'),
           ('In this process, the magnetic stirring lasted 2 h at room temperature.', 'action'),
           ('This coating heating cycle was repeated several times to achieve the desired film thickness', 'action')],
          dtype=[('text', 'O'), ('label', 'O')])

## Preprocess

In [3]:
from sentence_level_preprocess import *

In [4]:
# Normalize every sentence in a single pass; labels are carried along unchanged:
#   1. rm_linebreaks: undo hyphenated line breaks (e.g. "elec- tron" -> "electron")
#   2. lower-case the result
train_data = [
    (rm_linebreaks(sentence).lower(), label)
    for sentence, label in train_data
]

## Tokenize

In [5]:
# Replace each sentence string with its list of NLTK word tokens;
# the label of every pair is left as-is.
train_data = [
    (nltk.tokenize.word_tokenize(sentence), label)
    for sentence, label in train_data
]

## Index words

In [6]:
# Split the (tokens, label) pairs into two parallel lists.
train_texts = [tokens for tokens, _ in train_data]
train_targets = [label for _, label in train_data]

# Flatten the token lists: one entry per token occurrence, in corpus order.
train_word_pool = [token for tokens in train_texts for token in tokens]

In [9]:
# (item, count) pairs ordered from most to least frequent.
# Counter.most_common() with no argument returns all items in that order,
# with ties kept in first-seen order.
word_counts = Counter(train_word_pool).most_common()
label_counts = Counter(train_targets).most_common()

In [10]:
# Vocabulary in descending-frequency order; a word's index is its frequency rank
# (0 = most frequent).
word_types = [word for word, _ in word_counts]
word_type2idx = dict(zip(word_types, range(len(word_types))))

# Same scheme for the labels.
label_types = [label for label, _ in label_counts]
label_type2idx = dict(zip(label_types, range(len(label_types))))

In [11]:
# Map every token and every label to its integer index.
# All lookups succeed because both vocabularies were built from this same data.
train_data_idx = [
    ([word_type2idx[token] for token in tokens], label_type2idx[label])
    for tokens, label in train_data
]
train_data_idx[:3]

[([449, 44, 8, 37, 16, 387, 547, 260, 232, 963, 964, 1494, 35, 12, 28, 31, 0],
  0),
 ([109, 64, 116, 2, 1, 233, 97, 1495, 41, 19, 14, 60, 30, 0], 0),
 ([123, 965, 290, 697, 7, 291, 292, 124, 11, 698, 1, 293, 160, 87], 0)]

In [12]:
# Inspect the label -> index mapping (indices follow descending label frequency).
# NOTE(review): rows whose label is missing (NaN) end up as their own class here.
# Confirm that treating "no label" as a separate class is intended.
label_type2idx

{'action': 0, 'constituent': 1, nan: 2, 'property': 3}

## Featurize

In [13]:
from featurize import *

In [14]:
# Vocabulary size: the number of distinct word types seen in the training data.
V = len(word_type2idx)

# Featurize each index sequence with `bow` (star-imported from featurize);
# presumably it yields a length-V bag-of-words vector -- confirm in featurize.py.
train_data_bow = [
    (bow(word_ids, V), label_idx)
    for word_ids, label_idx in train_data_idx
]

## Train test (validation) split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Separate the (features, label index) pairs into the parallel lists
# that scikit-learn expects.
X = [features for features, _ in train_data_bow]
y = [label_idx for _, label_idx in train_data_bow]

In [17]:
# Hold out 20% of the data for validation.
# The split is seeded so that the score reported below is reproducible after
# Restart & Run All; without a seed every run shuffles the data differently.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn Classifiers

## Logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
# Train a logistic regression with scikit-learn's default settings.
# `fit` returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

LogisticRegression()

In [20]:
# Evaluate the fitted model on the held-out validation split
# (for scikit-learn classifiers, `score` reports mean accuracy).
model.score(X_val, y_val)

0.8287292817679558

In [21]:
from sklearn.metrics import f1_score

In [22]:
# Repeated random sub-sampling validation: refit on `num_trial` different
# 80/20 splits to see how much the score depends on the particular split.
num_trial = 20
f1s = []

for i in range(num_trial):
    # Seed each split with the trial index: every trial gets a different
    # partition, yet re-running the cell reproduces exactly the same numbers.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Micro-averaged F1 pools all classes together; for single-label
    # multiclass predictions like these it coincides with accuracy.
    f1 = f1_score(y_val, y_pred, average='micro')
    f1s.append(f1)

# Mean and (population) standard deviation of the score across trials.
print("mean:", np.mean(f1s))
print("std :", np.std(f1s))

mean: 0.8383977900552486
std : 0.027227817558917622


## SVM

## Attention