# Train SVM classifiers for poet profiling (country)

In [29]:
from format import *
import features
import svm_classifier
import torch
from importlib import reload

Load the training and test data, then map each country name to an integer label. Feature extraction (turning each poem into a vector) follows below.

In [30]:
def _drop_blank_poems(df):
    """Return the rows of ``df`` whose 'content' is a non-blank string.

    Rows with a missing value in any column are dropped as well, which keeps
    the row set identical to the previous ``where(...).dropna()`` filtering.
    """
    has_text = df['content'].apply(lambda x: isinstance(x, str) and x.strip() != '')
    # Select with a boolean mask instead of NaN-masking rows and dropping them.
    return df[has_text].dropna()


def _encode_country(df, label_map):
    """Replace 'country-birth' names with their integer ids from ``label_map``.

    Rows whose country is not a key of ``label_map`` map to NaN and are dropped.
    A new frame is returned; ``df`` itself is left untouched.
    """
    encoded = df.assign(**{'country-birth': df['country-birth'].map(label_map)})
    return encoded.dropna(subset=['country-birth'])


df_train, df_test = get_text_to_country_of_origin()
df_train = _drop_blank_poems(df_train)
df_test = _drop_blank_poems(df_test)

# Integer id of a country = its position in the `countries` list.
countries = get_countries()
countries_map = {name: idx for idx, name in enumerate(countries)}

# Train and test go through exactly the same label encoding.
df_train = _encode_country(df_train, countries_map)
df_test = _encode_country(df_test, countries_map)

In [31]:
# Show the label vocabulary, then count the training poems carrying label 0
# (the first entry of `countries`).
print(countries)
first_country_rows = df_train[df_train['country-birth'] == 0]
first_country_rows.count()

['Filipinas', 'Cuba', 'España', 'Ecuador', 'Portugal', 'México', 'Honduras', 'Italia', 'Brasil', 'Chile', 'Nicaragua', 'Perú', 'Argentina', 'Venezuela', 'Uruguay', 'Puerto Rico', 'Haití', 'Costa Rica', 'Colombia', 'República Dominicana', 'Bolivia', 'Paraguay', 'Panamá']


content          203
country-birth    203
dtype: int64

Extract some features (bag-of-words and prosodic attributes).

In [32]:
# Unigram and bigram vocabularies from features.get_top_n_vocab (n=2000),
# printed only to eyeball them.
unigrams, bigrams = features.get_top_n_vocab(df_train['content'], n=2000)
for vocab in (unigrams, bigrams):
    print(vocab)

['de', 'y', 'la', 'el', 'que', 'en', 'a', 'su', 'del', 'al', 'tu', 'con', 'un', 'mi', 'los', 'no', 'las', 'se', 'por', 'es', 'me', 'más', 'sus', 'como', 'una', 'te', 'lo', 'si', 'amor', 'sin', 'Y', 'vida', 'tus', 'ni', 'le', 'alma', 'para', 'ya', 'mis', 'cuando', 'tan', 'o', 'luz', 'cielo', 'yo', 'Dios', 'pues', 'sol', 'ser', 'ha', 'No', 'entre', 'corazón', 'mundo', 'ojos', 'sobre', 'En', 'El', 'La', 'qué', 'bien', 'muerte', 'tú', 'quien', 'día', 'todo', 'cual', 'gloria', 'ti', 'porque', 'dulce', 'mar', 'dolor', 'son', 'De', 'sólo', 'siempre', 'donde', 'noche', 'Si', 'fue', 'triste', 'frente', 'Mas', 'tanto', 'fin', 'voz', 'flores', 'mano', 'tierra', 'Oh', 'tiempo', 'mas', 'hay', 'pecho', 'este', 'nos', 'tal', 'oh', 'hoy', 'está', 'mí', 'así', 'oro', 'viento', 'mía', 'hasta', 'Por', 'él', 'mal', 'Qué', 'vez', 'esta', 'dos', 'Yo', 'ella', 'fuego', 'aunque', 'he', 'llanto', 'e', 'pero', 'bajo', 'A', 'gran', 'mil', 'hombre', 'llama', 'nombre', 'vano', 'sombra', 'aquel', 'Que', 'otro', 'fl

In [33]:
# Re-import the local `features` module so edits made to it on disk take effect.
features = reload(features)

In [None]:
# Build the TF-IDF vectors for the training poems. Besides the vectors, the
# call returns the vocabularies and TF/IDF statistics that a later cell passes
# to features.get_tfidf_test for the test poems.
tfidf_fit = features.get_tfidf_vocab(df_train['content'], n=2000)
out_vect, uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi = tfidf_fit

# NOTE: `input` shadows the Python builtin, but a later cell reads this name.
input = torch.Tensor(out_vect)
print(out_vect[0])

In [8]:
# Disabled alternative feature pipeline: n-gram bag-of-words plus prosodic
# features, building an N x D list of feature vectors (one per poem).
# The code is wrapped in a bare string literal, so none of it is executed;
# the notebook only echoes the string back as this cell's output.
"""
input = []
counter = 0
total = df_train.count()['content']
remove = []
for text in df_train['content']:
    bow = features.text_to_bag_of_words(text, unigrams, bigrams)
    try:
        prosodic = features.features_for_sonnet(text).values()
    except Exception as e:
        # just throw the row away
        print(f"Error occurred: {e}")
        remove.append(text)
        continue
    input.append(list(bow) + list(prosodic))
    if counter % 1000 == 0:
        print(f"Constructing feature vector {counter}/{total}")
    counter += 1
for text in remove: # get rid of bad input
    df_train = df_train[df_train['content'] != text]
"""


'\ninput = []\ncounter = 0\ntotal = df_train.count()[\'content\']\nremove = []\nfor text in df_train[\'content\']:\n    bow = features.text_to_bag_of_words(text, unigrams, bigrams)\n    try:\n        prosodic = features.features_for_sonnet(text).values()\n    except Exception as e:\n        # just throw the row away\n        print(f"Error occurred: {e}")\n        remove.append(text)\n        continue\n    input.append(list(bow) + list(prosodic))\n    if counter % 1000 == 0:\n        print(f"Constructing feature vector {counter}/{total}")\n    counter += 1\nfor text in remove: # get rid of bad input\n    df_train = df_train[df_train[\'content\'] != text]\n'

In [9]:
# Training tensors: the feature matrix and the per-poem country ids.
label_values = df_train['country-birth'].values
input_arr, label_arr = torch.Tensor(input), torch.Tensor(label_values)

# Sanity check: both tensors must agree on the number of poems.
for arr in (input_arr, label_arr):
    print(arr.shape)

torch.Size([4044, 2000])
torch.Size([4044])


Create the SVM model and train it.

In [10]:
# Re-import the local `svm_classifier` module so edits made to it on disk take effect.
svm_classifier = reload(svm_classifier)

In [24]:

# Multiclass SVM sized by the feature dimension and the number of countries.
n_features = input_arr.shape[1]
model = svm_classifier.MulticlassSVM(n_features, len(countries))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

epochs = 5
n_train = label_arr.size(0)

# Full-batch training: every epoch is one gradient step over all training poems.
for epoch in range(epochs):
    outputs = model(input_arr)
    loss = model.hinge_loss(outputs.float(), label_arr.long())
    print(f'Epoch {epoch}: Hinge Loss is {loss.item()}')

    # Accuracy is measured on the scores computed before this epoch's update.
    with torch.no_grad():
        predictions = outputs.argmax(dim=1)
        correct = predictions.eq(label_arr).sum().item()
    print(f'Train accuracy: {correct / n_train:.3f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 0: Hinge Loss is 21.71601104736328
Train accuracy: 0.130
Epoch 1: Hinge Loss is 20.20848846435547
Train accuracy: 0.537
Epoch 2: Hinge Loss is 18.700971603393555
Train accuracy: 0.537
Epoch 3: Hinge Loss is 17.19345474243164
Train accuracy: 0.537
Epoch 4: Hinge Loss is 15.68593692779541
Train accuracy: 0.537


Evaluate on the test set.

In [25]:
# Disabled test-set counterpart of the n-gram + prosodic feature pipeline.
# The code is wrapped in a bare string literal, so none of it is executed;
# the notebook only echoes the string back as this cell's output.
# NOTE(review): the disabled code reads labels from df_test['gender'], whereas
# the active cells of this notebook use 'country-birth' -- fix before re-enabling.
"""
input_test = []
counter = 0
total_test = df_test.count()['content']
remove_test = []
for text in df_test['content']:
    bow = features.text_to_bag_of_words(text, unigrams, bigrams)
    try:
        prosodic = features.features_for_sonnet(text).values()
    except Exception as e:
        # just throw the row away
        print(f"Error occurred: {e}")
        remove_test.append(text)
        continue
    input_test.append(list(bow) + list(prosodic))
    if counter % 1000 == 0:
        print(f"Constructing feature vector {counter}/{total_test}")
    counter += 1
for text in remove_test: # get rid of bad input
    df_test = df_test[df_test['content'] != text]
input_arr_test = torch.Tensor(input_test)
label_arr_test = torch.Tensor(df_test['gender'].values)
"""

'\ninput_test = []\ncounter = 0\ntotal_test = df_test.count()[\'content\']\nremove_test = []\nfor text in df_test[\'content\']:\n    bow = features.text_to_bag_of_words(text, unigrams, bigrams)\n    try:\n        prosodic = features.features_for_sonnet(text).values()\n    except Exception as e:\n        # just throw the row away\n        print(f"Error occurred: {e}")\n        remove_test.append(text)\n        continue\n    input_test.append(list(bow) + list(prosodic))\n    if counter % 1000 == 0:\n        print(f"Constructing feature vector {counter}/{total_test}")\n    counter += 1\nfor text in remove_test: # get rid of bad input\n    df_test = df_test[df_test[\'content\'] != text]\ninput_arr_test = torch.Tensor(input_test)\nlabel_arr_test = torch.Tensor(df_test[\'gender\'].values)\n'

In [26]:
# just tfidf bow
# Encode every test poem with the vocabularies and TF/IDF statistics obtained
# from the training poems (uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi).
test_vectors = []
for poem in df_test['content']:
    test_vectors.append(
        features.get_tfidf_test(poem, uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi)
    )
input_arr_test = torch.Tensor(test_vectors)
label_arr_test = torch.Tensor(df_test['country-birth'].values)

# Sanity check: both tensors must agree on the number of test poems.
for arr in (input_arr_test, label_arr_test):
    print(arr.shape)

torch.Size([439, 2000])
torch.Size([439])


In [27]:
# Evaluate the trained model on the test poems.
# The whole evaluation runs under torch.no_grad(): nothing is back-propagated
# here, so there is no reason to record an autograd graph for the forward pass
# or the loss. The printed loss and accuracy are unchanged.
with torch.no_grad():
    test_outputs = model(input_arr_test)
    test_loss = model.hinge_loss(test_outputs.float(), label_arr_test.long())
    # Predicted country = index of the highest score in each row.
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_correct = (test_predictions == label_arr_test).sum().item()

print(f'Test loss: {test_loss.item()}')
print(f'Test accuracy: {test_correct / label_arr_test.size(0):.3f}')

Test loss: 13.977718353271484
Test accuracy: 0.549


Is this any better than always predicting Spain (the most frequent country in the dataset)?

In [28]:
print(test_predictions)
print(label_arr_test)
# Baseline: the accuracy obtained by answering 'España' for every test poem,
# i.e. the share of test labels equal to España's id.
spain_label = countries_map['España']
alwaysSpain = (label_arr_test == spain_label).sum().item()
print(f'Test accuracy: {alwaysSpain / label_arr_test.size(0):.3f}')
# Answer: no, not really. It always predicts Spain :o

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,