# Train SVM classifiers for poet profiling (época, century)

In [18]:
from format import *
import features
import svm_classifier
import torch
from importlib import reload

Load the training and test data, then encode the period labels as class ids. Features that turn each poem into a vector are extracted further below.

In [19]:
def _clean_and_encode(df, periods_map):
    """Drop unusable rows from one split and encode its period labels.

    Rows whose ``content`` is not a non-blank string are removed, together
    with rows holding NaN in any column. ``normdate`` is then replaced by the
    value ``periods_map`` assigns to it, and rows whose period is not a key
    of ``periods_map`` (mapped to NaN) are dropped.

    Parameters:
        df: DataFrame with at least ``content`` and ``normdate`` columns.
        periods_map: dict from period value to integer class id.

    Returns:
        A new DataFrame; the caller's frame is not modified.
    """
    has_text = df['content'].apply(lambda x: isinstance(x, str) and x.strip() != '')
    # where() blanks the rejected rows to NaN; dropna() then removes them
    # along with any row that already contained a NaN.
    df = df.where(has_text).dropna()
    df['normdate'] = df['normdate'].map(periods_map)
    return df.dropna(subset=['normdate'])


df_train, df_test = get_text_to_period()

# Encode every period as its position in get_periods(): class id 0 is
# categories[0], and so on. The classifier cells use these ids as labels.
categories = get_periods()
periodsMap = {period: idx for idx, period in enumerate(categories)}

# Both splits go through the same helper so their preprocessing cannot drift apart.
df_train = _clean_and_encode(df_train, periodsMap)
df_test = _clean_and_encode(df_test, periodsMap)

In [24]:
# How many training poems are labelled 16th century?
print(categories)
# Look the class id up rather than hard-coding its position in `categories`,
# which would silently go stale if get_periods() returned a different order.
id_16th = periodsMap[16.0]
# count() reports the number of non-NaN entries per column for the selected rows.
df_train[df_train['normdate'] == id_16th].count()

[19.0, 20.0, 18.0, 17.0, 16.0, 15.0, 14.0]


content     478
normdate    478
dtype: int64

Some features (BoW and prosodic attributes)

In [25]:
# Pull the top-1000 unigram and bigram vocabularies from the training poems
# and print both, purely to eyeball them.
unigrams, bigrams = features.get_top_n_vocab(df_train['content'], n=1000)
print(unigrams, bigrams, sep='\n')

['de', 'la', 'y', 'el', 'que', 'en', 'a', 'su', 'del', 'al', 'tu', 'con', 'mi', 'un', 'los', 'se', 'no', 'las', 'por', 'es', 'me', 'te', 'más', 'una', 'sus', 'como', 'lo', 'si', 'amor', 'sin', 'Y', 'tus', 'vida', 'ni', 'le', 'alma', 'ya', 'para', 'cuando', 'mis', 'tan', 'o', 'luz', 'cielo', 'yo', 'Dios', 'pues', 'sol', 'No', 'ser', 'ha', 'entre', 'corazón', 'mundo', 'ojos', 'En', 'El', 'sobre', 'La', 'qué', 'bien', 'muerte', 'gloria', 'tú', 'quien', 'todo', 'día', 'cual', 'porque', 'ti', 'dulce', 'son', 'mar', 'De', 'dolor', 'noche', 'donde', 'Si', 'siempre', 'sólo', 'triste', 'fue', 'Mas', 'frente', 'fin', 'tanto', 'tierra', 'voz', 'flores', 'mas', 'mano', 'Oh', 'nos', 'tiempo', 'pecho', 'hay', 'así', 'oro', 'este', 'está', 'tal', 'Por', 'mí', 'hoy', 'oh', 'mal', 'esta', 'hasta', 'él', 'viento', 'Qué', 'vez', 'mía', 'Yo', 'dos', 'ella', 'fuego', 'aunque', 'e', 'he', 'llama', 'llanto', 'bajo', 'sombra', 'vano', 'mil', 'pero', 'A', 'Es', 'hombre', 'gran', 'flor', 'otro', 'nombre', 'Que'

In [26]:
# Re-import the local `features` module so that edits made to it take effect
# in this kernel; the trailing `;` suppresses the module repr.
reload(features);

In [27]:
# TF-IDF features for the training poems. Besides the per-poem vectors, the
# call returns the vocabularies and tf/idf statistics; the test-set cell
# passes those to features.get_tfidf_test.
(
    out_vect,
    uni_vocab, bi_vocab,
    tf_uni, tf_bi,
    idf_uni, idf_bi,
) = features.get_tfidf_vocab(df_train['content'], n=1000)

# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the tensor-building cell reads it.
input = torch.Tensor(out_vect)

# Peek at the feature vector of the first poem.
print(out_vect[0])

[0.04675627173579679, 0.07806985372703036, 0.04748019708565105, 0.02941465418067815, 0.009944766503610956, 0.029775010949934146, 0.011852965630666661, 0.028629981364004353, 0.0, 0.014113269826372553, 0.0, 0.0, 0.0, 0.015830392433155736, 0.015896784343928248, 0.01611840730570224, 0.01569438970905549, 0.050611459604314776, 0.0, 0.018045138639135285, 0.0, 0.0, 0.0, 0.0, 0.02120704291572052, 0.02065239581917868, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.023193040206645824, 0.0, 0.0, 0.0, 0.023819489573695524, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025965444875741543, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02716497088500407, 0.0, 0.0, 0.0, 0.0, 0.0, 0.027654664677743093, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.028998387445823195, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.032604337249696004, 0.029687580717733135, 0.0, 0.0, 0.03312747802830147, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [54]:
# Disabled alternative feature pipeline: n-gram bag-of-words plus prosodic features.
# The code is wrapped in a string literal, so nothing below executes; the cell
# merely echoes the string. If re-enabled it would build `input` as an N x D
# list (one feature row per poem) and drop from df_train the poems for which
# prosodic feature extraction raised.
"""
input = []
counter = 0
total = df_train.count()['content']
remove = []
for text in df_train['content']:
    bow = features.text_to_bag_of_words(text, unigrams, bigrams)
    try:
        prosodic = features.features_for_sonnet(text).values()
    except Exception as e:
        # just throw the row away
        print(f"Error occurred: {e}")
        remove.append(text)
        continue
    input.append(list(bow) + list(prosodic))
    if counter % 1000 == 0:
        print(f"Constructing feature vector {counter}/{total}")
    counter += 1
for text in remove: # get rid of bad input
    df_train = df_train[df_train['content'] != text]
"""


'\ninput = []\ncounter = 0\ntotal = df_train.count()[\'content\']\nremove = []\nfor text in df_train[\'content\']:\n    bow = features.text_to_bag_of_words(text, unigrams, bigrams)\n    try:\n        prosodic = features.features_for_sonnet(text).values()\n    except Exception as e:\n        # just throw the row away\n        print(f"Error occurred: {e}")\n        remove.append(text)\n        continue\n    input.append(list(bow) + list(prosodic))\n    if counter % 1000 == 0:\n        print(f"Constructing feature vector {counter}/{total}")\n    counter += 1\nfor text in remove: # get rid of bad input\n    df_train = df_train[df_train[\'content\'] != text]\n'

In [28]:
# Build the training tensors. torch.as_tensor accepts an existing tensor as
# well as a nested list, so this works whichever feature cell produced
# `input`, and the dtype is stated explicitly instead of relying on the
# legacy torch.Tensor constructor's float32 default.
input_arr = torch.as_tensor(input, dtype=torch.float32)

# Class ids stay float32 here; the training cell casts them with .long().
label_arr = torch.as_tensor(df_train['normdate'].values, dtype=torch.float32)

# Every feature row needs exactly one label. A mismatch means the features
# and df_train were built from different row sets.
assert input_arr.shape[0] == label_arr.shape[0], (
    f"feature rows ({input_arr.shape[0]}) != labels ({label_arr.shape[0]})"
)

print(input_arr.shape)
print(label_arr.shape)

torch.Size([4037, 2000])
torch.Size([4037])


Create the SVM model and train it.

In [29]:
# Re-import the local `svm_classifier` module so that edits made to it take
# effect in this kernel; the trailing `;` suppresses the module repr.
reload(svm_classifier);

In [30]:

# Seed torch's RNG so that re-running this cell starts from the same state.
# (Only matters if MulticlassSVM draws random initial weights, which is not
# visible from this notebook.)
SEED = 0
torch.manual_seed(SEED)

# One output score per period class, one input dimension per feature.
model = svm_classifier.MulticlassSVM(input_arr.shape[1], len(categories))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

epochs = 5

# The hinge loss is fed integer class ids; cast once instead of on every epoch.
train_targets = label_arr.long()

for epoch in range(epochs):
    # Full-batch forward pass: the whole training set is scored at once.
    outputs = model(input_arr)
    loss = model.hinge_loss(outputs.float(), train_targets)
    print(f'Epoch {epoch}: Hinge Loss is {loss.item()}')

    # The reported accuracy is for the parameters *before* this epoch's update.
    with torch.no_grad():
        predictions = torch.argmax(outputs, dim=1)
        correct = (predictions == train_targets).sum().item()
        print(f'Train accuracy: {correct / label_arr.size(0):.3f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 0: Hinge Loss is 5.898813724517822
Train accuracy: 0.681
Epoch 1: Hinge Loss is 5.7176361083984375
Train accuracy: 0.689
Epoch 2: Hinge Loss is 5.536458969116211
Train accuracy: 0.689
Epoch 3: Hinge Loss is 5.355281829833984
Train accuracy: 0.689
Epoch 4: Hinge Loss is 5.174104690551758
Train accuracy: 0.689


Evaluate on test set

In [31]:
# Disabled test-set counterpart of the n-gram + prosodic feature pipeline.
# The code is wrapped in a string literal, so nothing below executes; the cell
# merely echoes the string.
# NOTE(review): the last line of the disabled code reads df_test['gender'],
# whereas the active cells in this notebook use 'normdate' -- this looks like
# a leftover from another notebook; confirm before re-enabling.
"""
input_test = []
counter = 0
total_test = df_test.count()['content']
remove_test = []
for text in df_test['content']:
    bow = features.text_to_bag_of_words(text, unigrams, bigrams)
    try:
        prosodic = features.features_for_sonnet(text).values()
    except Exception as e:
        # just throw the row away
        print(f"Error occurred: {e}")
        remove_test.append(text)
        continue
    input_test.append(list(bow) + list(prosodic))
    if counter % 1000 == 0:
        print(f"Constructing feature vector {counter}/{total_test}")
    counter += 1
for text in remove_test: # get rid of bad input
    df_test = df_test[df_test['content'] != text]
input_arr_test = torch.Tensor(input_test)
label_arr_test = torch.Tensor(df_test['gender'].values)
"""

'\ninput_test = []\ncounter = 0\ntotal_test = df_test.count()[\'content\']\nremove_test = []\nfor text in df_test[\'content\']:\n    bow = features.text_to_bag_of_words(text, unigrams, bigrams)\n    try:\n        prosodic = features.features_for_sonnet(text).values()\n    except Exception as e:\n        # just throw the row away\n        print(f"Error occurred: {e}")\n        remove_test.append(text)\n        continue\n    input_test.append(list(bow) + list(prosodic))\n    if counter % 1000 == 0:\n        print(f"Constructing feature vector {counter}/{total_test}")\n    counter += 1\nfor text in remove_test: # get rid of bad input\n    df_test = df_test[df_test[\'content\'] != text]\ninput_arr_test = torch.Tensor(input_test)\nlabel_arr_test = torch.Tensor(df_test[\'gender\'].values)\n'

In [32]:
# TF-IDF bag-of-words features only. Each test poem is vectorised with the
# vocabularies and tf/idf statistics obtained from the training set.
tfidf_state = (uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi)
test_vectors = [
    features.get_tfidf_test(poem, *tfidf_state)
    for poem in df_test['content']
]
input_arr_test = torch.Tensor(test_vectors)

label_arr_test = torch.Tensor(df_test['normdate'].values)
print(input_arr_test.shape)
print(label_arr_test.shape)

torch.Size([446, 2000])
torch.Size([446])


In [33]:
# Evaluation only: run the forward pass and the loss under no_grad as well,
# so autograd does not record a graph for the test batch that is never
# back-propagated through.
with torch.no_grad():
    test_outputs = model(input_arr_test)
    test_loss = model.hinge_loss(test_outputs.float(), label_arr_test.long())
    # Predicted class = index of the highest score in each row.
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_correct = (test_predictions == label_arr_test).sum().item()

print(f'Test loss: {test_loss.item()}')
print(f'Test accuracy: {test_correct / label_arr_test.size(0):.3f}')

Test loss: 4.985991954803467
Test accuracy: 0.693


Is this any better than always predicting the 19th century?

In [None]:
print(test_predictions)
print(label_arr_test)

# Majority-class baseline: the accuracy obtained by labelling every test poem
# as 19th century. The class id is looked up rather than assumed to be 0, and
# the comparison broadcasts against a scalar instead of allocating a zeros tensor.
id_19th = periodsMap[19.0]
always19 = (label_arr_test == id_19th).sum().item()
# Labelled as the baseline so it cannot be mistaken for the model's test accuracy.
print(f'Always-19th-century baseline accuracy: {always19 / label_arr_test.size(0):.3f}')
# Answer: no, not really. The model just predicts the 19th century for everything.

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,