# Train SVM classifiers for poet profiling (gender)

In [1]:
from format import *
import features
import svm_classifier
import torch
from importlib import reload

  from .autonotebook import tqdm as notebook_tqdm


Load in the training data. Then extract features to turn poem into vector.

In [2]:
df_train, df_test = get_text_to_gender()
df_train = df_train.where(df_train['content'].apply(lambda x: isinstance(x, str) and x.strip() != '')).dropna()

df_test = df_test.where(df_test['content'].apply(lambda x: isinstance(x, str) and x.strip() != '')).dropna()

gender_map = {'M': 0, 'F': 1}
df_train['gender'] = df_train['gender'].map(gender_map)
df_train = df_train.dropna(subset=['gender'])

df_test['gender'] = df_test['gender'].map(gender_map)
df_test = df_test.dropna(subset=['gender'])

In [3]:
# what does F vs M train split look like
df_train.where(df_train['gender']==1).count()

content    275
gender     275
dtype: int64

Some features (BoW and prosodic attributes)

In [18]:
# unigram and bigrams (just to check these out)
unigrams, bigrams = features.get_top_n_vocab(df_train['content'], n=1000)
print(unigrams)
print(bigrams)

['de', 'y', 'la', 'el', 'que', 'en', 'a', 'su', 'del', 'al', 'tu', 'mi', 'con', 'un', 'los', 'se', 'las', 'no', 'por', 'es', 'me', 'más', 'sus', 'te', 'una', 'como', 'lo', 'si', 'amor', 'sin', 'Y', 'ni', 'vida', 'tus', 'le', 'alma', 'para', 'ya', 'mis', 'tan', 'cuando', 'o', 'luz', 'cielo', 'yo', 'pues', 'Dios', 'sol', 'ser', 'ha', 'entre', 'mundo', 'No', 'corazón', 'En', 'ojos', 'sobre', 'bien', 'La', 'El', 'qué', 'gloria', 'muerte', 'todo', 'tú', 'día', 'quien', 'cual', 'porque', 'ti', 'dulce', 'son', 'mar', 'De', 'donde', 'dolor', 'sólo', 'Si', 'noche', 'siempre', 'fue', 'tanto', 'triste', 'Mas', 'frente', 'fin', 'flores', 'voz', 'tiempo', 'tierra', 'mano', 'pecho', 'este', 'Oh', 'mas', 'hay', 'está', 'oh', 'tal', 'así', 'nos', 'oro', 'mí', 'hoy', 'Por', 'mal', 'hasta', 'mía', 'Qué', 'esta', 'vez', 'él', 'viento', 'Yo', 'dos', 'ella', 'e', 'aunque', 'he', 'fuego', 'llanto', 'llama', 'bajo', 'flor', 'aquel', 'pero', 'A', 'Es', 'Que', 'hombre', 'vano', 'gran', 'suerte', 'sombra', 'Ya'

In [19]:
reload(features);

In [53]:
out_vect, uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi = features.get_tfidf_vocab(df_train['content'], n = 1000)
input = torch.Tensor(out_vect)
print(out_vect[0])

[0.02365287618612987, 0.036037715838459376, 0.024732910667101284, 0.037173165405216306, 0.050271477680536504, 0.012570995831923715, 0.04507850503755797, 0.018213466961545688, 0.01747164487950464, 0.035772233432933474, 0.0, 0.022782397288660487, 0.03649856520305132, 0.0, 0.0, 0.04074116755804655, 0.04246851570945095, 0.0, 0.0, 0.0, 0.026778323003825297, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0281240820259581, 0.025779089136242987, 0.027397393919603795, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030871886699752455, 0.0299424030659486, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03673832451851364, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.039087629740822694, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040739716631904135, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03781082718172574, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04184249012318385, 0.0, 0.0, 0.0, 0.0, 0.04293003008151545, 0.0, 0

In [54]:
# compute ngram and prosodic features
# N x D
"""
input = []
counter = 0
total = df_train.count()['content']
remove = []
for text in df_train['content']:
    bow = features.text_to_bag_of_words(text, unigrams, bigrams)
    try:
        prosodic = features.features_for_sonnet(text).values()
    except Exception as e:
        # just throw the row away
        print(f"Error occurred: {e}")
        remove.append(text)
        continue
    input.append(list(bow) + list(prosodic))
    if counter % 1000 == 0:
        print(f"Constructing feature vector {counter}/{total}")
    counter += 1
for text in remove: # get rid of bad input
    df_train = df_train[df_train['content'] != text]
"""


'\ninput = []\ncounter = 0\ntotal = df_train.count()[\'content\']\nremove = []\nfor text in df_train[\'content\']:\n    bow = features.text_to_bag_of_words(text, unigrams, bigrams)\n    try:\n        prosodic = features.features_for_sonnet(text).values()\n    except Exception as e:\n        # just throw the row away\n        print(f"Error occurred: {e}")\n        remove.append(text)\n        continue\n    input.append(list(bow) + list(prosodic))\n    if counter % 1000 == 0:\n        print(f"Constructing feature vector {counter}/{total}")\n    counter += 1\nfor text in remove: # get rid of bad input\n    df_train = df_train[df_train[\'content\'] != text]\n'

In [55]:
input_arr = torch.Tensor(input)

label_arr = torch.Tensor(df_train['gender'].values)

print(input_arr.shape)
print(label_arr.shape)

torch.Size([4036, 2000])
torch.Size([4036])


Create SVM model.

In [56]:
reload(svm_classifier);

In [58]:

model = svm_classifier.MulticlassSVM(input_arr.shape[1], 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

epochs = 5

for epoch in range(epochs):
    outputs = model(input_arr)
    loss = model.hinge_loss(outputs.float(), label_arr.long())
    print(f'Epoch {epoch}: Hinge Loss is {loss.item()}')
    with torch.no_grad():
        predictions = torch.argmax(outputs, dim=1)
        correct = (predictions == label_arr).sum().item()
        print(f'Train accuracy: {correct / label_arr.size(0):.3f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 0: Hinge Loss is 1.004929542541504
Train accuracy: 0.236
Epoch 1: Hinge Loss is 0.9897453784942627
Train accuracy: 0.896
Epoch 2: Hinge Loss is 0.9745612740516663
Train accuracy: 0.932
Epoch 3: Hinge Loss is 0.959377110004425
Train accuracy: 0.932
Epoch 4: Hinge Loss is 0.9441930651664734
Train accuracy: 0.932


Evaluate on test set

In [59]:
# compute ngram and prosodic features
# for test set
"""
input_test = []
counter = 0
total_test = df_test.count()['content']
remove_test = []
for text in df_test['content']:
    bow = features.text_to_bag_of_words(text, unigrams, bigrams)
    try:
        prosodic = features.features_for_sonnet(text).values()
    except Exception as e:
        # just throw the row away
        print(f"Error occurred: {e}")
        remove_test.append(text)
        continue
    input_test.append(list(bow) + list(prosodic))
    if counter % 1000 == 0:
        print(f"Constructing feature vector {counter}/{total_test}")
    counter += 1
for text in remove_test: # get rid of bad input
    df_test = df_test[df_test['content'] != text]
input_arr_test = torch.Tensor(input_test)
label_arr_test = torch.Tensor(df_test['gender'].values)
"""

'\ninput_test = []\ncounter = 0\ntotal_test = df_test.count()[\'content\']\nremove_test = []\nfor text in df_test[\'content\']:\n    bow = features.text_to_bag_of_words(text, unigrams, bigrams)\n    try:\n        prosodic = features.features_for_sonnet(text).values()\n    except Exception as e:\n        # just throw the row away\n        print(f"Error occurred: {e}")\n        remove_test.append(text)\n        continue\n    input_test.append(list(bow) + list(prosodic))\n    if counter % 1000 == 0:\n        print(f"Constructing feature vector {counter}/{total_test}")\n    counter += 1\nfor text in remove_test: # get rid of bad input\n    df_test = df_test[df_test[\'content\'] != text]\ninput_arr_test = torch.Tensor(input_test)\nlabel_arr_test = torch.Tensor(df_test[\'gender\'].values)\n'

In [60]:
# just tfidf bow
# vars for ref: uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi
input_arr_test = torch.Tensor([features.get_tfidf_test(text, uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi) for text in df_test['content']])
    
label_arr_test = torch.Tensor(df_test['gender'].values)
print(input_arr_test.shape)
print(label_arr_test.shape)

torch.Size([447, 2000])
torch.Size([447])


In [61]:
test_outputs = model(input_arr_test)
test_loss = model.hinge_loss(test_outputs.float(), label_arr_test.long())
print(f'Test loss: {test_loss.item()}')
with torch.no_grad():
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_correct = (test_predictions == label_arr_test).sum().item()
    print(f'Test accuracy: {test_correct / label_arr_test.size(0):.3f}')

Test loss: 0.9285615682601929
Test accuracy: 0.933


Is this any better than always predicting male?

In [62]:
print(test_predictions)
print(label_arr_test)
alwaysM = (label_arr_test == torch.zeros(label_arr_test.shape)).sum().item()
print(f'Test accuracy: {alwaysM / label_arr_test.size(0):.3f}')
# Answer: no, not really. It always predicts male anyways :o

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,