# Train SVM classifiers for poet profiling (gender)

In [2]:
from format import *
import features
import svm_classifier
import torch
from importlib import reload

  from .autonotebook import tqdm as notebook_tqdm


Load the training and test data and clean both splits. Feature extraction, which turns each poem into a vector, follows in the cells below.

In [3]:
def _clean_split(df, label_map):
    """Filter one data split down to the rows usable for training/evaluation.

    Keeps rows whose 'content' is a non-blank string, replaces the 'gender'
    labels using ``label_map``, and drops every row that ends up with a
    missing 'gender', 'rhyme' or 'met' value.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    has_text = df['content'].apply(lambda x: isinstance(x, str) and x.strip() != '')
    # where() blanks out rows that fail the mask; dropna() then removes them,
    # together with any row that already had a missing value in some column.
    cleaned = df.where(has_text).dropna()
    # Labels that are not keys of label_map become NaN and are dropped below.
    cleaned['gender'] = cleaned['gender'].map(label_map)
    return cleaned.dropna(subset=['gender', 'rhyme', 'met'])


# Integer class ids used as SVM targets.
gender_map = {'M': 0, 'F': 1}

df_train, df_test = get_text_to_gender()
# Both splits go through the same helper so their preprocessing cannot drift.
df_train = _clean_split(df_train, gender_map)
df_test = _clean_split(df_test, gender_map)

In [4]:
# How many training rows are labelled female (gender == 1), per column,
# followed by a peek at the first few rows of the cleaned training frame.
is_female = df_train['gender'] == 1
female_counts = df_train.where(is_female).count()
print(female_counts)
print(df_train.head())

content    274
gender     274
rhyme      274
met        274
dtype: int64
                                             content  gender  \
0  Mi palabra tenaz como el cincel\nfue modelando...       0   
1  La muda soledad del firmamento,\ncomo un lago,...       0   
2  ¿Quién, artista sublime, conmovido\nno se sint...       0   
3  Nos pusimos de acuerdo sin habernos hablado\nC...       0   
4  Contendían los Dioses soberanos\ndel cielo, po...       0   

                                        rhyme  \
0  [A, B, B, A, A, B, B, A, C, D, C, E, D, E]   
1  [A, B, B, A, A, B, B, A, C, D, C, E, D, E]   
2  [A, B, B, A, A, B, B, A, C, D, E, C, D, E]   
3  [A, B, A, B, C, D, C, D, E, E, F, G, G, F]   
4  [A, B, B, A, A, B, B, A, C, D, E, C, D, E]   

                                                 met  
0  [--+--+---+-, +--+-+---+-, -+-+-++-++-, ---+++...  
1  [-+---+---+-, -++--+---+-, --+--+---+-, ---+--...  
2  [+-+--+---+-, +--+---+-+-, --+--+---+-, --+--+...  
3  [+-+--+---++-+-, ++--+-+

Some features (BoW and prosodic attributes)

In [5]:
# unigram and bigrams (just to check these out)
unigrams, bigrams = features.get_top_n_vocab(df_train['content'], n=1000)
print(unigrams)
print(bigrams)

['de', 'la', 'y', 'el', 'que', 'en', 'a', 'su', 'del', 'al', 'tu', 'con', 'un', 'mi', 'los', 'no', 'las', 'se', 'por', 'es', 'me', 'sus', 'una', 'más', 'te', 'como', 'lo', 'si', 'amor', 'sin', 'Y', 'tus', 'ni', 'vida', 'le', 'alma', 'ya', 'para', 'mis', 'cuando', 'o', 'tan', 'luz', 'cielo', 'yo', 'Dios', 'pues', 'sol', 'No', 'ser', 'ha', 'entre', 'mundo', 'corazón', 'bien', 'ojos', 'En', 'La', 'qué', 'sobre', 'El', 'muerte', 'quien', 'gloria', 'día', 'tú', 'ti', 'cual', 'porque', 'todo', 'dulce', 'De', 'donde', 'son', 'siempre', 'dolor', 'sólo', 'mar', 'Si', 'noche', 'fue', 'triste', 'tanto', 'Mas', 'frente', 'flores', 'voz', 'tierra', 'fin', 'mano', 'tiempo', 'mas', 'pecho', 'Oh', 'este', 'hay', 'nos', 'hoy', 'tal', 'oh', 'oro', 'así', 'está', 'Por', 'hasta', 'mí', 'viento', 'mal', 'esta', 'Qué', 'vez', 'mía', 'ella', 'Yo', 'dos', 'he', 'fuego', 'él', 'e', 'aunque', 'pero', 'bajo', 'llama', 'flor', 'A', 'llanto', 'gran', 'sombra', 'nombre', 'hombre', 'aquel', 'mil', 'Es', 'ver', 'suer

In [6]:
# Re-import the local `features` module so that edits made to it on disk are
# picked up without restarting the kernel. The trailing ';' suppresses the
# module repr that the cell would otherwise display.
reload(features);

In [7]:
# TF-IDF features for the training poems. The vocabularies and tf/idf
# values returned alongside the vectors are passed to
# features.get_tfidf_test when the test set is featurised.
tfidf_result = features.get_tfidf_vocab(df_train['content'], n = 2000)
out_vect, uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi = tfidf_result

# Rhyme-scheme / meter vectors for the same poems.
rm_vect = features.get_rhyme_meter_vectors(df_train['rhyme'], df_train['met'])
rm_tensor = torch.Tensor(rm_vect)

# Column-wise concatenation: TF-IDF block first, rhyme/meter block second.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the next cell reads it.
tfidf_tensor = torch.Tensor(out_vect)
input = torch.hstack((tfidf_tensor, rm_tensor))
print(rm_vect)

[[1, 2, 2, 1, 1, 2, 2, 1, 3, 4, 3, 5, 4, 5, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 2, 2, 1, 1, 2, 2, 1, 3, 4, 3, 5, 4, 5, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 

In [8]:
# Training design matrix (one row per poem) and the matching label vector.
input_arr = input

# torch.Tensor(...) produces a float tensor; the loss calls cast it with
# .long() where integer class ids are needed.
gender_values = df_train['gender'].values
label_arr = torch.Tensor(gender_values)

# Sanity check: both tensors should have the same number of rows.
print(input_arr.shape, label_arr.shape, sep='\n')

torch.Size([4015, 4224])
torch.Size([4015])


Create SVM model.

In [9]:
# Re-import the local `svm_classifier` module so that edits made to it on
# disk are picked up without restarting the kernel. The trailing ';'
# suppresses the module repr that the cell would otherwise display.
reload(svm_classifier);

In [16]:

# Two-class SVM over the combined feature vector, trained with plain SGD.
n_features = input_arr.shape[1]
model = svm_classifier.MulticlassSVM(n_features, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

epochs = 10
n_train = label_arr.size(0)

# Each epoch is a single update on the whole training set (no mini-batches).
for epoch in range(epochs):
    # Clear the gradients left over from the previous update.
    optimizer.zero_grad()

    outputs = model(input_arr)
    loss = model.hinge_loss(outputs.float(), label_arr.long())
    print(f'Epoch {epoch}: Hinge Loss is {loss.item()}')

    # Accuracy is measured on the scores computed *before* this epoch's update.
    with torch.no_grad():
        predictions = outputs.argmax(dim=1)
        correct = (predictions == label_arr).sum().item()
    print(f'Train accuracy: {correct / n_train:.3f}')

    loss.backward()
    optimizer.step()

Epoch 0: Hinge Loss is 0.8842678666114807
Train accuracy: 0.895
Epoch 1: Hinge Loss is 0.2474053055047989
Train accuracy: 0.929
Epoch 2: Hinge Loss is 0.23441006243228912
Train accuracy: 0.929
Epoch 3: Hinge Loss is 0.22153319418430328
Train accuracy: 0.929
Epoch 4: Hinge Loss is 0.2088039517402649
Train accuracy: 0.929
Epoch 5: Hinge Loss is 0.19647328555583954
Train accuracy: 0.929
Epoch 6: Hinge Loss is 0.18482638895511627
Train accuracy: 0.929
Epoch 7: Hinge Loss is 0.1745482087135315
Train accuracy: 0.929
Epoch 8: Hinge Loss is 0.16621847450733185
Train accuracy: 0.929
Epoch 9: Hinge Loss is 0.16222257912158966
Train accuracy: 0.929


Evaluate on test set

In [17]:
# Featurise the test poems with the vocabulary and tf/idf values obtained
# from the training set:
#   uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi
test_tfidf_rows = [
    features.get_tfidf_test(text, uni_vocab, bi_vocab, tf_uni, tf_bi, idf_uni, idf_bi)
    for text in df_test['content']
]
test_tfidf = torch.Tensor(test_tfidf_rows)

# Rhyme/meter vectors, built the same way as for the training split.
test_rm_vect = features.get_rhyme_meter_vectors(df_test['rhyme'], df_test['met'])
test_rm = torch.Tensor(test_rm_vect)

# Same column layout as the training matrix: TF-IDF block, then rhyme/meter.
input_arr_test = torch.hstack((test_tfidf, test_rm))

label_arr_test = torch.Tensor(df_test['gender'].values)
print(input_arr_test.shape, label_arr_test.shape, sep='\n')

torch.Size([445, 4224])
torch.Size([445])


In [18]:
# Evaluate the trained model on the held-out test split.
# The forward pass and the loss are computed under no_grad as well: nothing
# is back-propagated here, so there is no reason to record an autograd graph
# or to leave test_outputs attached to one.
with torch.no_grad():
    test_outputs = model(input_arr_test)
    test_loss = model.hinge_loss(test_outputs.float(), label_arr_test.long())
    # Predicted class = index of the highest score in each row.
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_correct = (test_predictions == label_arr_test).sum().item()

print(f'Test loss: {test_loss.item()}')
print(f'Test accuracy: {test_correct / label_arr_test.size(0):.3f}')

Test loss: 0.15789827704429626
Test accuracy: 0.933


Is this any better than always predicting male?

In [19]:
# Compare the model against a trivial baseline that labels every poem as
# male (class 0).
print(test_predictions)
print(label_arr_test)

# Number of test poems whose true label is 0, i.e. the baseline's hit count.
alwaysM = (label_arr_test == 0).sum().item()
n_test = label_arr_test.size(0)
print(f'Test accuracy: {alwaysM / n_test:.3f}')
# Answer: no, not really. The model predicts male for every test poem
# anyway, so it does no better than the baseline.

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Let's consider class weighting so that the majority class doesn't dominate the model.

In [56]:
# Pick up any edits made to svm_classifier.py since it was last imported.
reload(svm_classifier);

# Samples per class; index 0 is 'M' and index 1 is 'F' (see gender_map).
# label_arr is a float tensor, and bincount only accepts integer input, so
# the labels are cast with .long() before counting. The result is converted
# to a NumPy array so it prints and behaves as before.
class_count = torch.bincount(label_arr.long(), minlength=len(gender_map)).numpy()

# Normalized inverse class frequency: n_samples / (n_classes * count_c).
# The rarer a class, the larger its weight.
# NOTE(review): a class with zero samples would give an infinite weight here.
class_weights = torch.Tensor(len(label_arr) / (len(gender_map) * class_count))
print(class_count)

print(class_weights)


[3741  274]
tensor([0.5366, 7.3266])


In [57]:
# Fresh model and optimizer, this time trained with the class-weighted
# hinge loss.
model = svm_classifier.MulticlassSVM(input_arr.shape[1], 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

epochs = 8

for epoch in range(epochs):
    # Reset gradients before this epoch's single whole-dataset update.
    optimizer.zero_grad()

    outputs = model(input_arr)
    loss = model.hinge_loss(
        outputs.float(),
        label_arr.long(),
        class_weights=class_weights,
    )
    print(f'Epoch {epoch}: Hinge Loss is {loss.item()}')

    # Unweighted accuracy on the scores computed before the update.
    with torch.no_grad():
        predictions = torch.argmax(outputs, dim=1)
        correct = (predictions == label_arr).sum().item()
        train_accuracy = correct / label_arr.size(0)
    print(f'Train accuracy: {train_accuracy:.3f}')

    loss.backward()
    optimizer.step()

Epoch 0: Hinge Loss is 0.9965729713439941
Train accuracy: 0.733
Epoch 1: Hinge Loss is 0.9952638745307922
Train accuracy: 0.701
Epoch 2: Hinge Loss is 0.9939548373222351
Train accuracy: 0.673
Epoch 3: Hinge Loss is 0.992645800113678
Train accuracy: 0.639
Epoch 4: Hinge Loss is 0.9913366436958313
Train accuracy: 0.603
Epoch 5: Hinge Loss is 0.9900277256965637
Train accuracy: 0.573
Epoch 6: Hinge Loss is 0.9887186288833618
Train accuracy: 0.545
Epoch 7: Hinge Loss is 0.9874095320701599
Train accuracy: 0.515


In [58]:
# Evaluate the class-weighted model on the held-out test split.
# The forward pass and the loss are computed under no_grad as well: nothing
# is back-propagated here, so there is no reason to record an autograd graph
# or to leave test_outputs attached to one.
with torch.no_grad():
    test_outputs = model(input_arr_test)
    # Same class weights as in training, so the two losses are comparable.
    test_loss = model.hinge_loss(test_outputs.float(), label_arr_test.long(), class_weights=class_weights)
    # Predicted class = index of the highest score in each row.
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_correct = (test_predictions == label_arr_test).sum().item()

print(f'Test loss: {test_loss.item()}')
print(f'Test accuracy: {test_correct / label_arr_test.size(0):.3f}')

Test loss: 0.9815561771392822
Test accuracy: 0.443


In [59]:
# Show the predicted class ids (0 = M, 1 = F) from the class-weighted model,
# to check whether it still predicts a single class for every test poem.
print(test_predictions)

tensor([1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
        0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
        1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,