In [3]:
import torch

import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
len(tokenizer.vocab)

30522

In [6]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [7]:
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [8]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [9]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [10]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [11]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [12]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [13]:
from torchtext.legacy import data

TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
#cEXT,cNEU,cAGR,cCON,cOPN
EXT = data.LabelField(dtype = torch.float)
NEU = data.LabelField(dtype = torch.float)
AGR = data.LabelField(dtype = torch.float)
CON = data.LabelField(dtype = torch.float)
OPN = data.LabelField(dtype = torch.float)


In [14]:
# from torchtext.legacy import datasets

# train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [15]:
from torchtext.legacy import datasets

# LABEL = data.Field()
# SAYING = data.Field()

#cEXT,cNEU,cAGR,cCON,cOPN
fields = {'TEXT': ('text', TEXT), 'cEXT': ('EXT', EXT), 'cNEU': ('NEU', NEU), 'cAGR': ('AGR', AGR), 'cCON': ('CON', CON), 'cOPN': ('OPN', OPN)}


In [16]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                            path = 'data/k-fold_raw',
                            train = 'train_1.csv',
                            validation='valid_1.csv',
                            test = 'test.csv',
                            format = 'csv',
                            fields = fields
)


In [17]:

TEXT.build_vocab(train_data)


In [38]:
vocab = dict(TEXT.vocab.stoi)
vocab

{100: 0,
 0: 1,
 101: 2,
 102: 3,
 1045: 4,
 1012: 5,
 2000: 6,
 1010: 7,
 1005: 8,
 1996: 9,
 1998: 10,
 2026: 11,
 2008: 12,
 1037: 13,
 2009: 14,
 2003: 15,
 1997: 16,
 1056: 17,
 1999: 18,
 2031: 19,
 2033: 20,
 2021: 21,
 2061: 22,
 2023: 23,
 2005: 24,
 2572: 25,
 2022: 26,
 1055: 27,
 2428: 28,
 2074: 29,
 1049: 30,
 2025: 31,
 2066: 32,
 2001: 33,
 2006: 34,
 2055: 35,
 2079: 36,
 2007: 37,
 2123: 38,
 2054: 39,
 2012: 40,
 2064: 41,
 2035: 42,
 2002: 43,
 2138: 44,
 2228: 45,
 2024: 46,
 2085: 47,
 2131: 48,
 2113: 49,
 2016: 50,
 2027: 51,
 2065: 52,
 2041: 53,
 2183: 54,
 2175: 55,
 2051: 56,
 2030: 57,
 2017: 58,
 2057: 59,
 2039: 60,
 2097: 61,
 2004: 62,
 2043: 63,
 2215: 64,
 2052: 65,
 2045: 66,
 1029: 67,
 2111: 68,
 2014: 69,
 2028: 70,
 2172: 71,
 2514: 72,
 2129: 73,
 2157: 74,
 2182: 75,
 2092: 76,
 2013: 77,
 999: 78,
 2015: 79,
 2204: 80,
 2070: 81,
 2018: 82,
 2042: 83,
 1011: 84,
 2032: 85,
 2062: 86,
 2342: 87,
 2465: 88,
 2082: 89,
 2038: 90,
 2477: 91,
 2068

In [41]:
for key in vocab:
    print(key)
    tokens = tokenizer.convert_ids_to_tokens(key)
    print(tokens)
    break

100
[UNK]


In [25]:
import pickle
with open('data.pickle', 'wb') as f:
    pickle.dump(TEXT.vocab.stoi, f)

In [26]:
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)

In [27]:
print(data)

defaultdict(None, {100: 0, 0: 1, 101: 2, 102: 3, 1045: 4, 1012: 5, 2000: 6, 1010: 7, 1005: 8, 1996: 9, 1998: 10, 2026: 11, 2008: 12, 1037: 13, 2009: 14, 2003: 15, 1997: 16, 1056: 17, 1999: 18, 2031: 19, 2033: 20, 2021: 21, 2061: 22, 2023: 23, 2005: 24, 2572: 25, 2022: 26, 1055: 27, 2428: 28, 2074: 29, 1049: 30, 2025: 31, 2066: 32, 2001: 33, 2006: 34, 2055: 35, 2079: 36, 2007: 37, 2123: 38, 2054: 39, 2012: 40, 2064: 41, 2035: 42, 2002: 43, 2138: 44, 2228: 45, 2024: 46, 2085: 47, 2131: 48, 2113: 49, 2016: 50, 2027: 51, 2065: 52, 2041: 53, 2183: 54, 2175: 55, 2051: 56, 2030: 57, 2017: 58, 2057: 59, 2039: 60, 2097: 61, 2004: 62, 2043: 63, 2215: 64, 2052: 65, 2045: 66, 1029: 67, 2111: 68, 2014: 69, 2028: 70, 2172: 71, 2514: 72, 2129: 73, 2157: 74, 2182: 75, 2092: 76, 2013: 77, 999: 78, 2015: 79, 2204: 80, 2070: 81, 2018: 82, 2042: 83, 1011: 84, 2032: 85, 2062: 86, 2342: 87, 2465: 88, 2082: 89, 2038: 90, 2477: 91, 2068: 92, 2200: 93, 2814: 94, 2205: 95, 4687: 96, 1000: 97, 2067: 98, 2188: 99

In [18]:
print(len(TEXT.vocab.stoi))

14095


In [15]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 1974
Number of validation examples: 247
Number of testing examples: 247


In [16]:
print(vars(train_data.examples[6]))

{'text': [4026, 5899, 3201, 6513, 5353, 5580, 5353, 7793, 2648, 5189, 15703, 3984, 15703, 5185, 6961, 5189, 2189, 2690, 2305, 18672, 2278, 20691, 24907, 2228, 2300, 2066, 2139, 3900, 2630, 11661, 2573, 2204, 25416, 8591, 6300, 2050, 3835, 3147, 3984, 2342, 2817, 3009, 4687, 3008, 3984, 3492, 2574, 8117, 3823, 2183, 3492, 3435, 4826, 5958, 7570, 11335, 2100, 2469, 7570, 11335, 2100, 11479, 4013, 10264, 2100, 9206, 2482, 29034, 2183, 2619, 2015, 2482, 9353, 4904, 28488, 4152, 20114, 2113, 11891, 2066, 3960, 20326, 13082, 5875, 3984, 2147, 2210, 2651, 2147, 6655, 2147, 11771, 3892, 2843, 2111, 4249, 2113, 2300, 16958, 2204, 6825, 2785, 5875, 2113, 2183, 2350, 11829, 4299, 3275, 2821, 26661, 6672, 2718, 2154, 26661, 6672, 4569, 2342, 2367, 7815, 2066, 26181, 4299, 4149, 2502, 4632, 5835, 5091, 2224, 4687, 2054, 2015, 2694, 2785, 2524, 3422, 22868, 2215, 8103, 7492, 3274, 2342, 4365, 13924, 2342, 15041, 7366, 19453, 2183, 2113, 2215, 2817, 3984, 2147, 3892, 10166, 12021, 5445, 4687, 9353, 3

In [17]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

print(tokens)

['traffic', 'austin', 'ready', 'girlfriend', 'weekend', 'glad', 'weekend', 'buses', 'outside', 'loud', 'annoying', 'guess', 'annoying', 'sri', '##ves', 'loud', 'music', 'middle', 'night', 'rot', '##c', 'marches', 'thirsty', 'think', 'water', 'like', 'de', '##ja', 'blue', 'container', 'works', 'good', 'ref', '##ill', 'ye', '##a', 'nice', 'cold', 'guess', 'need', 'study', 'spanish', 'wonder', 'parents', 'guess', 'pretty', 'soon', 'min', 'seconds', 'going', 'pretty', 'fast', 'tomorrow', 'friday', 'ho', '##rra', '##y', 'sure', 'ho', '##rra', '##y', 'spelled', 'pro', '##bal', '##y', 'dictionary', 'car', 'alarms', 'going', 'someone', '##s', 'car', 'ac', '##ut', '##ually', 'gets', 'robbed', 'know', 'suck', 'like', 'bob', 'marley', 'poster', 'interesting', 'guess', 'work', 'little', 'today', 'work', 'bet', 'work', 'boring', 'tonight', 'lot', 'people', 'fields', 'know', 'water', 'tastes', 'good', 'psychology', 'kind', 'interesting', 'know', 'going', 'major', 'bug', 'wish', 'figure', 'oh', 'mab'

In [18]:
#cEXT,cNEU,cAGR,cCON,cOPN
EXT.build_vocab(train_data)
NEU.build_vocab(train_data)
AGR.build_vocab(train_data)
CON.build_vocab(train_data)
OPN.build_vocab(train_data)

In [19]:
#cEXT,cNEU,cAGR,cCON,cOPN
print(EXT.vocab.stoi)
print(NEU.vocab.stoi)
print(AGR.vocab.stoi)
print(CON.vocab.stoi)
print(OPN.vocab.stoi)

defaultdict(None, {'y': 0, 'n': 1})
defaultdict(None, {'y': 0, 'n': 1})
defaultdict(None, {'y': 0, 'n': 1})
defaultdict(None, {'y': 0, 'n': 1})
defaultdict(None, {'y': 0, 'n': 1})


In [20]:
BATCH_SIZE = 128

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort=False)

In [21]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        # with torch.no_grad():
        #     embedded = self.bert(text)[0]

        embedded = self.bert(text)[0]
        
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [23]:
HIDDEN_DIM = 256
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,243,461 trainable parameters


In [25]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [26]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,761,221 trainable parameters


In [27]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [28]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [29]:
criterion = nn.BCEWithLogitsLoss()

In [30]:
model = model.to(device)
criterion = criterion.to(device)

In [31]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    
    # correct = torch.mean(correct, dim=1)
    acc = torch.mean(correct, dim=0)
    # acc = correct / len(correct)
    # print(acc)
    return acc

In [32]:
def train(model, iterator, optimizer, criterion):
    #cEXT,cNEU,cAGR,cCON,cOPN
    epoch_loss = 0
    epoch_acc_EXT = 0
    epoch_acc_NEU = 0
    epoch_acc_AGR = 0
    epoch_acc_CON = 0
    epoch_acc_OPN = 0
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        #cEXT,cNEU,cAGR,cCON,cOPN
        label = torch.stack([batch.EXT, batch.NEU, batch.AGR, batch.CON, batch.OPN], dim=1)
        # print(label.shape)
        loss = criterion(predictions, label)
        
        acc = binary_accuracy(predictions, label)
        # print(acc)
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
        epoch_acc_EXT += acc[0].item()
        epoch_acc_NEU += acc[1].item()
        epoch_acc_AGR += acc[2].item()
        epoch_acc_CON += acc[3].item()
        epoch_acc_OPN += acc[4].item()
        # epoch_acc += accs
        
    return epoch_loss / len(iterator), epoch_acc_EXT / len(iterator), epoch_acc_NEU / len(iterator), epoch_acc_AGR / len(iterator), epoch_acc_CON / len(iterator), epoch_acc_OPN / len(iterator)

In [33]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc_EXT = 0
    epoch_acc_NEU = 0
    epoch_acc_AGR = 0
    epoch_acc_CON = 0
    epoch_acc_OPN = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            
            predictions = model(batch.text).squeeze(1)
            
            label = torch.stack([batch.EXT, batch.NEU, batch.AGR, batch.CON, batch.OPN], dim=1)

            loss = criterion(predictions, label)
            
            acc = binary_accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc_EXT += acc[0].item()
            epoch_acc_NEU += acc[1].item()
            epoch_acc_AGR += acc[2].item()
            epoch_acc_CON += acc[3].item()
            epoch_acc_OPN += acc[4].item()
            # epoch_acc += accs
        
    return epoch_loss / len(iterator), epoch_acc_EXT / len(iterator), epoch_acc_NEU / len(iterator), epoch_acc_AGR / len(iterator), epoch_acc_CON / len(iterator), epoch_acc_OPN / len(iterator)

In [34]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
N_EPOCHS = 30

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    #cEXT,cNEU,cAGR,cCON,cOPN
    train_loss, train_acc_EXT, train_acc_NEU, train_acc_AGR, train_acc_CON, train_acc_OPN = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc_EXT, valid_acc_NEU, valid_acc_AGR, valid_acc_CON, valid_acc_OPN = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model/MP_train.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc EXT: {train_acc_EXT*100:.2f}% | Train Acc NEU: {train_acc_NEU*100:.2f}% | Train Acc AGR: {train_acc_AGR*100:.2f}% | Train Acc CON: {train_acc_CON*100:.2f}% | Train Acc OPN: {train_acc_OPN*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc EXT: {valid_acc_EXT*100:.2f}% |  Val. Acc NEU: {valid_acc_NEU*100:.2f}% |  Val. Acc AGR: {valid_acc_AGR*100:.2f}% |  Val. Acc CON: {valid_acc_CON*100:.2f}% |  Val. Acc OPN: {valid_acc_OPN*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 17s
	Train Loss: 0.710 | Train Acc EXT: 50.66% | Train Acc NEU: 50.79% | Train Acc AGR: 49.22% | Train Acc CON: 48.05% | Train Acc OPN: 53.51%
	 Val. Loss: 0.696 |  Val. Acc EXT: 50.21% |  Val. Acc NEU: 51.02% |  Val. Acc AGR: 46.07% |  Val. Acc CON: 48.92% |  Val. Acc OPN: 56.67%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 0.690 | Train Acc EXT: 53.42% | Train Acc NEU: 54.69% | Train Acc AGR: 52.33% | Train Acc CON: 53.84% | Train Acc OPN: 56.46%
	 Val. Loss: 0.718 |  Val. Acc EXT: 46.49% |  Val. Acc NEU: 51.71% |  Val. Acc AGR: 51.38% |  Val. Acc CON: 48.56% |  Val. Acc OPN: 50.12%
Epoch: 03 | Epoch Time: 0m 18s
	Train Loss: 0.687 | Train Acc EXT: 53.66% | Train Acc NEU: 55.93% | Train Acc AGR: 53.76% | Train Acc CON: 54.95% | Train Acc OPN: 57.37%
	 Val. Loss: 0.696 |  Val. Acc EXT: 54.68% |  Val. Acc NEU: 50.57% |  Val. Acc AGR: 46.55% |  Val. Acc CON: 51.89% |  Val. Acc OPN: 59.70%
Epoch: 04 | Epoch Time: 0m 18s
	Train Loss: 0.680 | Train Acc EXT: 54.97%

In [36]:
model.load_state_dict(torch.load('model/MP_train.pt'))

test_loss, test_acc_EXT, test_acc_NEU, test_acc_AGR, test_acc_CON, test_acc_OPN = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc EXT: {test_acc_EXT*100:.2f}% | Test Acc NEU: {test_acc_NEU*100:.2f}% | Test Acc AGR: {test_acc_AGR*100:.2f}% | Test Acc CON: {test_acc_CON*100:.2f}% | Test Acc OPN: {test_acc_OPN*100:.2f}%')

Test Loss: 0.691 | Test Acc EXT: 51.86% | Test Acc NEU: 51.26% | Test Acc AGR: 53.03% | Test Acc CON: 49.88% | Test Acc OPN: 64.21%


In [37]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor)).squeeze(0)
    # print(prediction)
    pred = [pred.item() for pred in prediction]
    # print(pred)
    #cEXT,cNEU,cAGR,cCON,cOPN
    print(f'EXT: {1-pred[0]:.3f}, NEU: {pred[1]:.3f}, AGR: {1-pred[2]:.3f}, CON: {1-pred[3]:.3f}, OPN: {1-pred[4]:.3f}')
    # return prediction.item()

In [38]:
#y,n,y,n,n
predict_sentiment(model, tokenizer, "Right now I'm sitting at my computer at home trying to decipher a way to buy a new truck. How can I find the right truck that's cool for me, is affordable, and will last without much repair. Well I guess the first thing to do would be to go around to dealerships and research prices and selection on the market. That sounds like a wonderful idea and tomorrow would be a perfect day to go out and do that with my dad, but there is one problem. Which is I am pledging a fraternity right now and should be up at the house tomorrow to be with my pledge brothers. This gets me right back into deciding what my priorties are. So far since school started I have dealt more with priorties in two weeks than ever in my life. I have had to be more definitive with decisions and stand behind them because they are so important now. I want to do so much stuff like join a frat, hang out with friends from high school, buy a truck, get good grades, meet people, and keep up with my girlfriend in New York. Now that's been the most pressing issue on my mind at all times. I got a girl that I'm in love with at school across the country and talk to her every day. Is it possible that I can expect our relationship to work out?  I know I want it to, I know I want to see her and have things the way they were, but is that something I can manage. Are we going to end up on bad terms because one of us is going to meet somebody and the other will feel left out, or can we trust each other and resist tempations and stay together. It just drives me crazy to think about what she is doing all the time. Then I start wondering about my best friend who went to Arizona and if he is meeting new people and finding different friends. I miss having the comfort of my girlfriend and best friend around because I feel I don't act like my true self around new people. I am not trying to be fake with anyone, but I can't be comfortable and completely open around new people and I don't like that. I just hope to keep in contact with all the people that mean a lot to me, and never sell them out to new friends because I would be preety upset if that ever happened to me. Even though I got lots of new friends in my fraternity, they can never eplace the ones that I spent my time with in high school and they will never share those same experiences.")


EXT: 0.540, NEU: 0.497, AGR: 0.718, CON: 0.528, OPN: 0.278


In [39]:
#n,y,n,y,y
predict_sentiment(model, tokenizer, "Stream of Consciousness that term reminds me so much of Virginia Woolf and my long semester studying her and Mrs. Dalloway. I can't even think about it without thinking of her and her crazy life and the craziness with how Mrs. Dalloway was written and it makes me so sad because her life was just so sad. I feel so bad for her. I wonder how fast twenty minutes really is I bet it may end up seeming forever and this writing will go on forever but I guess it is kind of interesting I just am worried my hands will begin to hurt from all of this typing. I am so glad that it is Labor Day weekend and I am going to get a long weekend to catch up on a few things like sleep and fun and I guess some studying too although that won't be too fun so that will actually take away from some of the fun goal. it is so interesting how I am sitting here all alone in my room and in fact most of the time it gets kind of lonely but sitting her and I some way expressing my thoughts instead of just listening to them and doing nothing makes it not so lonely and for once it is actually quiet in the dorm so that I can concentrate on just about anything. I hope this year goes well because I think that it really needs to be a good semester and that it will progress to a good year I if The semester goes well. I'm getting kind of sad now that because I am leaving town tomorrow I am going to miss the first Football which will be my first Texas football game and I have heard they are a great experience and a ton of fun, but at least there are plenty of other games that I can attend. My hand is really beginning to hurt that is really pretty sad considering I haven't even typed for 10 minutes. I wonder if I am missing out on anything while I am sitting her typing this but I guess it isn't that big of a deal I don't think there are many people her anyway and I don't know where anyone is. I was thinking about how when I graduated from high school that I would be balling my eyes out but I didn't cry then I thought I would cry when My parents left me here all alone. but the weird thing is I haven't cried or anything yet I haven't even gotten homesick. I mean let's face it isn't like I would have anything spectacular to do even if I was home so I guess that helps in the homesickness department. It is so cold in my room I am absolutely freezing and There is no way in which to control the air, I need gloves and winter boots. I really think that people need to something about the coolness of the buildings I mean I know it is hot outside and all but because it is hot outside students want to wear shorts and other things like that to keep cool when out in the heat, but then you step in a building and dorms are the absolute worst of all and you freeze to death like I am surprised I haven't gotten frost bite from these frigid places. Well the time has actually gone faster than I expected it to it hasn't been all that bad once I got over the whole Virginia Woolf suicide and her book that I had to suffer through and analyze. It has actually been kind of fun I mean all I have been doing is saying what is on my mind and how I am feeling and that isn't so hard as long as you know that not everyone in the world is going to be reading what you write because personally I Don't' want everyone in the world to know how I feel all the time it is pretty personal for the most part. I am really excited about getting to hang out at the lake this weekend and being able to relax and have food other than from the cafeteria not that the food is all that bad but a homecooked meal is the absolute best. that is real comfort food. Wow I am beginning to feel hungry now but I don't know for what also I really need these twenty minutes to over so that I can run and go get a drink I am really thirsty. oh my gosh my hands are like icicles between the pain from typing and the temperature of the room my hands are really beginning to suffer. but hey I don't have to type for too much longer.")

EXT: 0.483, NEU: 0.475, AGR: 0.640, CON: 0.413, OPN: 0.441


In [40]:
predict_sentiment(model, tokenizer, "I have a vivid imagination.") #OPENNESS

EXT: 0.478, NEU: 0.449, AGR: 0.583, CON: 0.386, OPN: 0.521


In [41]:
predict_sentiment(model, tokenizer, "I am exacting in my work.") #CONSCIENTIOUSNESS

EXT: 0.505, NEU: 0.509, AGR: 0.499, CON: 0.409, OPN: 0.621


In [42]:
predict_sentiment(model, tokenizer, "I am the life of the party.") #EXTRAVERSION

EXT: 0.618, NEU: 0.405, AGR: 0.681, CON: 0.580, OPN: 0.285


In [43]:
predict_sentiment(model, tokenizer, "I am not interested in other people's problems.") #AGREEABLENESS

EXT: 0.534, NEU: 0.377, AGR: 0.588, CON: 0.407, OPN: 0.510


In [44]:
predict_sentiment(model, tokenizer, 'I get irritated easily.') #NEUROTICISM

EXT: 0.492, NEU: 0.467, AGR: 0.621, CON: 0.400, OPN: 0.404
