In [1]:
!pip install torchtext==0.4.0



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# New Stuff
from torchtext.data import Field, TabularDataset, BucketIterator, LabelField
import spacy
import torch

torch.backends.cudnn.deterministic = True
print(torch.cuda.device_count())

# spacy_en = spacy.load('en')

#     return [token.text for token in spacy_en.tokenizer(text)]


#By default sequential = True, use_vocab = True, no need to include these parameters. See here for explanation of
#these parameters: https://torchtext.readthedocs.io/en/latest/data.html#fields
title = Field(tokenize='spacy', lower=True)
text = Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths = True, lower=True)
label = LabelField(dtype=torch.float)

fields = {'text': ('text', text), 'label': ('label', label), 'title': ('title', title)}

1


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
ls

[0m[01;34mdrive[0m/  LSTM-Model.pt  news.csv  news_short.csv  [01;34msample_data[0m/


In [6]:
train_data = TabularDataset.splits(
    path='',
    train='news.csv',
    # validation='news.csv',
    # test='news.csv',
    format='csv',
    fields=fields)[0]

print(type(train_data))

print("Num of training: ", len(train_data))
# print("Num of validation: ", len(validation_data))
# print("Num of testing: ", len(test_data))

<class 'torchtext.data.dataset.TabularDataset'>
Num of training:  6335


In [7]:
%ls

[0m[01;34mdrive[0m/  LSTM-Model.pt  news.csv  news_short.csv  [01;34msample_data[0m/


In [8]:
train_data, validation_data = train_data.split(split_ratio=0.65)
validation_data, test_data = validation_data.split(split_ratio=0.5)
print("Num of training: ", len(train_data))
print("Num of validation: ", len(validation_data))
print("Num of testing: ", len(test_data))

Num of training:  4118
Num of validation:  1108
Num of testing:  1109


In [9]:
vars(train_data[0])
# print(train_data[0])

{'label': 'REAL',
 'text': ['to',
  'understand',
  'what',
  'ails',
  'hillary',
  'clinton',
  ',',
  'let',
  '’s',
  'rewind',
  'past',
  'iowa',
  'and',
  'new',
  'hampshire',
  '–',
  'two',
  'years',
  'back',
  ',',
  'in',
  'fact',
  ',',
  'to',
  'a',
  'speech',
  'in',
  'new',
  'orleans',
  'before',
  'the',
  'national',
  'auto',
  'dealers',
  'association',
  'and',
  'these',
  'words',
  ':',
  '\n\n',
  '“',
  'the',
  'last',
  'time',
  'i',
  'actually',
  'drove',
  'a',
  'car',
  'myself',
  'was',
  '1996',
  '.',
  'i',
  'remember',
  'it',
  'very',
  'well',
  '.',
  'unfortunately',
  ',',
  'so',
  'does',
  'the',
  'secret',
  'service',
  ',',
  'which',
  'is',
  'why',
  'i',
  'have',
  "n't",
  'driven',
  'since',
  'then',
  '.',
  '”',
  '\n\n',
  'that',
  'one',
  'passage',
  'underscores',
  'three',
  'of',
  'clinton',
  '’s',
  'present',
  '-',
  'day',
  'woes',
  ':',
  'she',
  '’s',
  'lived',
  'in',
  'a',
  'cocoon',
  

In [10]:
MAX_VOCAB_SIZE = 25_000

text.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
title.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
label.build_vocab(train_data)

In [11]:
BATCH_SIZE = 64

device = torch.device('cuda')

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: x.title,
    sort_within_batch = True,
    # sort=False,
    device = device)

In [12]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                    bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional=bidirectional, dropout = dropout )
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        
        #print(text.size())
        #text = [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        # embedded = [sent len, batch size, emb dim]
        
        # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        # unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout

        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))

        #hidden = [batch size, hid dim * num directions]

        return self.fc(hidden)

In [13]:
INPUT_DIM = len(text.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = text.vocab.stoi[text.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)


In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [15]:
pretrained_embeddings = text.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [16]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.6775,  0.6508, -1.4881,  ..., -0.3913, -0.3616,  0.1976],
        [ 0.9480, -0.0378,  0.0487,  ..., -0.9490, -1.3723,  0.6721],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3932, -0.1523,  0.3094,  ...,  0.1689,  0.7236, -0.4796],
        [ 0.3495, -0.0160, -0.3485,  ..., -0.2744, -0.6759,  0.2178],
        [-0.2805,  0.1509,  0.9577,  ...,  0.7749,  0.1199, -0.6552]])

In [17]:
UNK_IDX = text.vocab.stoi[text.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3932, -0.1523,  0.3094,  ...,  0.1689,  0.7236, -0.4796],
        [ 0.3495, -0.0160, -0.3485,  ..., -0.2744, -0.6759,  0.2178],
        [-0.2805,  0.1509,  0.9577,  ...,  0.7749,  0.1199, -0.6552]])


In [18]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [19]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [20]:
def binary_accuracy(preds, label):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == label).float()
    acc = correct.sum()/len(correct)
    return acc

In [21]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        text, text_lengths = batch.text
        # print("-------batch.text is: -------")
        # print(batch.text)
        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [22]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text

            predictions = model(text, text_lengths).squeeze(1)

            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [23]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elasped_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elasped_mins * 60))

    return elasped_mins, elapsed_secs

In [24]:
N_EPOCHS = 5

best_validation_loss = float('inf')

In [25]:
#%%pixie_debugger
for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    validation_loss, validation_acc = evaluate(model, validation_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        torch.save(model.state_dict(), 'LSTM-Model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\Validation Loss: {validation_loss:.3f} | Validation Acc: {validation_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 20s
	Train Loss: 0.413 | Train Acc: 81.18%
\Validation Loss: 0.145 | Validation Acc: 93.89%
Epoch: 02 | Epoch Time: 1m 20s
	Train Loss: 0.140 | Train Acc: 94.62%
\Validation Loss: 0.147 | Validation Acc: 94.43%
Epoch: 03 | Epoch Time: 1m 20s
	Train Loss: 0.170 | Train Acc: 93.65%
\Validation Loss: 0.188 | Validation Acc: 95.75%
Epoch: 04 | Epoch Time: 1m 20s
	Train Loss: 0.132 | Train Acc: 94.95%
\Validation Loss: 0.151 | Validation Acc: 94.41%
Epoch: 05 | Epoch Time: 1m 20s
	Train Loss: 0.101 | Train Acc: 96.49%
\Validation Loss: 0.114 | Validation Acc: 96.61%


In [26]:
#'''
model.load_state_dict(torch.load('LSTM-Model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
#'''

Test loss: 0.087 | Test Acc: 96.70%


In [31]:

import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [text.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()



In [35]:
#'''
user_inputted_text = input("Copy Paste the Text of an Article and we will predict if it is fake or real:")
predict_sentiment(model, user_inputted_text)
#'''

Copy Paste the Text of an Article and we will predict if it is fake or real:Junior Congresswoman Alexandra Ocasio-Cortez has made a lot of waves recently with her Green New Deal nonsense and child-like proposals and speeches.  Well, it’s no different this afternoon as the political pixie announced her intention to draft legislation banning motorcycles from use in the United States of America.   Both Clay and Jax Teller take time off from their busy schedule of runnin’ guns, lovin’ women, and threatening Henry Rollins to address the issue. The Senorita of Socialism threw out all manner of statistics regarding deadly accidents and injuries, relaxed traffic rules and tolls for bikers, as well as a not-so-veiled jab at a core demographic of President Trump’s supporters :  “Besides like, what I just said?  A lot of these like, motorcycle people, okay, they’re like : ‘Ooh, look at me, I’m all old and fat and tough and I voted for Trump and smell like wet dog.’  And I’m supposed to slow my Pr

0.9954521059989929