<a href="https://colab.research.google.com/github/jbajaj1/faketweets/blob/master/faketweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.utils.data as data

from torch.utils.data import TensorDataset, DataLoader

In [2]:
def load_tweets(filename, initVoc=False):
    X = []
    y = []
    r = open(filename, 'r')
    for line in r:
        line = line.split()
        y.append(line[0])
        X.append(line[1:])

    tokenizedTweets = []
    tokenizedLabels = []
    labelDic = {"negative":0, "neutral":1, "positive":2}
    if initVoc:
        for t in X:
            twitterVoc.add_sentence(t)
    for l in y:
        tokenizedLabels.append(labelDic[l])
    for t in X:
        tokenizedTweets.append(twitterVoc.sentence_to_vec(t))

    tokenizedTweets = torch.LongTensor(tokenizedTweets)
    tokenizedLabels = torch.LongTensor(tokenizedLabels)

    return tokenizedTweets, tokenizedLabels

In [3]:
class Vocab:


    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "UNK"}
        self.num_words = 2
        self.num_sentences = 0
        self.longest_sentence = 0
        self.unknown_count = 0


    def add_word(self, word):
        word = word.lower()
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1


    def add_sentence(self, sentence):
        sentence_len = 0
        #print(sentence)
        for word in sentence:
            sentence_len += 1
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        if word not in self.word2index:
            #print("Unknown word:", word)
            self.unknown_count += 1
            return 1
        return self.word2index[word]

    def sentence_to_vec(self, sentence):
        vec = []
        for word in sentence:
            word = word.lower()
            vec.append(self.to_index(word))
        while len(vec) < self.longest_sentence:
            vec.append(0)
        return vec

In [11]:
class LSTM(torch.nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = torch.nn.Dropout(dropout)

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.rnn = nn.LSTM(self.embedding_size, self.hidden_size, self.num_layers, batch_first=True)
        self.output = nn.Linear(self.hidden_size, 3)



    def forward(self, X):
        emb = self.embedding(X)
        emb = self.dropout(emb)

        hidden_states, _ = self.rnn(emb)

        numMask = (X != 0).float()
        mask = (X != 0).float().unsqueeze(-1).expand(hidden_states.size())
        hidden_states = (hidden_states*mask).sum(-2)/numMask.sum(-1).unsqueeze(-1)
        hidden_states = self.dropout(hidden_states)

        output_dist = self.output(hidden_states)

        return output_dist


In [12]:
def validate(expected, predictions):
    '''
    totdiff = 0
    numdiff = 0
    counter = 0
    for i in expected:
        diff = abs(i-predictions[counter])
        totdiff += diff
        if diff != 0:
            numdiff += 1
        counter += 1
    return totdiff, numdiff
    '''
    return confusion_matrix(expected, predictions)

In [13]:
twitterVoc = Vocab("twitter")

#Put proper location of file here
tokenizedTweets, tokenizedLabels = load_tweets("../twitter_sentiment/semeval_train.txt",  initVoc=True)

print(twitterVoc.to_word(4))
print(twitterVoc.to_index("this"))

print(twitterVoc.num_words)

my
82
33059


In [14]:
ourLSTM = LSTM(twitterVoc.num_words, 64, 64)

In [15]:
##################
#####Training#####
##################

opt = torch.optim.Adam(ourLSTM.parameters(), lr=.1)
loss = torch.nn.CrossEntropyLoss()
epochs = 100
dataset = DataLoader(TensorDataset(tokenizedTweets, tokenizedLabels), batch_size=100)
for i in range(epochs):
    print("Training on epoch", i)
    for batchidx, (x, y) in enumerate(dataset):
        opt.zero_grad()
        outputs = ourLSTM(x)
        lossVal = loss(outputs, y)
        lossVal.backward()
        opt.step()

Training on epoch 0
Training on epoch 1
Training on epoch 2
Training on epoch 3
Training on epoch 4
Training on epoch 5
Training on epoch 6
Training on epoch 7
Training on epoch 8
Training on epoch 9
Training on epoch 10
Training on epoch 11
Training on epoch 12
Training on epoch 13
Training on epoch 14
Training on epoch 15
Training on epoch 16
Training on epoch 17
Training on epoch 18
Training on epoch 19
Training on epoch 20
Training on epoch 21
Training on epoch 22
Training on epoch 23
Training on epoch 24
Training on epoch 25
Training on epoch 26
Training on epoch 27
Training on epoch 28
Training on epoch 29
Training on epoch 30
Training on epoch 31
Training on epoch 32
Training on epoch 33
Training on epoch 34
Training on epoch 35
Training on epoch 36
Training on epoch 37
Training on epoch 38
Training on epoch 39
Training on epoch 40
Training on epoch 41
Training on epoch 42
Training on epoch 43
Training on epoch 44
Training on epoch 45
Training on epoch 46
Training on epoch 47
Tr

In [16]:
##################
#####Predict######
####Evaluation####
##################


predVal = ourLSTM(tokenizedTweets).argmax(dim=-1)

print("Results for Train Data:\n", validate(tokenizedLabels, predVal))


filelist = ["Twitter2013_raw.txt", "Twitter2014_raw.txt", "Twitter2015_raw.txt", "Twitter2016_raw.txt"]

for file in filelist:
    #Update file location here
    tokTestTweets, tokTestLabels = load_tweets("../twitter_sentiment/" + file)
    predVal = ourLSTM(tokTestTweets).argmax(dim=-1)
    print("Results for", file, "\n", validate(tokTestLabels, predVal))




print("Num Unknown Words:", twitterVoc.unknown_count)
#Test commit for github 3

Results for Train Data:
 [[1225   17   23]
 [  29 4015   65]
 [  16   15 3199]]
Results for Twitter2013_raw.txt 
 [[125 243 233]
 [117 897 625]
 [110 495 967]]
Results for Twitter2014_raw.txt 
 [[ 52  78  72]
 [ 49 348 272]
 [ 54 306 621]]
Results for Twitter2015_raw.txt 
 [[ 89 128 147]
 [ 98 527 362]
 [ 85 342 611]]
Results for Twitter2016_raw.txt 
 [[ 770 1237 1224]
 [1207 5109 4025]
 [ 545 2427 4087]]
Num Unknown Words: 104225
