## Sentiment Analysis

In [1]:
import torchtext
from torchtext import data
import pandas as pd
import spacy

In [2]:
tweetsDF = pd.read_csv('trainingandtestdata/training.1600000.processed.noemoticon.csv', engine = 'python', header = None)

In [3]:
tweetsDF.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
tweetsDF[0].value_counts()

4    800000
0    800000
Name: 0, dtype: int64

In [5]:
tweetsDF['sentiment_cat'] = tweetsDF[0].astype('category')

In [6]:
tweetsDF.head(5)

Unnamed: 0,0,1,2,3,4,5,sentiment_cat
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


In [7]:
tweetsDF['sentiment_cat'].value_counts()

4    800000
0    800000
Name: sentiment_cat, dtype: int64

In [8]:
tweetsDF['sentiment'] = tweetsDF['sentiment_cat'].cat.codes

In [9]:
tweetsDF.head(5)

Unnamed: 0,0,1,2,3,4,5,sentiment_cat,sentiment
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0,0


In [10]:
tweetsDF['sentiment'].value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

In [13]:
tweetsDF.to_csv('trainingandtestdata/train-processed.csv', header = None, index = None)

In [14]:
tweetsDF.sample(1000).to_csv('trainingandtestdata/train-processed-sample.csv', header = None, index = None)

In [11]:
LABEL = data.LabelField()
TWEET = data.Field(tokenize = 'spacy', lower = True)

In [12]:
fields = [('score', None), ('id', None), ('date', None), ('query', None), ('name', None),
         ('tweet', TWEET), ('category', None), ('label', LABEL)]

In [15]:
twitterDataset = torchtext.data.TabularDataset(path = 'trainingandtestdata/train-processed.csv',
                                              format = 'CSV',
                                              fields = fields,
                                              skip_header = False)

In [16]:
(train, test, valid) = twitterDataset.split(split_ratio = [0.8, 0.1, 0.1])

In [17]:
print(len(train), len(test), len(valid))

1280000 160000 160000


In [18]:
vars(train.examples[7])

{'label': '0',
 'tweet': ['@lexiloohoo',
  'i',
  "'m",
  'sorry',
  ' ',
  'today',
  "'s",
  'been',
  'nothing',
  'but',
  'a',
  'fucking',
  'heartbreak']}

In [20]:
vars(train.examples[9])

{'label': '1',
 'tweet': ['@aaron_vail',
  'i',
  'know',
  '!',
  'that',
  "'s",
  'where',
  'i',
  'got',
  'your',
  'twitter',
  'page',
  '!']}

In [21]:
vars(train.examples[11])

{'label': '0',
 'tweet': ['spent',
  'the',
  'evening',
  'arguing',
  'with',
  'charter',
  'tech',
  'support',
  '.',
  'what',
  'a',
  'fun',
  'night',
  'this',
  'has',
  'been']}

In [22]:
vocab_size = 20000
TWEET.build_vocab(train, max_size = vocab_size)

In [23]:
len(TWEET.vocab)

20002

In [24]:
TWEET.vocab.freqs.most_common(10)

[('i', 799856),
 ('!', 724163),
 ('.', 646990),
 (' ', 469690),
 ('to', 452340),
 ('the', 417119),
 (',', 386289),
 ('a', 304882),
 ('my', 252995),
 ('it', 243037)]

In [25]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train, valid, test), batch_size = 32)

### Create model

In [34]:
import torch.nn as nn
import torch.optim as optim

In [35]:
class OurFirstLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(OurFirstLSTM, self).__init__()
        
        self.embeddding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_size,
                              num_layers = 1)
        self.predictor = nn.Linear(hidden_size, 2)
        
    
    def forward(self, seq):
        output, (hidden, _) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds

In [36]:
model = OurFirstLSTM(100, 300, 20002)

In [37]:
optimizer = optim.Adam(model.parameters(), lr = 2e-2)
criterion = nn.CrossEntropyLoss()

In [38]:
def train(model, optimizer, criterion, train_iterator, valid_iterator, num_epochs):
    for epoch in range(num_epochs):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        
        for batch_idx, batch in enumerate(train_iterator):
            optimizer.zero_grad()
            preds = model(batch.tweet)
            loss = criterion(preds, batch.label)
            loss.backward()
            optimizer.step()
            training_loss += loss.item() * batch.tweet.size(0)
        training_loss /= len(train_iterator)
        
        model.eval()
        for batch_idx, batch in enumerate(valid_iterator):
            preds = model(batch.tweet)
            loss = criterion(preds, batch.label)
            valid_loss += loss.item() * batch.tweet.size(0)
        valid_loss /= len(valid_iterator)
        print('Epoch {}, Training loss: {:.2f}, Validation loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [40]:
train(model, optimizer, criterion, train_iterator, valid_iterator, 10)

AttributeError: 'LabelField' object has no attribute 'vocab'