In [1]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
import torch.nn as nn

In [5]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

def remove_links(text):
    wnl=WordNetLemmatizer()
    text = re.sub(r'\d+', "", text)
    text = re.sub('http://\S+|https://\S+', '', text)
    emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  
                        u"\U0001F680-\U0001F6FF"  
                        u"\U0001F1E0-\U0001F1FF"  
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = re.sub(r'@\w+',  '', text).strip()
    text = re.sub("[^a-zA-Z0-9 ']", "", text)
    text=' '.join([wnl.lemmatize(i) for i in text.lower().split()])
    
    return text

In [3]:
disaster_df = pd.read_csv('train.csv')

In [24]:
disaster_df['text'] = disaster_df['text'].apply(remove_links)

In [7]:
all_text = ""

In [25]:
for text in disaster_df['text']:
    all_text += text
    all_text += "\n"

In [26]:
tweets_split = all_text.split("\n")
all_text = ' '.join(tweets_split)
words = all_text.split()

In [28]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)

## Build a dictionary that maps words to integers
vocab_to_int = None
tweets_to_int = []

vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

reviews_ints = []
vocab_to_int
## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
for tweets in disaster_df['text']:
    tweets_to_int.append([vocab_to_int[word] if word in vocab_to_int else vocab_to_int['unk'] for word in tweets.split() ])
    

In [30]:
vocab_to_int['unk'] = 14375

In [16]:
tweet_lens = Counter([len(x) for x in tweets_to_int])
print("Zero-length reviews: {}".format(tweet_lens[0]))
print("Maximum review length: {}".format(max(tweet_lens)))

Zero-length reviews: 0
Maximum review length: 31


In [32]:
tweet_labels = list(disaster_df['target'])

In [33]:
tweet_labels = np.asarray(tweet_labels)

In [34]:
def pad_features(tweets_to_int, seq_length=200):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    for idx, tweets in enumerate(tweets_to_int):
        if(len(tweets)>seq_length):
            tweets_to_int[idx] = tweets[:seq_length]
        else:
            pad_zeros = [0 for i in range(seq_length - len(tweets))]
            if(pad_zeros != []):
                tweets_to_int[idx] = pad_zeros + tweets
    ## implement function
    
    features=tweets_to_int
    
    return features

In [35]:
features = pad_features(tweets_to_int, seq_length=200)
features = np.asarray(features)
## test statements - do not change - ##
assert len(features)==len(tweets_to_int), "Your features should have as many rows as reviews."
assert len(features[0])==200, "Each feature row should contain seq_length values."

# print first 10 values of the first 30 batches 
features[:30,:]

array([[   0,    0,    0, ..., 4030,   39,   38],
       [   0,    0,    0, ..., 5900, 5901, 1326],
       [   0,    0,    0, ...,  437,   21, 1062],
       ...,
       [   0,    0,    0, ...,  112,   12, 2243],
       [   0,    0,    0, ...,    0,    0, 5914],
       [   0,    0,    0, ...,   11,   24, 4036]])

In [36]:
features[1:,:].shape

(7612, 200)

In [39]:
train_x, train_y = features[3:,:], tweet_labels[3:]

In [40]:
train_x.shape

(7610, 200)

In [41]:
train_y.shape

(7610,)

In [42]:
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))

# dataloaders
batch_size = 10

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)


In [43]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([10, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,  112,    9,  177],
        [   0,    0,    0,  ...,  101,    2,  456],
        [   0,    0,    0,  ...,  291,  992,  231],
        ...,
        [   0,    0,    0,  ..., 2147, 1208,  509],
        [   0,    0,    0,  ...,  226,    2,  324],
        [   0,    0,    0,  ...,  316,    1,   78]])

Sample label size:  torch.Size([10])
Sample label: 
 tensor([0, 0, 1, 1, 0, 0, 1, 0, 1, 1])


In [44]:
import torch.nn as nn

class TweetClassifier(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(TweetClassifier, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        # define all layers
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        embed_out = self.embedding(x)
        lstm_out, hidden = self.rnn(embed_out, hidden)
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden

In [45]:
vocab_size = len(vocab_to_int) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = TweetClassifier(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(model)

TweetClassifier(
  (embedding): Embedding(14377, 400)
  (rnn): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [47]:
# loss and optimization functions
lr=0.003

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [48]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

No GPU available, training on CPU.


In [49]:
epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    model.cuda()

model.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = model.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        model.zero_grad()

        # get the output from the model
        output, h = model(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()))

Epoch: 1/4... Step: 100... Loss: 0.633264...
Epoch: 1/4... Step: 200... Loss: 0.308938...
Epoch: 1/4... Step: 300... Loss: 0.430902...
Epoch: 1/4... Step: 400... Loss: 0.434198...
Epoch: 1/4... Step: 500... Loss: 0.507837...
Epoch: 1/4... Step: 600... Loss: 0.408614...
Epoch: 1/4... Step: 700... Loss: 0.490468...
Epoch: 2/4... Step: 800... Loss: 0.426665...
Epoch: 2/4... Step: 900... Loss: 0.134754...
Epoch: 2/4... Step: 1000... Loss: 0.347215...
Epoch: 2/4... Step: 1100... Loss: 0.570327...
Epoch: 2/4... Step: 1200... Loss: 0.581874...
Epoch: 2/4... Step: 1300... Loss: 0.638136...
Epoch: 2/4... Step: 1400... Loss: 0.660555...
Epoch: 2/4... Step: 1500... Loss: 0.089418...
Epoch: 3/4... Step: 1600... Loss: 0.200989...
Epoch: 3/4... Step: 1700... Loss: 0.034462...
Epoch: 3/4... Step: 1800... Loss: 0.340015...
Epoch: 3/4... Step: 1900... Loss: 0.172357...
Epoch: 3/4... Step: 2000... Loss: 0.788657...
Epoch: 3/4... Step: 2100... Loss: 0.094335...
Epoch: 3/4... Step: 2200... Loss: 0.344535.

In [50]:
def preprocessing_pipeline(df,vocab_to_int):
    tweets_to_int = []
    ## use the dict to tokenize each review in reviews_split
    ## store the tokenized reviews in reviews_ints
    df['text'] = df['text'].apply(remove_links)
    for tweets in df['text']:
        tweets_to_int.append([vocab_to_int[word] if word in vocab_to_int else vocab_to_int['unk'] for word in tweets.split()])
    features = pad_features(tweets_to_int, seq_length=200)
    features = np.asarray(features)
    return features
    

In [51]:
def predict(model, features, sequence_length=200):
    ''' Prints out whether a give review is predicted to be 
        positive or negative in sentiment, using a trained model.
        
        params:
        net - A trained net 
        test_review - a review made of normal text and punctuation
        sequence_length - the padded length of a review
        '''
    model.eval()
    feature_tensor = torch.from_numpy(features)
    feature_tensor = feature_tensor.reshape(1,-1)
    batch_size = feature_tensor.size(0)
    print(batch_size)
    h = model.init_hidden(batch_size)
    output, h = model(feature_tensor, h)
    pred = torch.round(output.squeeze())
    
    return pred
        

In [52]:
test_pred = {}
disaster_test_df = pd.read_csv('test.csv')

In [53]:
tensor_o = torch.from_numpy(features[0])

In [54]:
tensor_o.reshape(1,-1).shape

torch.Size([1, 200])

In [55]:
tensor_o.shape

torch.Size([200])

In [56]:
disaster_test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [57]:
len(vocab_to_int)

14376

In [58]:
features = preprocessing_pipeline(disaster_test_df, vocab_to_int)

In [59]:
features.shape

(3263, 200)

In [60]:
len(features)

3263

In [61]:
outputs = []

In [62]:
predict(model,features[0])

1


tensor(1., grad_fn=<RoundBackward>)

In [63]:
for i in range(len(features)):
    output = predict(model,features[i])
    outputs.append(int(output.item()))
    

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [64]:
submission = pd.DataFrame({'id':disaster_test_df['id'],'target':outputs})

In [65]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [66]:
submission.to_csv("submissions.csv",index=False)

In [67]:
submission['target'].value_counts()

0    1936
1    1327
Name: target, dtype: int64