# Cleaning Depression Related Tweets

### Importing Extracted Data 

In [1]:
import pandas as pd

In [2]:
# Load the four depression-related CSV exports.
dep_sources = ("dep_sadness.csv", "dep_depressed.csv", "dep_loneliness.csv", "dep_depression.csv")
df_1, df_2, df_3, df_4 = (pd.read_csv(csv_path) for csv_path in dep_sources)

# Stack the frames row-wise and renumber the index from 0.
df = pd.concat([df_1, df_2, df_3, df_4], axis='rows', ignore_index=True)

print("Shape:", df.shape)
print("Unique tweets:", df['Tweet Id'].nunique())

Shape: (22004, 9)
Unique tweets: 21621


### Dropping/removing duplicate tweets.

In [3]:
# Keep only the first row seen for every tweet id.
df = df.drop_duplicates(subset="Tweet Id", keep="first")
print("Shape:", df.shape)

Shape: (21621, 9)


In [4]:
# Preview the first five rows of the de-duplicated depression tweets.
df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Username,Text,Language,URL,Mention,Hashtags
0,0,2020-12-30 23:05:25+00:00,1344419347524165638,seesawlesbian,“bts is a trend” #bts #btsarmy #btsbts #hashta...,en,https://twitter.com/seesawlesbian/status/13444...,,"['bts', 'btsarmy', 'btsbts', 'hashtag', 'kpop'..."
1,1,2020-12-30 22:51:23+00:00,1344415815249113090,That_Guy_Crash,What it feels like to lose in Mario Kart #mari...,en,https://twitter.com/That_Guy_Crash/status/1344...,,"['mariokart', 'mariokart8', 'loser', 'lost', '..."
2,2,2020-12-30 22:48:18+00:00,1344415040653422593,MIDAGEDRUNNER,@Lunker58Steele #sadness,und,https://twitter.com/MIDAGEDRUNNER/status/13444...,"[User(username='Lunker58Steele', id=707050254,...",['sadness']
3,3,2020-12-30 22:39:13+00:00,1344412753314902022,MBCharacter,https://t.co/X0T941sg7a\n#help #covid_19 #isol...,und,https://twitter.com/MBCharacter/status/1344412...,,"['help', 'covid_19', 'isolation', 'sadness', '..."
4,4,2020-12-30 21:58:28+00:00,1344402501307564032,rebeccajchaney,"Like everyone, I have so many memories from th...",en,https://twitter.com/rebeccajchaney/status/1344...,"[User(username='rebeccajchaney', id=2842089636...","['memory', 'reflection', 'Reflection2020', 'ca..."


The scraper captured the URL of each tweet itself, not the URLs that appear inside the tweet text. We therefore drop the "URL" column and will have to detect URLs from the text instead.

In [5]:
# 'URL' is the tweet's own link and 'Unnamed: 0' looks like a saved row index;
# neither is used further on.
df = df.drop(columns=['Unnamed: 0', 'URL'])
print("Shape:", df.shape)

Shape: (21621, 7)


## Filtering Data

### Dropping tweets that are not in English language.

In [6]:
# Keep English tweets only. The language code is compared exactly: a substring
# test would also accept any other code that merely contains "en", and `.eq`
# treats a missing language as "not English" instead of producing a NaN mask.
df = df[df['Language'].eq('en')]
print("Shape:", df.shape)

Shape: (17112, 7)


### Remove entries containing URLs, they may be promotional tweets.

In [7]:
# `s` is kept as a module-level name because a later cell reuses it.
s = ['https:', 'Https:']
link_pattern = '|'.join(s)

# Drop every tweet whose text matches one of the link prefixes.
has_link = df.Text.str.contains(link_pattern)
df = df[~has_link]

print("Shape:", df.shape)

Shape: (6333, 7)


### Remove entries with at (i.e. @) mentions.

In [8]:
# Keep only rows whose 'Mention' value is missing, i.e. tweets without @-mentions.
no_mentions = df['Mention'].isna()
df = df.loc[no_mentions]
df.shape

(4029, 7)

In [9]:
# Inspect the first 15 tweets that survived the language, URL and mention filters.
df.head(15)

Unnamed: 0,Datetime,Tweet Id,Username,Text,Language,Mention,Hashtags
23,2020-12-30 17:30:10+00:00,1344334980252655616,nobleregulus,not a single comment on my recent fic yet.. 😖😣...,en,,['sadness']
28,2020-12-30 15:28:41+00:00,1344304406536269825,mariasophiemegn,its so sad when u take down all the christmas ...,en,,"['sad', 'sadness', 'Christmas', 'christmasisov..."
30,2020-12-30 14:05:12+00:00,1344283398177779714,ClosetsWidows,"If love could have saved you, I know you would...",en,,"['grief', 'sadness', 'widowed', 'widows', 'dea..."
33,2020-12-30 12:42:32+00:00,1344262594694172672,BookSnip,too much happiness always overflowed into tear...,en,,"['happiness', 'sadness', 'crying']"
44,2020-12-30 06:55:25+00:00,1344175239358341120,Isla_Plastic18,Sometimes it takes a little bit of sadness to ...,en,,"['Islabot', 'Sadness', 'Happiness']"
46,2020-12-30 05:49:04+00:00,1344158544484941824,linzmwilliams,Sad that friendships are fractured because of ...,en,,"['friendship', 'sadness', 'COVID19', 'scared']"
49,2020-12-30 04:55:55+00:00,1344145166165131266,ariadnedreams,I have an IV drip of sorrow \nI keep thinking ...,en,,"['poem', 'sadness']"
51,2020-12-30 04:40:52+00:00,1344141380235489287,KlevesAnna,Good night.....Good bye.....both start with go...,en,,"['sadness', 'healing', 'broken']"
58,2020-12-30 03:10:41+00:00,1344118682415366144,letra_at_musika,"One day before New Year's Eve, everything turn...",en,,"['letraatmusika', 'music', 'poetry', 'musika',..."
60,2020-12-30 02:51:00+00:00,1344113732792807424,CORTherapist,Healthy expressions of #sadness move the sadne...,en,,['sadness']


## Make the tweet column hashtag-free.

In [10]:
import re

hashtag_re = re.compile(r'#\w+')

# Strip every hashtag (the '#' together with the word that follows it).
df["Text"] = df["Text"].map(lambda tweet: hashtag_re.sub('', tweet))
# Every tweet in this frame gets the positive (depression) label.
df["Depression"] = 1
# Only the text and its label are kept from here on.
df = df.drop(columns=['Tweet Id', 'Mention', 'Hashtags', 'Username', 'Datetime', 'Language'])

print(df.head(5))

                                                 Text  Depression
23   not a single comment on my recent fic yet.. 😖😣😔            1
28  its so sad when u take down all the christmas ...           1
30  If love could have saved you, I know you would...           1
33  too much happiness always overflowed into tear...           1
44  Sometimes it takes a little bit of sadness to ...           1


# Cleaning Non-Depressive Tweets

### Importing Extracted Data 

In [11]:
# Load the four non-depression CSV exports.
non_dep_sources = ("non_dep_happy.csv", "non_dep_selflove.csv",
                   "non_dep_positivevibes.csv", "non_dep_inspiration.csv")
df_1, df_2, df_3, df_4 = (pd.read_csv(csv_path) for csv_path in non_dep_sources)

# Stack the frames row-wise and renumber the index from 0.
df_non = pd.concat([df_1, df_2, df_3, df_4], axis='rows', ignore_index=True)

print("Shape:", df_non.shape)
print("Unique tweets:", df_non['Tweet Id'].nunique())

Shape: (22004, 9)
Unique tweets: 20823


### Dropping/removing duplicate tweets.

In [12]:
# Keep only the first row seen for every tweet id.
df_non = df_non.drop_duplicates(subset="Tweet Id", keep="first")
print("Shape:", df_non.shape)

Shape: (20823, 9)


In [13]:
# Show up to 100 characters per cell so more of each tweet is visible.
# This is a global pandas display option and also affects later previews.
pd.set_option('display.max_colwidth', 100)

print("Shape:", df_non.shape)
df_non.head()

Shape: (20823, 9)


Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Username,Text,Language,URL,Mention,Hashtags
0,0,2020-12-30 23:59:58+00:00,1344433078320521217,txtraveltegal,#love #TFLers #instagood #tweegram #photooftheday #me #instamood #cute #iphonesia #summer #tbt ...,und,https://twitter.com/txtraveltegal/status/1344433078320521217,,"['love', 'TFLers', 'instagood', 'tweegram', 'photooftheday', 'me', 'instamood', 'cute', 'iphones..."
1,1,2020-12-30 23:59:31+00:00,1344432963954438146,pr_deciel,あっという間に、今年最後の日＝大晦日となりました\n\nやり残したことはありませんか？\n\nデシェルの「福袋」買いましたか？\n\nたいへん、夢に出ちゃいますよｗ\n\n年末年始だけの数量限...,ja,https://twitter.com/pr_deciel/status/1344432963954438146,,"['福袋', 'HAPPY', '限定', '2021年', '新春', '化粧品', '夢に出る', 'やり残し', '数量限定']"
2,2,2020-12-30 23:59:13+00:00,1344432887035256834,braintickler_,JOKE OF THE DAY : Whoever invented the knock-knock joke should get a no bell prize. #humor #meme...,en,https://twitter.com/braintickler_/status/1344432887035256834,,"['humor', 'memesdaily', 'rofl', 'jokeoftheday', 'funny', 'lol', 'lmao', 'happy', 'followme', 'lo..."
3,3,2020-12-30 23:59:07+00:00,1344432862381051904,PandaMartini,"Dear writing community, I want a #penpal. I have #stickers, a #cricut, and lots of #stationary. ...",en,https://twitter.com/PandaMartini/status/1344432862381051904,,"['penpal', 'stickers', 'cricut', 'stationary', 'cute', 'gifts', 'notgoingout', 'Lonely', 'spread..."
4,4,2020-12-30 23:57:15+00:00,1344432391683715072,Rhaulli_Panda,#merrychristmas🎄 #merryxmas #happy #newyear #happynewyear #happynewyear2021 #2021 #2021春婚 #dicie...,es,https://twitter.com/Rhaulli_Panda/status/1344432391683715072,,"['merrychristmas', 'merryxmas', 'happy', 'newyear', 'happynewyear', 'happynewyear2021', '2021春婚'..."


### Dropping irrelevant columns.

In [14]:
# 'URL' is the tweet's own link and 'Unnamed: 0' looks like a saved row index;
# neither is used further on.
df_non = df_non.drop(columns=['Unnamed: 0', 'URL'])
print("Shape:", df_non.shape)

Shape: (20823, 7)


## Filtering Data

### Dropping tweets that are not in English language.

In [15]:
# Keep English tweets only. The language code is compared exactly: a substring
# test would also accept any other code that merely contains "en", and `.eq`
# treats a missing language as "not English" instead of producing a NaN mask.
df_non = df_non[df_non['Language'].eq('en')]
print("Shape:", df_non.shape)

Shape: (15708, 7)


### Remove entries containing URLs, they may be promotional tweets.

In [16]:
# Same link filter as for the depression tweets. The pattern is spelled out
# here so this cell no longer depends on the single-letter global `s` that was
# defined many cells earlier.
url_pattern = 'https:|Https:'
df_non = df_non[~df_non.Text.str.contains(url_pattern)]
print("Shape:", df_non.shape)

Shape: (3835, 7)


### Remove entries with at (i.e. @) mentions.

In [17]:
# Keep only rows whose 'Mention' value is missing, i.e. tweets without @-mentions.
no_mentions = df_non['Mention'].isna()
df_non = df_non.loc[no_mentions]
df_non.shape

(3162, 7)

In [18]:
# Inspect the first six tweets that survived the language, URL and mention filters.
df_non.head(6)

Unnamed: 0,Datetime,Tweet Id,Username,Text,Language,Mention,Hashtags
2,2020-12-30 23:59:13+00:00,1344432887035256834,braintickler_,JOKE OF THE DAY : Whoever invented the knock-knock joke should get a no bell prize. #humor #meme...,en,,"['humor', 'memesdaily', 'rofl', 'jokeoftheday', 'funny', 'lol', 'lmao', 'happy', 'followme', 'lo..."
3,2020-12-30 23:59:07+00:00,1344432862381051904,PandaMartini,"Dear writing community, I want a #penpal. I have #stickers, a #cricut, and lots of #stationary. ...",en,,"['penpal', 'stickers', 'cricut', 'stationary', 'cute', 'gifts', 'notgoingout', 'Lonely', 'spread..."
24,2020-12-30 23:42:58+00:00,1344428798624878592,ahmedsamiirr1,#Happy _new _year ♥♥,en,,['Happy']
27,2020-12-30 23:41:45+00:00,1344428491647889417,MasugzyoN,Some of us had to make it on our own #Happy New year Eve to all of us,en,,['Happy']
37,2020-12-30 23:31:06+00:00,1344425812553052174,braintickler_,"JOKE OF THE DAY : Have you heard about the film ""Constipation"", you probably haven't because it'...",en,,"['humor', 'memesdaily', 'rofl', 'jokeoftheday', 'funny', 'lol', 'lmao', 'happy', 'followme', 'lo..."
73,2020-12-30 23:01:32+00:00,1344418372847628289,braintickler_,JOKE OF THE DAY : What concert costs only 45 cents? 50 cent featuring Nickelback. #humor #memesd...,en,,"['humor', 'memesdaily', 'rofl', 'jokeoftheday', 'funny', 'lol', 'lmao', 'happy', 'followme', 'lo..."


## Make the tweet column hashtag-free.

In [19]:
non_dep_hashtag_re = re.compile(r'#\w+')

# Strip every hashtag (the '#' together with the word that follows it).
df_non["Text"] = df_non["Text"].map(lambda tweet: non_dep_hashtag_re.sub('', tweet))
# Every tweet in this frame gets the negative (non-depression) label.
df_non["Depression"] = 0
# Only the text and its label are kept from here on.
df_non = df_non.drop(columns=['Tweet Id', 'Mention', 'Hashtags', 'Username', 'Datetime', 'Language'])

df_non.head(5)

Unnamed: 0,Text,Depression
2,JOKE OF THE DAY : Whoever invented the knock-knock joke should get a no bell prize.,0
3,"Dear writing community, I want a . I have , a , and lots of . DM me? I also will send 🥰 Over 1...",0
24,_new _year ♥♥,0
27,Some of us had to make it on our own New year Eve to all of us,0
37,"JOKE OF THE DAY : Have you heard about the film ""Constipation"", you probably haven't because it'...",0


# Training

### Combining/concatenating dataframes.

In [20]:
# Stack the depression (label 1) and non-depression (label 0) tweets into one
# frame with a fresh 0..n-1 index.
# NOTE: `df` is overwritten with the combined frame, so running this cell a
# second time would append `df_non` again.
df = pd.concat([df, df_non], ignore_index=True, axis='rows')
df.head()

Unnamed: 0,Text,Depression
0,not a single comment on my recent fic yet.. 😖😣😔,1
1,its so sad when u take down all the christmas decorations \n,1
2,"If love could have saved you, I know you would still be here with me.\n",1
3,"too much happiness always overflowed into tears of sorrow.\n\n-Amy Tan, The Hundred Secret Sense...",1
4,Sometimes it takes a little bit of sadness to know what happiness is.,1


In [21]:
# (rows, columns) of the combined dataset.
df.shape

(7191, 2)

### NaNs?

In [22]:
# Count the missing values in each column of the combined frame.
print("Number of NaNs:\n", df.isna().sum())

Number of NaNs:
 Text          0
Depression    0
dtype: int64


### Number of Depression/non-Depression related tweets?

In [23]:
# Class balance: number of rows per label (1 = depression, 0 = non-depression).
df.Depression.value_counts()

1    4029
0    3162
Name: Depression, dtype: int64

In [None]:
import itertools

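**itertools** provides building blocks for working with iterables (anything that can be stepped over with a for-loop). In this notebook its `groupby` is used to find runs of repeated characters while cleaning tweets.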

In [None]:
from torchtext.data import Field, BucketIterator, TabularDataset

**Field** class models common text processing datatypes that can be represented by tensors. It holds a
Vocab object that defines the set of possible values for elements of the field and their corresponding
numerical representations. The Field object also holds other parameters relating to how a datatype should be numericalized, such as a tokenization method and the kind of Tensor that should be produced.

**BucketIterator** defines an iterator that batches examples of similar lengths together.
Minimizes amount of padding needed while producing freshly shuffled batches for each new epoch.
See pool for the bucketing procedure used.

**TabularDataset** defines a Dataset of columns stored in CSV, TSV, or JSON format.

In [None]:
import torch
import torch.nn as nn

**torch.nn** is a module that contains different classes that help you build neural network models.

In [None]:
import torch.optim as optim

**torch.optim** is a package implementing various optimization algorithms.

In [None]:
from torch.autograd import Variable

**Variables** are wrappers around tensors that let gradients be computed automatically. (Since PyTorch 0.4, `Variable` has been merged into `Tensor`, so plain tensors already support autograd.)

In [None]:
import torch.nn.functional as F

**torch.nn.functional** contains useful functions, such as activation functions and convolution operations.

You would use the `torch.nn.functional` convolution operations to define a custom layer (for example, one built around a convolution), but not to define a standard convolution layer.

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torchtext

**pack_padded_sequence:** Packs a Tensor containing padded sequences of variable length.

**pad_packed_sequence:** Pads a packed batch of variable length sequences.

#### But, why do we "pack" the sequences?

When training RNN (LSTM or GRU or vanilla-RNN), it is difficult to batch the variable length sequences. For example: if the length of sequences in a size 8 batch is [4,6,8,5,4,3,7,8], you will pad all the sequences and that will result in 8 sequences of length 8. You would end up doing 64 computations (8x8), but you needed to do only 45 computations. Moreover, if you wanted to do something fancy like using a bidirectional-RNN, it would be harder to do batch computations just by padding and you might end up doing more computations than required.

Instead, PyTorch allows us to pack the sequences. Internally, a packed sequence is a tuple of two lists: one contains the elements of the sequences, interleaved by time step, and the other contains the batch size at each time step. This is helpful in recovering the actual sequences as well as in telling the RNN what the batch size is at each time step.

### Renaming columns.

In [None]:
# Use the column names under which the torchtext fields are registered later on.
column_names = {'Text': 'tweet', 'Depression': 'target'}
df = df.rename(columns=column_names)

Since many users write contractions (shortened forms such as "don't") on social media platforms, we expand them to their full forms.

In [None]:
contraction_dict = {"couldn't": "could not", "ain't": "is not", "aren't": "are not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "can't": "cannot", "'cause": "because", "could've": "could have", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def _get_contractions(contraction_dict):
    
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_re

contractions_re = _get_contractions(contraction_dict)
contractions = contraction_dict

def replace_contractions(text):
    
    def replace(match):
        return contractions[match.group(0)]
    
    return contractions_re.sub(replace, text)

### Cleaning tweets.

In [24]:
def tweet_clean(text):
    """Normalise one raw tweet into plain text for tokenisation.

    Steps, in order: drop markup tags, URLs, @mentions, digits and '#'
    symbols; expand contractions; collapse whitespace; shorten any run of a
    repeated character to at most two; finally replace everything except
    ASCII letters, digits and , ? . ! with a space.

    Args:
        text: the raw tweet string.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    # Remove markup tags, i.e. anything between '<' and '>'.
    text = re.sub(r'<([^>]*)>', ' ', text)

    # Remove URLs (if left). This has to run before the digit removal below:
    # otherwise the digits inside a link are blanked first and only the part
    # of the link in front of the first digit is recognised as a URL.
    text = re.sub(r'https?:/\/\S+', ' ', text)

    # Remove @mentions (if left); done before digit removal for the same reason.
    text = re.sub(r'@\w+', ' ', text)

    # Remove numbers (if any).
    text = re.sub(r'[0-9]+', ' ', text)

    # Remove the hashtag symbol (if left) but keep the word after it.
    text = re.sub(r'#', '', text)

    text = replace_contractions(text)

    # Collapse runs of spaces, newlines and tabs into a single space.
    pattern = re.compile(r"[ \n\t]+")
    text = pattern.sub(" ", text)

    # Shorten runs of the same character to at most two ("sooo" -> "soo").
    text = "".join("".join(s)[:2] for _, s in itertools.groupby(text))

    # Remove all symbols and punctuation (emojis included) except , ? . !
    text = re.sub(r'[^A-Za-z0-9,?.!]+', ' ', text)

    return text.strip()

# spaCy is used below but was never imported in this notebook.
import spacy

# torchtext has trouble with "\n" inside a field, so replace it with a space.
# `progress_apply` only exists after `tqdm.pandas()` has been called, which
# this notebook never does; plain `apply` gives the same result.
df['tweet'] = df.tweet.apply(lambda x: re.sub('\n', ' ', x))

# Only tokenisation is needed, so the other pipeline components are disabled.
# NOTE(review): the 'en' shortcut name is resolved by spaCy 2.x only; on
# spaCy 3+ the model has to be loaded by package name - confirm the installed version.
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

def tokenizer(s):
    """Clean a raw tweet with `tweet_clean` and return its lower-cased spaCy tokens."""
    return [w.text.lower() for w in nlp(tweet_clean(s))]

# Tweet text: tokenised sequences; batches also carry the sequence lengths.
TEXT = Field(sequential=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
# Label: already numeric, so no vocabulary, padding or unknown token.
TARGET = Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None, is_target =False)

# Column layout of the dataset files: first column ignored, then tweet, then target.
data_fields = [(None, None), ("tweet", TEXT), ("target", TARGET)]

### Splitting Dataset.

In [None]:
from sklearn.model_selection import train_test_split

def split_train_test(df, test_size=0.2, random_state=42):
    """Randomly split `df` into two parts, each with a fresh 0..n-1 index.

    Args:
        df: the frame to split.
        test_size: passed to `train_test_split` as the size of the second part.
        random_state: seed passed to `train_test_split`; the default 42 keeps
            the previously hard-coded behaviour.

    Returns:
        A tuple `(first_part, second_part)` of re-indexed frames.
    """
    first_part, second_part = train_test_split(df, test_size=test_size, random_state=random_state)
    return first_part.reset_index(drop=True), second_part.reset_index(drop=True)

# Hold out 20% of the rows as the test set, then split the remainder 80/20
# into training and validation sets.
# NOTE: the name `train` is rebound further down by `def train(...)`.
train_val, test = split_train_test(df, test_size=0.2)
train, val = split_train_test(train_val, test_size=0.2)

### Twitter pre-trained word vectors.

Word embeddings are a set of NLP techniques where individual words are mapped to a real-value vector in a high-dimensional space. The vectors are learned in such a way that words that have similar meanings will have similar representation in the vector space.

Pre-trained word embeddings are the embeddings learned in one task that are used for solving another similar task. These embeddings are trained on large datasets, saved, and then used for solving other tasks.

In [25]:
import os
import tempfile

# `train`, `val` and `test` are pandas frames, but torchtext needs Dataset
# objects, and `train_data` / `val_data` / `test_data` were never created.
# Round-trip the frames through CSV: the row index is written as the first
# column, which is the column `data_fields` skips with `(None, None)`.
# NOTE(review): assumes TabularDataset reads its file eagerly, so the temporary
# directory can be removed afterwards - confirm against the installed torchtext.
with tempfile.TemporaryDirectory() as split_dir:
    for split_name, split_frame in (("train", train), ("val", val), ("test", test)):
        split_frame.to_csv(os.path.join(split_dir, f"{split_name}.csv"))
    train_data, val_data, test_data = TabularDataset.splits(
        path=split_dir, train="train.csv", validation="val.csv", test="test.csv",
        format="csv", skip_header=True, fields=data_fields)

# Pre-trained 100-dimensional GloVe Twitter vectors, attached to the vocabulary
# that is built from the training split only.
vec = torchtext.vocab.Vectors('glove.twitter.27B.100d.txt')
TEXT.build_vocab(train_data,max_size = 100_000,vectors=vec)
TARGET.build_vocab(train_data)

# Small demo iterators (batch size 3) over the Dataset objects, not the frames.
train_loader, val_loader, test_loader = BucketIterator.splits(datasets=(train_data, val_data, test_data), batch_sizes=(3,3,3), sort_key=lambda x: len(x.tweet), device=None, sort_within_batch=True, repeat=False)

def idxtosent(batch, idx):
    """Return column `idx` of `batch.tweet[0]` decoded into a space-separated string.

    Each id in that column is looked up in `TEXT.vocab.itos`.
    """
    token_ids = batch.tweet[0][:, idx].cpu().data.numpy()
    words = [TEXT.vocab.itos[token_id] for token_id in token_ids]
    return ' '.join(words)

class BatchGenerator:
    """Wrap an iterable of batches and yield `(X, y)` pairs from two named attributes."""

    def __init__(self, dl, x_field, y_field):
        # dl: the wrapped iterable; x_field / y_field: attribute names read from each batch.
        self.dl = dl
        self.x_field = x_field
        self.y_field = y_field

    def __len__(self):
        """Number of batches, as reported by the wrapped iterable."""
        return len(self.dl)

    def __iter__(self):
        """Yield `(batch.<x_field>, batch.<y_field>)` for every batch in `dl`."""
        x_name, y_name = self.x_field, self.y_field
        for batch in self.dl:
            yield getattr(batch, x_name), getattr(batch, y_name)
            
# Wrap the loader so it yields (batch.tweet, batch.target) pairs.
train_batch_it = BatchGenerator(train_loader, 'tweet', 'target')
# Number of entries in the vocabulary built for TEXT.
vocab_size = len(TEXT.vocab)

In [None]:
# Width of a word vector; matches the 100-dimensional GloVe file loaded earlier.
embedding_dim = 100
# Hidden size handed to the model as `n_hidden`.
n_hidden = 64
# Number of output classes (target is 0 or 1).
n_out = 2

class ConcatPoolingGRUAdaptive(nn.Module):
    """GRU classifier over frozen pre-trained embeddings with concat pooling.

    The GRU outputs are average-pooled and max-pooled over time, the two
    pooled vectors are concatenated, passed through dropout and a linear
    layer, and returned as log-probabilities.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of one embedding vector.
        n_hidden: GRU hidden size per direction.
        n_out: number of output classes.
        pretrained_vec: tensor copied into the embedding weights; it is not
            trained (`requires_grad` is switched off).
        dropout: dropout probability applied to the pooled features.
        bidirectional: whether the GRU runs in both directions.
    """

    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, dropout, bidirectional=True):

        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.bidirectional = bidirectional
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)
        self.emb.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)

        # Two pooled vectors (avg + max) are concatenated, each as wide as the
        # GRU output: n_hidden per direction.
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(2 * n_directions * self.n_hidden, self.n_out)

        self.dropout = nn.Dropout(dropout)

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state on the same device as the model."""
        n_directions = 2 if self.bidirectional else 1
        # Created on the embedding's device so the model also works after `.to(device)`.
        return torch.zeros((n_directions, batch_size, self.n_hidden),
                           device=self.emb.weight.device)

    def forward(self, seq, lengths):
        """Compute log-probabilities for a batch of padded token-id sequences.

        Args:
            seq: token ids with time on dim 0 and batch on dim 1.
            lengths: true length of every sequence, sorted in decreasing
                order as `pack_padded_sequence` expects by default.

        Returns:
            Tensor of shape `(batch, n_out)` holding log-probabilities.
        """
        bs = seq.size(1)
        self.h = self.init_hidden(bs)
        # An embedding lookup is element-wise, so no transposes are needed.
        embs = self.emb(seq)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)
        # (time, batch, features) -> (batch, features, time) for 1-d pooling.
        # NOTE(review): padded time steps take part in both pools.
        features = gru_out.permute(1, 2, 0)
        avg_pool = F.adaptive_avg_pool1d(features, 1).view(bs, -1)
        max_pool = F.adaptive_max_pool1d(features, 1).view(bs, -1)
        cat = self.dropout(torch.cat([avg_pool, max_pool], dim=1))
        outp = self.fc(cat)

        # Explicit class dimension; the implicit form is deprecated.
        return F.log_softmax(outp, dim=1)

In [None]:
def train(model, iterator, optimizer, criterion, num_batch):
    """Train `model` for one pass over `iterator`.

    Args:
        model: module called as `model(X, lengths)`; returns one row of class
            scores per example.
        iterator: yields `((X, lengths), y)` batches.
        optimizer: optimizer that is zeroed and stepped once per batch.
        criterion: callable `criterion(pred, y)` returning a scalar loss tensor.
        num_batch: value the summed batch losses are divided by.

    Returns:
        `(train_loss, train_acc)`: the summed loss divided by `num_batch`, and
        the fraction of correctly predicted examples.
    """
    model.train()
    total_loss_train = 0.0
    n_correct = 0
    n_seen = 0

    for (X, lengths), y in iterator:

        # Use the optimizer that was passed in, not a global one.
        optimizer.zero_grad()
        # `.cpu()` first, so this also works when the batch lives on a GPU.
        pred = model(X, lengths.cpu().numpy())
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        pred_idx = torch.max(pred, dim=1)[1]
        # Count hits with tensor ops, so no conversion to numpy is needed.
        n_correct += (pred_idx == y).sum().item()
        n_seen += y.numel()
        total_loss_train += loss.item()

    train_acc = n_correct / n_seen
    train_loss = total_loss_train / num_batch

    return train_loss, train_acc

In [26]:
from sklearn.metrics import accuracy_score

def evaluate(model, iterator, criterion, num_batch):
    """Score `model` on `iterator` without updating it.

    The model is switched to evaluation mode (dropout off) and gradients are
    not tracked. The previous train/eval mode is restored before returning.

    Args:
        model: module called as `model(X, lengths)`; returns one row of class
            scores per example.
        iterator: yields `((X, lengths), y)` batches.
        criterion: callable `criterion(pred, y)` returning a scalar loss tensor.
        num_batch: value the summed batch losses are divided by.

    Returns:
        `(valloss, valacc)`: the summed loss divided by `num_batch`, and the
        fraction of correctly predicted examples.
    """
    total_loss_val = 0.0
    n_correct = 0
    n_seen = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for (X, lengths), y in iterator:

                pred = model(X, lengths.cpu().numpy())
                loss = criterion(pred, y)
                pred_idx = torch.max(pred, 1)[1]
                n_correct += (pred_idx == y).sum().item()
                n_seen += y.numel()
                total_loss_val += loss.item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    valacc = n_correct / n_seen
    valloss = total_loss_val / num_batch

    return valloss, valacc

# `device` was used below without ever being defined.
# Use the GPU when one is available; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training-size iterators (batch size 32) that place their batches on `device`.
train_loader, val_loader, test_loader = BucketIterator.splits(datasets=(train_data, val_data, test_data), batch_sizes=(32,32,32), sort_key=lambda x: len(x.tweet), device=device, sort_within_batch=True, repeat=False)
train_batch_it = BatchGenerator(train_loader, 'tweet', 'target')
val_batch_it = BatchGenerator(val_loader, 'tweet', 'target')
test_batch_it = BatchGenerator(test_loader, 'tweet', 'target')

# Model initialised with the vectors attached to the 'tweet' vocabulary; dropout 0.5.
m = ConcatPoolingGRUAdaptive(vocab_size, embedding_dim, n_hidden, n_out, train_data.fields['tweet'].vocab.vectors, 0.5).to(device)
# Optimise only the parameters that still require gradients, with learning rate 1e-3.
opt = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), 1e-3)
# The model returns log-probabilities, so negative log-likelihood is the loss.
loss_fn = F.nll_loss

In [27]:
epochs = 12
# Lowest validation loss seen so far; start at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(epochs):

    train_loss, train_acc = train(m, iter(train_batch_it), opt, loss_fn, len(train_batch_it))
    valid_loss, valid_acc = evaluate(m, iter(val_batch_it), loss_fn, len(val_batch_it))

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:

        best_valid_loss = valid_loss
        torch.save(m.state_dict(), 'tut4-model.pt')

    # Report accuracies as percentages.
    train_acc = train_acc*100
    valid_acc = valid_acc*100

    print(f'Epoch {epoch}: Train Accuracy: {train_acc:.2f}, Train Loss: {train_loss:.2f}')
    print(f'Epoch {epoch}: Val Accuracy: {valid_acc:.2f}, Val Loss: {valid_loss:.2f}')

Epoch 0: Train Accuracy: 70.03, Train Loss: 0.59
Epoch 0: Val Accuracy: 69.37, Val Loss: 0.60
Epoch 1: Train Accuracy: 71.62, Train Loss: 0.56
Epoch 1: Val Accuracy: 71.23, Val Loss: 0.56
Epoch 2: Train Accuracy: 73.17, Train Loss: 0.52
Epoch 2: Val Accuracy: 73.02, Val Loss: 0.53
Epoch 3: Train Accuracy: 75.67, Train Loss: 0.48
Epoch 3: Val Accuracy: 74.37, Val Loss: 0.49
Epoch 4: Train Accuracy: 77.03, Train Loss: 0.47
Epoch 4: Val Accuracy: 76.34, Val Loss: 0.47
Epoch 5: Train Accuracy: 79.32, Train Loss: 0.45
Epoch 5: Val Accuracy: 79.26, Val Loss: 0.46
Epoch 6: Train Accuracy: 80.54, Train Loss: 0.43
Epoch 6: Val Accuracy: 79.75, Val Loss: 0.45
Epoch 7: Train Accuracy: 81.53, Train Loss: 0.41
Epoch 7: Val Accuracy: 81.23, Val Loss: 0.43
Epoch 8: Train Accuracy: 83.01, Train Loss: 0.40
Epoch 8: Val Accuracy: 82.37, Val Loss: 0.41
Epoch 9: Train Accuracy: 84.11, Train Loss: 0.38
Epoch 9: Val Accuracy: 84.02, Val Loss: 0.40
Epoch 10: Train Accuracy: 86.03, Train Loss: 0.36
Epoch 10: 