In [1]:
import re
import string
import random
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
import trax.fastmath.numpy as np
from trax import layers as tl
from trax import optimizers
from trax.supervised import training

## Loading tweets

In [2]:
VAL_PCT = 0.2

In [3]:
all_train_tweets = pd.read_csv('train.csv')
all_test_tweets = pd.read_csv('test.csv')

## Preprocessing

In [4]:
def process_tweet(tweet, remove_stopwords=False, stem=False):
    # Remove hyper-links
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#', '', tweet)
    # Remove stock market tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # Remove old style tweet text RT
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    if remove_stopwords
    
    
    # Tokenize tweet
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    return [word for word in tokenizer.tokenize(tweet) if word not in string.punctuation]

In [5]:
all_train_tweets['clean'] = all_train_tweets.text.map(process_tweet, args(True,))
all_test_tweets['clean'] = all_test_tweets.text.map(process_tweet, args(True,))

In [6]:
# Building vocabulary
vocab = {
    '__PAD__': 0,
    '__<e>__': 1,
    '__UNK__': 2,
}
for tweet in all_train_tweets.clean.to_list():
    for word in tweet:
        if word not in vocab:
            vocab[word] = len(vocab)

# Tweet to tensor
def tweet_to_tensor(tweet, vocab):
    return [vocab.get(token, vocab['__UNK__']) for token in tweet]

all_train_tweets['clean'] = all_train_tweets.clean.apply(tweet_to_tensor, args=(vocab,))
all_test_tweets['clean'] = all_test_tweets.clean.apply(tweet_to_tensor, args=(vocab,))

In [7]:
# Train/Validation split
all_pos_train = all_train_tweets.loc[all_train_tweets.target == 1]
all_neg_train = all_train_tweets.loc[all_train_tweets.target == 0]

pos_cut_idx = int(all_pos_train.shape[0] * (1 - VAL_PCT))
pos_val = all_pos_train.iloc[pos_cut_idx:]
pos_train = all_pos_train.iloc[:pos_cut_idx]

neg_cut_idx = int(all_neg_train.shape[0] * (1 - VAL_PCT))
neg_val = all_neg_train.iloc[neg_cut_idx:]
neg_train = all_neg_train.iloc[:neg_cut_idx] 

all_train = pd.concat([pos_train, neg_train])
all_val = pd.concat([pos_val, neg_val])

In [8]:
def data_generator(data_pos, data_neg, batch_size, loop=False):
    len_pos = len(data_pos)
    len_neg = len(data_neg)
    
    pos_idx_lines =  list(range(len_pos))
    neg_idx_lines = list(range(len_neg))
    
    pos_idx = 0
    neg_idx = 0
    
    n_to_take = batch_size // 2
    
    random.shuffle(pos_idx_lines)
    random.shuffle(neg_idx_lines)
    
    stop = False
    
    while not stop:
        batch = []
        targets = []
        max_len = 0
        for i in range(n_to_take):
            if pos_idx >= len_pos or neg_idx >= len_neg:
                if not loop:
                    stop = True
                    break
                if pos_idx >= len_pos:
                    pos_idx = 0
                    random.shuffle(pos_idx_lines)
                if neg_idx >= len_neg:
                    neg_idx = 0
                    random.shuffle(neg_idx_lines)
                    
            pos_elem = data_pos[pos_idx]
            batch.append(pos_elem)
            targets.append(1)
            if len(pos_elem) > max_len:
                max_len = len(pos_elem)

            neg_elem = data_neg[neg_idx]
            batch.append(neg_elem)
            targets.append(0)
            if len(neg_elem) > max_len:
                max_len = len(neg_elem)

            pos_idx += 1
            neg_idx += 1
                
        if stop:
            break
            
        pos_idx += n_to_take
        neg_idx += n_to_take
        
        # padding
        for elem in batch:
            elem += [0] * (max_len - len(elem))
            
        example_weights = np.array([1] * (n_to_take * 2))
            
        yield np.array(batch), np.array(targets), example_weights

In [9]:
def train_generator(batch_size, train_pos, train_neg):
    return data_generator(
        data_pos=train_pos,
        data_neg=train_neg,
        batch_size=batch_size,
        loop=True
    )
def val_generator(batch_size, val_pos, val_neg):
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=True
    )

## Building the model

#### First trail, word embeddings with a single hidden layer. Should suffice for sentiment analysis tasks like this one. 

In [10]:
# Embedding dimension. Something to experiemnt with.
EMBED_DIM = 256
BATCH_SIZE = 32
OUTPUT_DIR = './model'

In [11]:
def classifier(vocab_size, embedding_dim):
    return tl.Serial(
        tl.Embedding(
            vocab_size=vocab_size,
            d_feature=embedding_dim,
        ),
        tl.Mean(axis=1),
        tl.Dense(n_units=2),
        tl.LogSoftmax()
    )

## Training the model

In [12]:
def get_train_eval_tasks(train_pos, train_neg, val_pos, val_neg, batch_size):
    train_task = training.TrainTask(
        labeled_data=train_generator(
            batch_size=batch_size,
            train_pos=train_pos,
            train_neg=train_neg
        ),
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=optimizers.Adam(0.01),
        n_steps_per_checkpoint=10,
    )
    eval_task = training.EvalTask(
        labeled_data=val_generator(
            batch_size=batch_size,
            val_pos=val_pos,
            val_neg=val_neg
        ),
        metrics=[
            tl.CrossEntropyLoss(),
            tl.Accuracy(),
        ]
    )
    
    return train_task, eval_task

In [13]:
model = classifier(len(vocab), EMBED_DIM)
train_task, eval_task = get_train_eval_tasks(
    train_pos=pos_train.clean.to_list(),
    train_neg=neg_train.clean.to_list(),
    val_pos=pos_val.clean.to_list(),
    val_neg=neg_val.clean.to_list(),
    batch_size=BATCH_SIZE
)

In [14]:
def train_model(model, train_task, eval_task, n_steps, output_dir):
    training_loop = training.Loop(
        model=model,
        tasks=train_task,
        eval_tasks=eval_task,
        output_dir=output_dir,
    )
    
    training_loop.run(n_steps=n_steps)
    
    return training_loop

In [15]:
training_loop = train_model(
    model=model,
    train_task=train_task,
    eval_task=[eval_task],
    n_steps=100,
    output_dir=OUTPUT_DIR,
)




Step    210: Ran 10 train steps in 4.96 secs
Step    210: train CrossEntropyLoss |  0.28367159
Step    210: eval  CrossEntropyLoss |  0.69616920
Step    210: eval          Accuracy |  0.62500000

Step    220: Ran 10 train steps in 2.37 secs
Step    220: train CrossEntropyLoss |  0.19699201
Step    220: eval  CrossEntropyLoss |  0.54470193
Step    220: eval          Accuracy |  0.75000000

Step    230: Ran 10 train steps in 2.05 secs
Step    230: train CrossEntropyLoss |  0.27555084
Step    230: eval  CrossEntropyLoss |  0.14237022
Step    230: eval          Accuracy |  0.96875000

Step    240: Ran 10 train steps in 2.32 secs
Step    240: train CrossEntropyLoss |  0.16946909
Step    240: eval  CrossEntropyLoss |  0.45608792
Step    240: eval          Accuracy |  0.75000000

Step    250: Ran 10 train steps in 1.50 secs
Step    250: train CrossEntropyLoss |  0.11694790
Step    250: eval  CrossEntropyLoss |  0.49138933
Step    250: eval          Accuracy |  0.78125000

Step    260: Ran 10

## Testing the model

In [16]:
# Prediction
max_len = all_test_tweets.clean.map(len).max().item()

def pad(tweet, max_len):
    return tweet + ([0] * (max_len - len(tweet)))

all_test_tweets['clean'] = all_test_tweets.clean.apply(pad, args=(max_len,))

preds = training_loop.eval_model(np.array(all_test_tweets.clean.to_list()))
target = np.array([pred[1] > pred[0] for pred in preds]).astype(np.float32)
all_test_tweets['target'] = target

In [28]:
# Output
all_test_tweets[['id', 'target']].to_csv('submission.csv', index=False)

## Experiment log:
- First thing, we tried a simple model with an embedding layer, followed by a mean layer, then the output layers. Stopwords were not removed and no stemming was performed. Accuracy was varying between runs, ranging between 0.62500000 0.96875000. Something is definitely wrong.
- Next thing to try, is word removal alone, then stemming alone, then both.