# Formative 3: Particle Prediction

In [1]:
import re
import numpy as np
import pandas as pd
import torch
import nltk

from tqdm import tqdm
from collections import defaultdict, Counter
from html import unescape
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.nn import functional as F

In [2]:
pd.options.mode.chained_assignment = None

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Define function to clean and split text
def clean(text):
    text = unescape(text)
    return [re.sub('[^a-z]', '', w.lower()) for w in text.strip().split()]

In [4]:
# Load dataframe
df = pd.read_csv('formative3_data_us_equities_news.csv')

In [5]:
df

Unnamed: 0.1,Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327.0
1,1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196.0
2,2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249.0
3,3,221518,NIO,NIO NVAX among premarket gainers,news,Cemtrex NASDAQ CETX 85 after FY results \n...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039.0
4,4,221519,NIO,PLUG NIO among premarket gainers,news,aTyr Pharma NASDAQ LIFE 63 on Kyorin Pharm...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096.0
...,...,...,...,...,...,...,...,...,...,...
141243,142221,440043,CMCSA,Comcast Customer Data Bundling is Dying and ...,news,The concept is simple enough Customers of one...,2020-01-29,The Motley Fool,https://invst.ly/poet0,2070527.0
141244,142222,440044,CMCSA,AT T Ended 2019 With Fewer Video Subscribers T...,news,AT T NYSE T lost another 1 16 million video ...,2020-01-30,The Motley Fool,https://invst.ly/po-ev,2071864.0
141245,142223,440045,CMCSA,Comcast CMCSA Beats Q4 Earnings And Revenue ...,opinion,Comcast CMCSA came out with quarterly earnin...,2020-01-22,Zacks Investment Research,https://www.investing.com/analysis/comcast-cmc...,200500909.0
141246,142224,440046,EBAY,3 Views To Bid On eBay At 45 50,opinion,The auction giant has long been discussed as h...,2013-06-04,Gregory W. Harmon,https://www.investing.com/analysis/3-views-to-...,169867.0


In [6]:
# Remove columns not needed for formative
df = df[['content', 'provider']]
df['content'] = df['content'].astype(str)

# Remove empty comments
df = df[df.content.apply(lambda x: len(clean(x))) > 0]

In [7]:
#Define a dictionnary for particle lookup ( from part. to part id)
p2id = {'give': 0, 'buy': 1, 'hold': 2, 'sell': 3}

# Define dictionary for reverse particle look-up (from id to part.)
id2p = {v: k for k, v in p2id.items()}

# Initialize lists for storing contexts around particles
sent_1 = list()
sent_2 = list()

# Initialize list for storing labels
labels = list()

# Loop over comments
for c in tqdm(df.content):
    
    # Loop over individual sentences
    for s in nltk.sent_tokenize(c):
        
        # Clean and split sentence
        split = clean(s)
        
        if len(split) < 10:
            continue
        
        # Add sentence to list if only one particle in sentence
        if len([w for w in split if w in p2id]) == 1:
            
            # Identify particle
            p = [w for w in split if w in p2id][0]

            # Store contexts and label
            sent_1.append(split[:split.index(p)])
            sent_2.append(split[split.index(p) + 1:])
            labels.append(p)

100%|██████████| 141248/141248 [02:22<00:00, 988.18it/s]


In [8]:
# Create dataframe with contexts and labels and perform stratified sampling
p_df = pd.DataFrame({'sent_1': sent_1, 'sent_2': sent_2, 'label': labels})[['sent_1', 'sent_2', 'label']]
p_df = p_df.groupby('label', group_keys=False).apply(lambda x: x.sample(n=1500, random_state=123, replace = True)).reset_index(drop=True)

In [9]:
# Split dataframe into training, evaluation, and test data
train, dev_test = train_test_split(p_df, test_size=0.2, stratify=p_df['label'], random_state=123)
dev, test = train_test_split(dev_test, test_size=0.5, stratify=dev_test['label'], random_state=123)

In [10]:
p_df.head()

Unnamed: 0,sent_1,sent_2,label
0,"[a, , contraction, for, the, u, s, economy, du...","[t, mobile, us, nyse, tmus, commodities, finis...",buy
1,"[getting, big, returns, from, financial, portf...",[],buy
2,"[kb, home, nyse, kbh, announced, the, addition...","[you, can, see, other, top, ranked, stocks, in...",buy
3,"[investors, interested, in, retail, discount, ...","[right, now, this, system, places, an, emphasi...",buy
4,"[shares, of, broadridge, financial, solutions,...","[another, favorably, placed, stock, in, the, o...",buy


In [11]:
# Hint : Create dictionary for word look-up, this is helpfull for the word encoding later
sent = train.sent_1 + train.sent_2
words = []
for i in sent:
  for j in i:
    words.append(j)
words[0:10]

['the',
 'consumer',
 'staples',
 'select',
 'sector',
 'spdr',
 'nyse',
 'xlp',
 'is',
 'an']

In [12]:
word_counter = Counter(words)
del word_counter['']
w2id = Counter() # words to indices mapper
i = 2
for key,val in word_counter.most_common(5000):
    w2id[key] = i
    i +=1
# Create dictionary for reverse word look-up
id2w = Counter() # don't forget about reverse mapping ( otherwise you don't know how to translate back from encoded representations)

In [13]:
for key,val in w2id.items():
  id2w[val] = key

In [14]:
# Define function to encode sentences
def encode(sen, w2id):
  output = []# encoder should take sentences and output the word to index representation
  for word in sen:
    if word in w2id.keys():
      output.append(w2id[word])
    else:
      output.append(1)
  return output

In [15]:
len(w2id)

5000

In [16]:
# Padding & cutting stage
# once you have your encoded sentences, you need to make lengths uniform
# ie. a sentence like [12,13,5,6,8,9,17] should be lenght 5 and you need to cut it
# ie. a sentence like [12,13,5] should be lenght 5 and you need to pad it with 0
def pad(sent):
    if len(sent) > 5:
        while len(sent) > 5:
          sent.pop()
        
    elif len(sent) < 5:
        while len(sent) < 5:
          sent.append(0)
    return sent

In [17]:
# Helper code 
# Encode and pad sentences
for data in [train, dev, test]:
    
    # Encode and pad left contexts
    data['enc_1'] = data.sent_1.apply(lambda x: pad(encode(x, w2id)))
    
    # Reverse order of right contexts prior to padding
    data['enc_2'] = data.sent_2.apply(lambda x: pad(encode(x, w2id)[::-1]))

In [18]:
# Inspect dataframe to get a sense of the data
train.label.value_counts()

sell    1200
buy     1200
give    1200
hold    1200
Name: label, dtype: int64

In [19]:
def lam(i):
    lis = [0] * 5000
    for j in i:
        if int(j) != 0:
            lis[int(j)] += 1
    return lis

In [20]:
train['combined'] = train['enc_1'] + train['enc_2']

In [140]:
train['combined_lstm'] = train['enc_1_lstm'] + train['enc_2_lstm']

In [21]:
train['to_tensor'] = train['combined'].apply(lambda x: lam(x))

In [22]:
arr_X = np.array(train.to_tensor.to_list())
ten_X = torch.from_numpy(arr_X)

In [149]:
arr_X_lstm_right = np.array(train.enc_2_lstm.to_list())
arr_X_lstm_left = np.array(train.enc_1_lstm.to_list())
ten_X_lstm_right = torch.from_numpy(arr_X_lstm_right)
tex_X_lstm_left = torch.from_numpy(arr_X_lstm_left)

In [23]:
train = pd.get_dummies(train,prefix=['label'], columns = ['label'])

In [24]:
Y_arr_1 = train[['label_buy','label_sell','label_give','label_hold']].values

In [25]:
ten_y = torch.from_numpy(Y_arr_1)

In [26]:
test = pd.get_dummies(test,prefix=['label'], columns = ['label'])

In [27]:
Y_arr_2 = test[['label_buy','label_sell','label_give','label_hold']].values
test_y =torch.from_numpy(Y_arr_2)

### Part II: Logistic Regression Classifier

In [119]:
from torchmetrics.classification import Accuracy

In [132]:
# Define logistic regression classifier class
class LRClassifier(nn.Module):
    
    def __init__(self, input_dim, output_dim):
        # don't forget the super layer
        super(LRClassifier, self).__init__()
        # create a Linear layer
        self.linear = nn.Linear(input_dim, output_dim)
    def forward(self,ten_X= ten_X):
        outputs = torch.sigmoid(self.linear(ten_X))
        return outputs
    # create and fill a forward function as well 

In [133]:
ten_X.shape

torch.Size([4800, 5000])

In [134]:
log_reg = LRClassifier(5000,4)

In [135]:
def train_model(X,y,steps = 20):
    model = log_reg
    # Define optimization method
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Define training objective
    criterion = nn.CrossEntropyLoss()
    
    
    model()

    # Set all tensor gradients to zero
    optimizer.zero_grad()
    
    for step in range(steps):

        # Perform forward pass
        y_pred = model(X)

        # Compute loss
        loss = criterion(y_pred, y)

        # Compute gradient of loss with respect to all model parameters
        loss.backward()

        # Perform parameter update
        optimizer.step()
        
        if step %5==0:

            # Put neural net into evaluation mode
            model.eval()

            # Specify that gradients should not be computed during evaluation
            with torch.no_grad():
                y_pred = model(X)    

            y_pred_class = np.zeros_like(y_pred)
            y_pred_class[np.arange(len(y_pred)), y_pred.argmax(1)] = 1
            accuracy = Accuracy(task ='binary',top_k = 1, num_classes=4)
            print(accuracy(y_pred, y))

In [136]:
ten_X, ten_y = ten_X.to(torch.float32), ten_y.to(torch.float32)

In [137]:
train_model(ten_X, ten_y)

tensor(0.7295)
tensor(0.7470)
tensor(0.7492)
tensor(0.7538)


**A) What accuracy would a classifier get that predicts classes based on random guesses? How
does the logistic regression classifier compare to that baseline?**

A random guess classifier would have a 25% accuracy since there are 4 classes. Logistic regression performs better than the random guesses at ~75%

**B) Plot the accuracy as a function of the context window size $k$. What do you observe? What conclusions can you draw regarding the linguistic information necessary for predicting particles?**

...

**C) For each $k$, examine the top 10 predictive words of each particle. Are your observations in line with the hypothesis made above?**

...

### Part III: Feed-forward Neural Network Classifier

In [190]:
# Define feed-forward neural network classifier class
class FNNClassifier(nn.Module):
    def __init__(self,input_dim,output_dim,d):
        super(FNNClassifier, self).__init__()
        self.hidden = nn.Linear(input_dim,d)
        self.tanh = torch.nn.Tanh() 
        self.otpt = nn.Linear(d,output_dim)
    def forward(self, x= ten_X):
        output = self.hidden(x)
        output = self.tanh(output)
        output = self.otpt(output)
        return output

In [187]:
ffnn = FNNClassifier(5000,4,100)

In [None]:
def train_model_2(X,y,steps = 20):
    model = ffnn
    # Define optimization method
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Define training objective
    criterion = nn.CrossEntropyLoss()
    
    
    model()

    # Set all tensor gradients to zero
    optimizer.zero_grad()
    
    for step in range(steps):

        # Perform forward pass
        y_pred = model(X)

        # Compute loss
        loss = criterion(y_pred, y)

        # Compute gradient of loss with respect to all model parameters
        loss.backward()

        # Perform parameter update
        optimizer.step()
        
        if step %5==0:

            # Put neural net into evaluation mode
            model.eval()

            # Specify that gradients should not be computed during evaluation
            with torch.no_grad():
                y_pred = model(X)    
            y_pred_class = np.zeros_like(y_pred)
            y_pred_class[np.arange(len(y_pred)), y_pred.argmax(1)] = 1
            accuracy = Accuracy(task ='binary',top_k = 1, num_classes=4)
            print(float(accuracy(y_pred, y)))
            

In [131]:
train_model_2(ten_X, ten_y)

tensor(0.8706)
tensor(0.8634)
tensor(0.8723)
tensor(0.8707)


**A) How
does this classifier compare to the logistic regression classifier?**

Generally, higher accuracy is observed in this classifier.

**B) Plot the accuracy as a function of the hidden dimension $d$. What do you observe?**

In [192]:
for d in [5,10,50,100,500]:
  ffnn = FNNClassifier(5000,4,d)
  train_model_2(ten_X,ten_y)

tensor(0.6250)
tensor(0.6484)
tensor(0.6815)
tensor(0.6840)
tensor(0.2514)
tensor(0.5305)
tensor(0.5998)
tensor(0.6142)
tensor(0.7708)
tensor(0.6978)
tensor(0.6837)
tensor(0.7079)
tensor(0.6078)
tensor(0.6913)
tensor(0.6913)
tensor(0.7291)
tensor(0.6915)
tensor(0.7046)
tensor(0.7423)
tensor(0.7097)


Accuracy is generally improving as d increases. However, at d=50 a higher accuracy is achieved as compared to d=500!

### Part IV: LSTM Classifier

In [138]:
w2id_2 = Counter() # words to indices mapper
i = 1
for key,val in word_counter.most_common():
    w2id_2[key] = i
    i +=1

In [143]:
len(w2id_2)

43487

In [139]:
for data in [train, dev, test]:
    
    # Encode and pad left contexts
    data['enc_1_lstm'] = data.sent_1.apply(lambda x: pad(encode(x, w2id_2)))
    
    # Reverse order of right contexts prior to padding
    data['enc_2_lstm'] = data.sent_2.apply(lambda x: pad(encode(x, w2id_2)[::-1]))

In [None]:
# Define LSTM classifier class --> you should be familiar with pytorch syntax 
# this is a helper as the lstm itself requires some careful thought so focus 
# on the steps described in the lecture slides for the LSTM
# for the cell, hidden states and the input, forget and output gates
# code it up in this LSTM class
class LSTMClassifier(nn.Module):
    
    # Pass hyperparameters as arguments
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout, context):
        
        super(LSTMClassifier, self).__init__()
        

In [161]:
len(tex_X_lstm_left)

4800

In [166]:
from torch.utils.data import DataLoader

# create a DataLoader for the left input dataset
left_input_dataloader = DataLoader(tex_X_lstm_left, batch_size=5, shuffle=True)

# create a DataLoader for the right input dataset
right_input_dataloader = DataLoader(ten_X_lstm_right, batch_size=5, shuffle=True)

# create a DataLoader for the target dataset
target_dataloader = DataLoader(ten_y, batch_size=5, shuffle=True)


In [193]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, dropout):
        super().__init__()
        
        self.embedding_left = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm_left = nn.LSTM(embedding_dim, hidden_dim, bidirectional=False)
        self.embedding_right = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm_right = nn.LSTM(embedding_dim, hidden_dim, bidirectional=False)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text_left, text_right):
        # text_left and text_right should be tensors of shape (seq_length, batch_size)
        
        embedded_left = self.dropout(self.embedding_left(text_left))
        embedded_right = self.dropout(self.embedding_right(text_right.flip(0)))
        
        # flip the right context to process from right to left
        output_left, (hidden_left, cell_left) = self.lstm_left(embedded_left)
        output_right, (hidden_right, cell_right) = self.lstm_right(embedded_right)
        
        # concatenate the last hidden states of both LSTMs
        hidden_cat = torch.cat((hidden_left[-1], hidden_right[-1]), dim=1)
        
        # pass through the linear layer
        linear_output = self.linear(self.dropout(hidden_cat))
        
        return linear_output

In [194]:
model = LSTMClassifier(43487,300,200,4,0,0.5)

In [199]:
# define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# define the loss function
loss_function = nn.CrossEntropyLoss()
i=1
# train the model for 1 epoch
if i == 1:

    # set the model to training mode
    model.train()

    # iterate over the training data
    for i, (left_input_batch, right_input_batch, target_batch) in enumerate(zip(left_input_dataloader, right_input_dataloader, target_dataloader)):

        # clear the gradients
        optimizer.zero_grad()

        # forward pass
        output = model(left_input_batch, right_input_batch)
        loss = loss_function(output, target_batch)

        # backward pass and optimization step
        loss.backward()
        optimizer.step()

print(output)

tensor([[ 0.1101,  0.3555, -0.0267, -0.1914],
        [ 0.0600,  0.1466,  0.2469,  0.0702],
        [-0.2927, -0.3884, -0.0413,  0.0460],
        [-0.4975, -0.1608, -0.2706,  0.0915],
        [-0.3184, -0.5457,  0.0402, -0.3198]], grad_fn=<AddmmBackward0>)


**A) How
does the LSTM classifier compare to the feed-forward neural network classifier?**

...

**B)  Modify the LSTM architecture so that it only takes the left or right context into account. Train and test these two models. Which of the two contexts provides more information for
particle prediction?**

...

**C) Tabulate the number of misclassified examples as a function of the number of UNK tokens
in the left and right contexts. Manually inspect a couple of misclassified examples. What do
you observe? How do your observations relate to results of earlier parts of the formative?**

...

**D) Create a confusion matrix of the predicted labels versus the true labels. What do you
observe?**

...

### Part V: Overall Discussion

**Compare the three models to the trigram model presented in class. What information is available for the classifier in each of the four approaches? Are you able to interpret the overall
success of the models in relation to the information that is available in each one and the ability
to exploit it in an optimal fashion?**

...