In [46]:
>>> import nltk
>>> nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [47]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import time
import spacy

stemmer = PorterStemmer()
import warnings
warnings.filterwarnings('ignore')
print("GPU Model   ",torch.cuda.get_device_name(0))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

GPU Model    Tesla T4


In [48]:
def clean_data(X):
    STOPWORDS = set(stopwords.words('english'))
    X = X.str.lower()
    X = X.str.replace("[/(){}\[\]\|@,;]", " ")
    X = X.str.replace("[^0-9a-z #+_]", " ")
    X = X.str.replace(r'\d+','')
    X = X.apply(lambda x: ' '.join([w for w in x.split() if (len(w)>2 and w not in STOPWORDS) ] ))
    X = X.apply(lambda x: x.split()) 
    return X

def target_arrange(y):
    
    for i in range(len(y)):
        if y.values[i]=="Negative":
            y.values[i]=0.0
        elif y[i]=="Positive":
            y.values[i]=1.0
        else:
            y.values[i]=2.0
            
    y=y.to_numpy()  
    y=y.reshape(y.shape[0],1)
    y= pd.DataFrame(data=y)
    y=np.ravel(y)
    y=y.astype('float')
    return y

In [49]:
with open("All_Tickers.json","r") as fp:
#with open("General_Market.json",encoding='utf8') as fp:
    json_d = json.load(fp)
 
ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

X= pd.DataFrame(columns=['Date', 'Article','Target'])
X['Date']=pd.to_datetime(df['date'])
X['Article']=df['title']+" "+df['text']
X['Target']=df['sentiment']

X=X.sort_values("Date")

print("Number of Examples : ",len(X),"\n")
X.drop_duplicates(inplace=True)
X.index = range(len(X))
print("Number of Examples after removing duplicates: ",len(X),"\n")

print('Number of words before cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article']=clean_data(X['Article'])
print('Number of words after cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
print("\n******************\n")

X['Target']=target_arrange(X['Target'])
X=X.drop('Date',1)
L=[]
L.append((X['Target']==0.0).sum())
L.append((X['Target']==1.0).sum())
L.append((X['Target']==2.0).sum())

print("Negative Examples : ",(X['Target']==0.0).sum())
print("Positive Examples : ",(X['Target']==1.0).sum())
print("Neutral Examples : ",(X['Target']==2.0).sum())

maximum=max(L)

Weights=[]

for i in L:
  Weights.append(maximum/i)
  
class_weights = torch.FloatTensor(Weights).to(device)
print("\n Weights = ",class_weights)

Number of Examples :  28130 

Number of Examples after removing duplicates:  27937 

Number of words before cleaning :  1067277
Number of words after cleaning :  718391

******************

Negative Examples :  3143
Positive Examples :  9901
Neutral Examples :  14893

 Weights =  tensor([4.7385, 1.5042, 1.0000], device='cuda:0')


In [50]:
X['Article']= X['Article'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
for i in range(len(X['Article'])): #φέρνω τα tokens ξανά μαζί διαχωριζόμενα με κενά
    X['Article'][i] = ' '.join(X['Article'][i])
print(X['Article'])


0        vianet group inc announc unaudit second quarte...
1        krato present canaccord virtual growth confer ...
2        rewalk robot report second quarter financi res...
3        pyxi tanker announc date releas second quarter...
4        bionano genom report second quarter financi re...
                               ...                        
27932    facebook block new polit ad may fall short sti...
27933    amazon growth problem buy good good enough amz...
27934    pyrogenesi sign contract navi two ship build p...
27935    guardant health high price lot potenti guardan...
27936    oil futur post first weekli fall week oil futu...
Name: Article, Length: 27937, dtype: object


In [51]:
X.to_csv (r'General_rnn.csv', index = False, header=True)

In [52]:
torch.manual_seed(1234)
torch.backends.cudnn.deterministic = True

batch=1024
#TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
TEXT = data.Field(batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.long,batch_first=True)

fields = [('text',TEXT),('label', LABEL)]
Train_Data=data.TabularDataset(path = 'General_rnn.csv',format = 'csv',fields = fields,skip_header = True)

X_train, X_test = Train_Data.split(split_ratio=0.75, random_state = random.seed(1234))
X_train, X_val = X_train.split(split_ratio=0.8, random_state = random.seed(1234))

TEXT.build_vocab(X_train, min_freq=2)  
LABEL.build_vocab(X_train)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  
train_it, val_it, test_it = data.BucketIterator.splits((X_train, X_val, X_test),sort_key = lambda x: len(x.text),
    sort_within_batch=True,batch_size = batch,device = device)  



Size of TEXT vocabulary: 14528
Size of LABEL vocabulary: 3
[('stock', 6745), ('earn', 5809), ('inc', 5119), ('compani', 4671), ('announc', 4504), ('result', 4210), ('report', 3204), ('busi', 2862), ('new', 2743), ('investor', 2680)]


In [53]:
def find_accuracy(preds, y):
   
    temp = torch.log_softmax(preds, dim = 1)
    _, y_pred = torch.max(temp, dim = 1) 
    valid = (y_pred == y).float() 
    accur = valid.sum() / len(valid)
    return accur

def Loss_Optimizer (model,valueLR):
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=valueLR)
    return optimizer,criterion

In [54]:
def train(model,data,lr,optimizer,criterion):
        
    #optimizer,criterion=Loss_Optimizer (model,lr)
    #criterion = criterion.to(device)
    model.train()
    sumloss=0.0
    sumacc=0.0
    for i in data:
        text, text_lengths = i.text   
        optimizer.zero_grad()
        pred = model(text, text_lengths).squeeze(0)       
        loss = criterion(pred, i.label)
        acc = find_accuracy(pred, i.label)
        loss.backward()
        optimizer.step()
        sumloss += loss.item()
        sumacc += acc.item()
          
    return sumloss / len(data), sumacc / len(data)

In [55]:
def test(model, data, criterion):
    
    sumloss = 0
    sumacc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for i in data:
            text, text_lengths = i.text
            
            pred = model(text, text_lengths).squeeze(0)
           
            loss = criterion(pred, i.label)
            
            acc = find_accuracy(pred, i.label)

            sumloss += loss.item()
            sumacc += acc.item()
        
    return sumloss / len(data), sumacc / len(data)

In [56]:
def process_test(model,numEpochs,data_tr,data_val,data_test,lr,loss_file,acc_file,best_model):

    optimizer,criterion=Loss_Optimizer (model,lr)
    criterion = criterion.to(device)
    start_time = time.time()
    fileout=open(loss_file,"w")
    fileout2=open(acc_file,"w")
    
    for i in range(numEpochs):
 
        train_loss, train_acc = train(model, data_tr, lr,optimizer,criterion)
        valid_loss, valid_acc = test(model, data_val, criterion)
        fileout.write(str(valid_loss)+"\n")
        fileout2.write(str(valid_acc)+"\n")
        if i==0:
            max=valid_acc
            torch.save(model.state_dict(), best_model)
        elif valid_acc > max:
            max = valid_acc
            torch.save(model.state_dict(), best_model)

        if (i+1)%5==0:
          print("Epoch : ",i+1," Train Loss : ",train_loss,"  Train Acc : ",train_acc,"  Valid Loss : ",valid_loss, " Val Acc : ",valid_acc)
    
    end_time = time.time()
    timeHelp=(end_time-start_time)/60.0
    print("\nTime needed for Training : ",timeHelp)

    fileout.close()
    fileout2.close()  
    
   
    model.load_state_dict(torch.load(best_model))
    
    test_loss, test_acc = test(model, data_test, criterion)
    print("\nLoss in Testset : ",test_loss,"  Accuracy in Testset : ",test_acc,"\n")

In [57]:
print("GPU Model   ",torch.cuda.get_device_name(0))

GPU Model    Tesla T4


In [58]:
class myLSTM(nn.Module):
    def __init__(self, voc, embed_d, hid_d, out_d,dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(voc, embed_d)
        self.rnn = nn.LSTM(embed_d,hid_d,batch_first=True,num_layers=2,bidirectional=True,dropout=dropout)
        self.fc = nn.Linear(hid_d * 2, out_d)
        self.dropout = nn.Dropout(dropout)
    def forward(self, input_text, text_lengths):
                
        embedded = self.dropout(self.embedding(input_text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        rnn_out,(rnn_hid,rnn_cell) = self.rnn(packed_embedded)
        hidden = self.dropout(torch.cat((rnn_hid[-2,:,:], rnn_hid[-1,:,:]), dim=1))
        return self.fc(hidden)

In [59]:
class myGRU(nn.Module):
    def __init__(self, voc, embed_d, hid_d, out_d,dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(voc, embed_d)
        self.rnn = nn.GRU(embed_d,hid_d,num_layers=2,bidirectional=True,dropout=dropout)
        self.fc = nn.Linear(hid_d * 2, out_d)
        self.dropout = nn.Dropout(dropout)
    def forward(self, input_text,text_lengths):
                
        embedded = self.dropout(self.embedding(input_text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        rnn_out,hidden = self.rnn(packed_embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        return self.fc(hidden)

In [60]:
myInput = len(TEXT.vocab)
myEmbed = 300
myHid = 256
myOut = 3
dropout = 0.4

new_model2 = myLSTM(myInput, myEmbed, myHid, myOut,dropout)
new_model2.to(torch.device(device))
new_model3 = myGRU(myInput, myEmbed, myHid, myOut,dropout)
new_model3.to(torch.device(device))

print(new_model2)
print(new_model3)
#torch.save(new_model2.state_dict(), 'model_LSTM_1.pt')

myLSTM(
  (embedding): Embedding(14528, 300)
  (rnn): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)
myGRU(
  (embedding): Embedding(14528, 300)
  (rnn): GRU(300, 256, num_layers=2, dropout=0.4, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)


In [61]:
process_test(new_model2,30,train_it,val_it,test_it,0.001,"lstm1_loss.txt","lstm1_acc.txt","best_LSTM_1_model.pt")

Epoch :  5  Train Loss :  0.3259941427694524   Train Acc :  0.7471277538467856   Valid Loss :  0.30067711481824516  Val Acc :  0.7822265625
Epoch :  10  Train Loss :  0.1789632415097645   Train Acc :  0.8730468714938444   Valid Loss :  0.3545713071856881  Val Acc :  0.8177734375
Epoch :  15  Train Loss :  0.10191365624504055   Train Acc :  0.9342256398761973   Valid Loss :  0.4914697307627648  Val Acc :  0.83984375
Epoch :  20  Train Loss :  0.053707805460469576   Train Acc :  0.9649011913467856   Valid Loss :  0.6354317214019829  Val Acc :  0.851171875
Epoch :  25  Train Loss :  0.0408472887801883   Train Acc :  0.9740923678173738   Valid Loss :  0.7163185386343685  Val Acc :  0.854296875
Epoch :  30  Train Loss :  0.03002985843169667   Train Acc :  0.9800666325232562   Valid Loss :  0.7014689822448418  Val Acc :  0.848828125

Time needed for Training :  1.7363901734352112

Loss in Testset :  0.8401467885289874   Accuracy in Testset :  0.8282538822719029 



In [62]:
process_test(new_model3,30,train_it,val_it,test_it,0.001,"gru1_loss.txt","gru1_acc.txt","best_gru_1_model.pt")

Epoch :  5  Train Loss :  0.330435328240342   Train Acc :  0.7508042244350209   Valid Loss :  0.2921428566798568  Val Acc :  0.801953125
Epoch :  10  Train Loss :  0.20542483986355364   Train Acc :  0.8602941141409033   Valid Loss :  0.3412595229223371  Val Acc :  0.833203125
Epoch :  15  Train Loss :  0.13450773693415718   Train Acc :  0.9131433788467856   Valid Loss :  0.4300518475472927  Val Acc :  0.8350041151046753
Epoch :  20  Train Loss :  0.08223554554997999   Train Acc :  0.9439338200232562   Valid Loss :  0.510126031190157  Val Acc :  0.8504338026046753
Epoch :  25  Train Loss :  0.058786326736855486   Train Acc :  0.9611672759056091   Valid Loss :  0.6737908825278283  Val Acc :  0.8518009901046752
Epoch :  30  Train Loss :  0.03969232612357968   Train Acc :  0.974781706052668   Valid Loss :  0.7627447455619404  Val Acc :  0.84921875

Time needed for Training :  1.4233663082122803

Loss in Testset :  0.6613414511084557   Accuracy in Testset :  0.8308434401239667 

