In [1]:
import nltk

# Fetch the NLTK 'stopwords' corpus; clean_data reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import time
import spacy

# Porter stemmer shared by the stemming cell further down.
stemmer = PorterStemmer()
import warnings
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Pick the device first, and only query the GPU name when CUDA is actually
# present: torch.cuda.get_device_name(0) raises on a CPU-only machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU Model   ",torch.cuda.get_device_name(0))
else:
    print("GPU Model    not available, running on CPU")

GPU Model    Tesla P4


In [3]:
def clean_data(X):
    """Normalise a Series of raw article strings into lists of tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace the punctuation ``/ ( ) { } [ ] | @ , ;`` with a space;
      3. replace every character that is not ``0-9``, ``a-z``, space, ``#``,
         ``+`` or ``_`` with a space;
      4. delete runs of digits;
      5. split on whitespace and keep only tokens longer than two characters
         that are not English stop words.

    Parameters
    ----------
    X : pandas.Series of str

    Returns
    -------
    pandas.Series
        Same index as ``X``; each element is a list of the surviving tokens.
    """
    STOPWORDS = set(stopwords.words('english'))
    X = X.str.lower()
    # regex=True is passed explicitly: pandas >= 2.0 defaults to literal
    # matching, which would silently leave the text uncleaned.
    X = X.str.replace(r"[/(){}\[\]\|@,;]", " ", regex=True)
    X = X.str.replace(r"[^0-9a-z #+_]", " ", regex=True)
    X = X.str.replace(r"\d+", "", regex=True)
    # Build the token list directly instead of joining and re-splitting.
    X = X.apply(lambda text: [w for w in text.split() if len(w) > 2 and w not in STOPWORDS])
    return X

def target_arrange(y):
    """Encode sentiment labels as floats.

    ``"Negative"`` -> 0.0, ``"Positive"`` -> 1.0, anything else -> 2.0.

    The encoding is computed positionally, so it does not depend on the index
    of ``y``, and the input is left unmodified (the previous implementation
    overwrote ``y.values`` in place and looked up ``y[i]`` by label).

    Parameters
    ----------
    y : pandas.Series or array-like of labels

    Returns
    -------
    numpy.ndarray
        1-D float array with one code per input label, in input order.
    """
    # dtype=object keeps the comparisons element-wise for any input type.
    labels = np.asarray(y, dtype=object).ravel()
    encoded = np.full(labels.shape[0], 2.0, dtype='float')
    encoded[labels == "Negative"] = 0.0
    encoded[labels == "Positive"] = 1.0
    return encoded

In [4]:
# Load the raw records; the list of articles sits under the top-level "data" key.
# Alternative input: open("General_Market.json", encoding='utf8')
with open("All_Tickers.json","r") as fp:
    json_d = json.load(fp)

ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

# Working frame: publication date, title and body merged into one text, label.
X = pd.DataFrame({
    'Date': pd.to_datetime(df['date']),
    'Article': df['title']+" "+df['text'],
    'Target': df['sentiment'],
})
X = X.sort_values("Date")

print("Number of Examples : ",len(X),"\n")
# Drop exact duplicate rows and renumber 0..n-1.
X = X.drop_duplicates().reset_index(drop=True)
print("Number of Examples after removing duplicates: ",len(X),"\n")

print('Number of words before cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article']=clean_data(X['Article'])
# After cleaning each article is a list of tokens, so count the tokens
# directly (splitting the list's repr counted an empty list as one word).
print('Number of words after cleaning : ',X['Article'].apply(len).sum())
print("\n******************\n")

X['Target']=target_arrange(X['Target'])
# Keyword form: the positional axis argument was removed in pandas 2.0.
X = X.drop(columns='Date')

# Class counts in code order: 0.0 Negative, 1.0 Positive, 2.0 Neutral.
L = [(X['Target']==code).sum() for code in (0.0, 1.0, 2.0)]

print("Negative Examples : ",L[0])
print("Positive Examples : ",L[1])
print("Neutral Examples : ",L[2])

# Weight of each class = size of the largest class / size of this class.
maximum = max(L)
Weights = [maximum/count for count in L]

# NOTE(review): class_weights is not passed to any loss in the visible cells
# (Loss_Optimizer builds an unweighted CrossEntropyLoss). If it is wired in
# later, confirm LABEL.vocab index order matches [Negative, Positive, Neutral].
class_weights = torch.FloatTensor(Weights).to(device)
print("\n Weights = ",class_weights)

Number of Examples :  24030 

Number of Examples after removing duplicates:  23867 

Number of words before cleaning :  910682
Number of words after cleaning :  612873

******************

Negative Examples :  2640
Positive Examples :  8465
Neutral Examples :  12762

 Weights =  tensor([4.8341, 1.5076, 1.0000], device='cuda:0')


In [5]:
# Stem every token, then join the tokens of each article back into a single
# space-separated string. Done in one column assignment: the previous
# element-wise X['Article'][i] = ... was chained assignment, which pandas
# warns about and which has no effect under copy-on-write.
X['Article'] = X['Article'].apply(lambda tokens: ' '.join(stemmer.stem(tok) for tok in tokens))
print(X['Article'])


0        vianet group inc announc unaudit second quarte...
1        bionano genom report second quarter financi re...
2        pyxi tanker announc date releas second quarter...
3        intellig system announc new board member norcr...
4        krato present canaccord virtual growth confer ...
                               ...                        
23862    grab slice pie resurg emerg market etf emerg m...
23863    chipotl stock jump toward straight record wedb...
23864    seattl base big fish game lay peopl read memo ...
23865    molson coor steal stock valu illog take advant...
23866    sylvania still look cheap palladium mine resta...
Name: Article, Length: 23867, dtype: object


In [6]:
# Persist the prepared frame (with a header row, without the index) so that
# torchtext's TabularDataset can read it back in the next cell.
X.to_csv('General_rnn.csv', header=True, index=False)

In [7]:
torch.manual_seed(1234)
torch.backends.cudnn.deterministic = True

batch=1024
#TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
# include_lengths=True makes each batch's .text a (tokens, lengths) pair,
# which train()/test() unpack.
TEXT = data.Field(batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.long,batch_first=True)

# CSV columns are read positionally: first column -> text, second -> label.
fields = [('text',TEXT),('label', LABEL)]
Train_Data=data.TabularDataset(path = 'General_rnn.csv',format = 'csv',fields = fields,skip_header = True)

# random.seed() returns None, so `random_state=random.seed(1234)` passed None
# and was reproducible only through the seeding side effect. Seed once and
# hand the resulting generator state to split() explicitly.
# NOTE(review): presumably split() expects a random.getstate() tuple -- confirm
# against the installed torchtext version.
random.seed(1234)
split_state = random.getstate()

# 70% train+val / 30% test, then 80% / 20% of the first part for train / val.
X_train, X_test = Train_Data.split(split_ratio=0.7, random_state = split_state)
X_train, X_val = X_train.split(split_ratio=0.8, random_state = split_state)

# Vocabularies are built from the training split only; tokens seen fewer than
# two times are left out of the text vocabulary.
TEXT.build_vocab(X_train, min_freq=2)
LABEL.build_vocab(X_train)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))

# Batches are sorted by text length within each batch, as required by
# pack_padded_sequence in the models.
train_it, val_it, test_it = data.BucketIterator.splits((X_train, X_val, X_test),sort_key = lambda x: len(x.text),
    sort_within_batch=True,batch_size = batch,device = device)



Size of TEXT vocabulary: 13043
Size of LABEL vocabulary: 3
[('stock', 5352), ('earn', 4718), ('inc', 4126), ('compani', 3759), ('announc', 3574), ('result', 3558), ('report', 2534), ('busi', 2310), ('new', 2168), ('investor', 2120)]


In [8]:
def find_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring class equals the target.

    preds : tensor of per-class scores, one row per example (classes on dim 1).
    y     : tensor of target class indices, one per row of ``preds``.

    Returns a 0-dim float tensor.
    """
    log_probs = torch.log_softmax(preds, dim=1)
    predicted = torch.max(log_probs, dim=1).indices
    hits = (predicted == y).float()
    return hits.sum() / hits.shape[0]

def Loss_Optimizer (model,valueLR):
    """Build the training objects for ``model``.

    Returns a pair ``(optimizer, criterion)``: an Adam optimizer over all of
    the model's parameters with learning rate ``valueLR``, and an unweighted
    cross-entropy loss.
    """
    adam = torch.optim.Adam(model.parameters(), lr=valueLR)
    cross_entropy = nn.CrossEntropyLoss()
    return adam, cross_entropy

In [9]:
def train(model,data,lr,optimizer,criterion):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    model     : module called as ``model(text, text_lengths)`` returning class scores.
    data      : sized iterable of batches; each batch exposes ``.text`` as a
                ``(tokens, lengths)`` pair and ``.label`` as the targets.
    lr        : unused here (the optimizer already carries its learning rate);
                kept so existing callers keep working.
    optimizer : optimizer stepped once per batch.
    criterion : loss called as ``criterion(scores, labels)``.

    Both returned values are averages over the number of batches.
    """
    model.train()
    sumloss=0.0
    sumacc=0.0
    for batch in data:
        text, text_lengths = batch.text
        optimizer.zero_grad()
        pred = model(text, text_lengths)
        # Only drop a leading singleton dimension from 3-D output. An
        # unconditional squeeze(0) also collapsed the batch dimension of a
        # (1, classes) result, i.e. whenever a batch held a single example.
        if pred.dim() > 2:
            pred = pred.squeeze(0)
        loss = criterion(pred, batch.label)
        acc = find_accuracy(pred, batch.label)
        loss.backward()
        optimizer.step()
        sumloss += loss.item()
        sumacc += acc.item()

    return sumloss / len(data), sumacc / len(data)

In [10]:
def test(model, data, criterion):
    """Evaluate ``model`` on ``data`` and return ``(mean_loss, mean_accuracy)``.

    The model is put in eval mode and gradients are disabled. ``data`` is a
    sized iterable of batches exposing ``.text`` as a ``(tokens, lengths)``
    pair and ``.label`` as the targets. Both returned values are averages over
    the number of batches.
    """
    sumloss = 0
    sumacc = 0

    model.eval()

    with torch.no_grad():
        for batch in data:
            text, text_lengths = batch.text
            pred = model(text, text_lengths)
            # Only drop a leading singleton dimension from 3-D output. An
            # unconditional squeeze(0) also collapsed the batch dimension of
            # a (1, classes) result, i.e. a batch holding a single example.
            if pred.dim() > 2:
                pred = pred.squeeze(0)

            loss = criterion(pred, batch.label)
            acc = find_accuracy(pred, batch.label)

            sumloss += loss.item()
            sumacc += acc.item()

    return sumloss / len(data), sumacc / len(data)

In [11]:
def process_test(model,numEpochs,data_tr,data_val,data_test,lr,loss_file,acc_file,best_model):
    """Train ``model``, keep the best-validation checkpoint, then score the test set.

    model       : network to train.
    numEpochs   : number of training epochs.
    data_tr, data_val, data_test : batch iterators for the three splits.
    lr          : learning rate passed to Loss_Optimizer.
    loss_file   : path; receives one validation loss per line, one line per epoch.
    acc_file    : path; receives one validation accuracy per line, one line per epoch.
    best_model  : path where the state_dict with the highest validation
                  accuracy so far is saved.

    Progress is printed every fifth epoch. After training, the saved
    checkpoint is loaded back into ``model`` and its test loss and accuracy
    are printed. Returns None.
    """
    optimizer,criterion=Loss_Optimizer (model,lr)
    criterion = criterion.to(device)
    best_acc = None  # highest validation accuracy seen so far
    start_time = time.time()

    # Context managers close both logs even if an epoch raises.
    with open(loss_file,"w") as fileout, open(acc_file,"w") as fileout2:
        for i in range(numEpochs):

            train_loss, train_acc = train(model, data_tr, lr,optimizer,criterion)
            valid_loss, valid_acc = test(model, data_val, criterion)
            fileout.write(str(valid_loss)+"\n")
            fileout2.write(str(valid_acc)+"\n")

            # The first epoch always checkpoints; later ones only on a strict improvement.
            if best_acc is None or valid_acc > best_acc:
                best_acc = valid_acc
                torch.save(model.state_dict(), best_model)

            if (i+1)%5==0:
                print("Epoch : ",i+1," Train Loss : ",train_loss,"  Train Acc : ",train_acc,"  Valid Loss : ",valid_loss, " Val Acc : ",valid_acc)

        end_time = time.time()

    timeHelp=(end_time-start_time)/60.0  # minutes
    print("\nTime needed for Training : ",timeHelp)

    # Evaluate the best checkpoint rather than the final-epoch weights.
    model.load_state_dict(torch.load(best_model))

    test_loss, test_acc = test(model, data_test, criterion)
    print("\nLoss in Testset : ",test_loss,"  Accuracy in Testset : ",test_acc,"\n")

In [12]:
# Report the GPU in use; guarded because get_device_name raises without CUDA.
if torch.cuda.is_available():
    print("GPU Model   ",torch.cuda.get_device_name(0))
else:
    print("GPU Model    not available, running on CPU")

GPU Model    Tesla P4


In [13]:
class myLSTM(nn.Module):
    """Two-layer bidirectional LSTM text classifier.

    Token ids are embedded, passed through dropout, packed by length and fed
    to the LSTM. The last layer's final forward and backward hidden states are
    concatenated, passed through dropout and projected to ``out_d`` scores.

    voc     : number of rows in the embedding table.
    embed_d : embedding size.
    hid_d   : LSTM hidden size per direction.
    out_d   : number of output scores.
    dropout : dropout probability, used between LSTM layers and on the
              embeddings and the concatenated hidden state.
    """

    def __init__(self, voc, embed_d, hid_d, out_d,dropout):

        super().__init__()

        self.embedding = nn.Embedding(voc, embed_d)
        self.rnn = nn.LSTM(embed_d,hid_d,batch_first=True,num_layers=2,bidirectional=True,dropout=dropout)
        # hid_d * 2: forward and backward final states are concatenated.
        self.fc = nn.Linear(hid_d * 2, out_d)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_text, text_lengths):
        """Return a (batch, out_d) tensor of scores.

        input_text   : (batch, seq) tensor of token ids, rows ordered by
                       decreasing length (pack_padded_sequence's default).
        text_lengths : length of each row, as a tensor or a list.
        """
        embedded = self.dropout(self.embedding(input_text))
        # pack_padded_sequence needs the lengths on the CPU in newer PyTorch,
        # while the batch iterator may deliver them on the GPU.
        lengths_cpu = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths_cpu,batch_first=True)
        rnn_out,(rnn_hid,rnn_cell) = self.rnn(packed_embedded)
        # rnn_hid[-2] / rnn_hid[-1]: last layer, forward / backward direction.
        hidden = self.dropout(torch.cat((rnn_hid[-2,:,:], rnn_hid[-1,:,:]), dim=1))
        return self.fc(hidden)

In [14]:
class myGRU(nn.Module):
    """Two-layer bidirectional GRU text classifier.

    Token ids are embedded, passed through dropout, packed by length and fed
    to the GRU. The last layer's final forward and backward hidden states are
    concatenated, passed through dropout and projected to ``out_d`` scores.

    voc     : number of rows in the embedding table.
    embed_d : embedding size.
    hid_d   : GRU hidden size per direction.
    out_d   : number of output scores.
    dropout : dropout probability, used between GRU layers and on the
              embeddings and the concatenated hidden state.
    """

    def __init__(self, voc, embed_d, hid_d, out_d,dropout):

        super().__init__()

        self.embedding = nn.Embedding(voc, embed_d)
        # batch_first=True matches myLSTM and the batch-first input layout.
        # The GRU only ever receives a PackedSequence here, so the flag does
        # not change the computation.
        self.rnn = nn.GRU(embed_d,hid_d,batch_first=True,num_layers=2,bidirectional=True,dropout=dropout)
        # hid_d * 2: forward and backward final states are concatenated.
        self.fc = nn.Linear(hid_d * 2, out_d)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_text,text_lengths):
        """Return a (batch, out_d) tensor of scores.

        input_text   : (batch, seq) tensor of token ids, rows ordered by
                       decreasing length (pack_padded_sequence's default).
        text_lengths : length of each row, as a tensor or a list.
        """
        embedded = self.dropout(self.embedding(input_text))
        # pack_padded_sequence needs the lengths on the CPU in newer PyTorch,
        # while the batch iterator may deliver them on the GPU.
        lengths_cpu = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths_cpu,batch_first=True)
        rnn_out,hidden = self.rnn(packed_embedded)
        # hidden[-2] / hidden[-1]: last layer, forward / backward direction.
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        return self.fc(hidden)

In [17]:
# Hyper-parameters shared by both recurrent classifiers.
myInput = len(TEXT.vocab)   # one embedding row per vocabulary entry
myEmbed = 300               # embedding size
myHid = 256                 # hidden size per direction
myOut = 3                   # number of output scores (one per sentiment class)
dropout = 0.5

# Build the LSTM first and the GRU second, moving each to the selected device.
new_model2 = myLSTM(myInput, myEmbed, myHid, myOut, dropout)
new_model2.to(device)
new_model3 = myGRU(myInput, myEmbed, myHid, myOut, dropout)
new_model3.to(device)

# Show both architectures.
print(new_model2)
print(new_model3)
#torch.save(new_model2.state_dict(), 'model_LSTM_1.pt')

myLSTM(
  (embedding): Embedding(13043, 300)
  (rnn): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
myGRU(
  (embedding): Embedding(13043, 300)
  (rnn): GRU(300, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [18]:
# Train the LSTM for 30 epochs at lr=0.001. Per-epoch validation loss and
# accuracy are written to lstm1_loss.txt / lstm1_acc.txt, and the checkpoint
# with the best validation accuracy (best_LSTM_1_model.pt) is scored on the test set.
process_test(new_model2,30,train_it,val_it,test_it,0.001,"lstm1_loss.txt","lstm1_acc.txt","best_LSTM_1_model.pt")

Epoch :  5  Train Loss :  0.5368909218481609   Train Acc :  0.7792271205357143   Valid Loss :  0.45904945954680443  Val Acc :  0.8040331304073334
Epoch :  10  Train Loss :  0.3272989824015115   Train Acc :  0.8643973214285714   Valid Loss :  0.4453469328582287  Val Acc :  0.8385123163461685
Epoch :  15  Train Loss :  0.21343472559237853   Train Acc :  0.9167131696428571   Valid Loss :  0.6048370450735092  Val Acc :  0.8456867784261703
Epoch :  20  Train Loss :  0.13702156721878314   Train Acc :  0.9461495535714286   Valid Loss :  0.6705305650830269  Val Acc :  0.8456867784261703
Epoch :  25  Train Loss :  0.09885719674951231   Train Acc :  0.96240234375   Valid Loss :  0.7458628192543983  Val Acc :  0.8529638051986694
Epoch :  30  Train Loss :  0.07908210468095993   Train Acc :  0.9715401785714286   Valid Loss :  0.7690333873033524  Val Acc :  0.8504280149936676

Time needed for Training :  1.5869518240292868

Loss in Testset :  0.9190350472927094   Accuracy in Testset :  0.82442745992

In [19]:
# Train the GRU for 30 epochs at lr=0.001. Per-epoch validation loss and
# accuracy are written to gru1_loss.txt / gru1_acc.txt, and the checkpoint
# with the best validation accuracy (best_gru_1_model.pt) is scored on the test set.
process_test(new_model3,30,train_it,val_it,test_it,0.001,"gru1_loss.txt","gru1_acc.txt","best_gru_1_model.pt")

Epoch :  5  Train Loss :  0.49289161073310034   Train Acc :  0.79296875   Valid Loss :  0.46295440196990967  Val Acc :  0.8070572018623352
Epoch :  10  Train Loss :  0.33121166195321294   Train Acc :  0.8638392857142857   Valid Loss :  0.45033938623964787  Val Acc :  0.8353775143623352
Epoch :  15  Train Loss :  0.24201753609980056   Train Acc :  0.9031110491071429   Valid Loss :  0.5072396621108055  Val Acc :  0.8452928960323334
Epoch :  20  Train Loss :  0.17347335571581166   Train Acc :  0.9331752232142857   Valid Loss :  0.6225523538887501  Val Acc :  0.8498371690511703
Epoch :  25  Train Loss :  0.1284193864516315   Train Acc :  0.94873046875   Valid Loss :  0.7187322080135345  Val Acc :  0.8504198491573334
Epoch :  30  Train Loss :  0.10712693845354286   Train Acc :  0.9580775669642857   Valid Loss :  0.8060244657099247  Val Acc :  0.8415835946798325

Time needed for Training :  1.410451642672221

Loss in Testset :  0.6587474857057843   Accuracy in Testset :  0.8198192715644836 
