In [2]:
>>> import nltk
>>> nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import time
import spacy

stemmer = PorterStemmer()
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("GPU Model   ",torch.cuda.get_device_name(0))

GPU Model    Tesla T4


In [4]:
def clean_data(X):
    STOPWORDS = set(stopwords.words('english'))
    X=X.str.lower()
    X=X.str.replace("[/(){}\[\]\|@,;]", " ")
    X=X.str.replace("[^0-9a-z #+_]", " ")
    X = X.str.replace(r'\d+','')
    X = X.apply(lambda x: ' '.join([w for w in str(x).split() if (len(w)>2 and w not in STOPWORDS) ] ))
    X = X.apply(lambda x: x.split()) 
    return X

def target_arrange(y):
    
    for i in range(len(y)):
        if y.values[i]=="Negative":
            y.values[i]=0.0
        elif y[i]=="Positive":
            y.values[i]=1.0
        else:
            y.values[i]=2.0
            
    y=y.to_numpy()  
    y=y.reshape(y.shape[0],1)
    y= pd.DataFrame(data=y)
    y=np.ravel(y)
    y=y.astype('float')
    return y

In [5]:
with open("Tech_news.json","r") as fp:
#with open("General_Market.json",encoding='utf8') as fp:
    json_d = json.load(fp)
 
ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

X= pd.DataFrame(columns=['Date', 'Article','Target'])
X['Date']=pd.to_datetime(df['date'])
X['Article']=df['title']+" "+df['text']
X['Target']=df['sentiment']

X=X.sort_values("Date")

print("Number of Examples : ",len(X),"\n")
X.drop_duplicates(inplace=True)
X.index = range(len(X))
print("Number of Examples after removing duplicates: ",len(X),"\n")

print('Number of words before cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article']=clean_data(X['Article'])
print('Number of words after cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
print("\n******************\n")
#print(X.groupby(['Target']).count())
X['Target']=target_arrange(X['Target'])
X=X.drop('Date',1)

L=[]
L.append((X['Target']==0.0).sum())
L.append((X['Target']==1.0).sum())
L.append((X['Target']==2.0).sum())

print("Negative Examples : ",(X['Target']==0.0).sum())
print("Positive Examples : ",(X['Target']==1.0).sum())
print("Neutral Examples : ",(X['Target']==2.0).sum())

maximum=max(L)

Weights=[]

for i in L:
  Weights.append(maximum/i)
  
class_weights = torch.FloatTensor(Weights).to(device)
print("\n Weights = ",class_weights)


Number of Examples :  15641 

Number of Examples after removing duplicates:  15432 

Number of words before cleaning :  585553
Number of words after cleaning :  393507

******************

Negative Examples :  1892
Positive Examples :  5838
Neutral Examples :  7702

 Weights =  tensor([4.0708, 1.3193, 1.0000], device='cuda:0')


In [6]:
X['Article']= X['Article'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
for i in range(len(X['Article'])): #φέρνω τα tokens ξανά μαζί διαχωριζόμενα με κενά
    X['Article'][i] = ' '.join(X['Article'][i])
print(X['Article'])


0        mercuri system receiv contract award base new ...
1        progress second quarter revenu exce guidanc in...
2        acuiti brand declar quarterli dividend atlanta...
3         share factset soar today earn came better expect
4        stifel say inseego leader inseego corp nasdaq ...
                               ...                        
15427    lead plaintiff deadlin alert faruqi faruqi llp...
15428    taiwan semiconductor stock best chip make game...
15429    twilio stock rock star may done yet twilio sto...
15430    incred stat zoom follow blowout earn report zo...
15431    explor sale xandr advertis unit amid restructu...
Name: Article, Length: 15432, dtype: object


In [7]:
X.to_csv (r'Tech.csv', index = False, header=True)

In [20]:
torch.manual_seed(1234)
torch.backends.cudnn.deterministic = True

batch=256
#TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
TEXT = data.Field(batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.long,batch_first=True)

fields = [('text',TEXT),('label', LABEL)]
Train_Data=data.TabularDataset(path = 'Tech.csv',format = 'csv',fields = fields,skip_header = True)

X_train, X_test = Train_Data.split(split_ratio=0.7, random_state = random.seed(1234))
X_train, X_val = X_train.split(split_ratio=0.8, random_state = random.seed(1234))

TEXT.build_vocab(X_train, min_freq=2)  
LABEL.build_vocab(X_train)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  
train_it, val_it, test_it = data.BucketIterator.splits((X_train, X_val, X_test),sort_key = lambda x: len(x.text),
    sort_within_batch=True,batch_size = batch,device = device)  



Size of TEXT vocabulary: 7604
Size of LABEL vocabulary: 3
[('stock', 3765), ('earn', 3275), ('inc', 2413), ('result', 2250), ('compani', 2174), ('announc', 2041), ('report', 1817), ('quarter', 1812), ('busi', 1726), ('technolog', 1328)]


In [10]:
def find_accuracy(preds, y):
   
    temp = torch.log_softmax(preds, dim = 1)
    _, y_pred = torch.max(temp, dim = 1) 
    valid = (y_pred == y).float() 
    accur = valid.sum() / len(valid)
    return accur

def Loss_Optimizer (model,valueLR):
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=valueLR)
    return optimizer,criterion

In [11]:
def train(model,data,lr,optimizer,criterion):
        
    #optimizer,criterion=Loss_Optimizer (model,lr)
    #criterion = criterion.to(device)
    model.train()
    sumloss=0.0
    sumacc=0.0
    for i in data:
        text, text_lengths = i.text   
        optimizer.zero_grad()
        pred = model(text, text_lengths).squeeze(0)       
        loss = criterion(pred, i.label)
        acc = find_accuracy(pred, i.label)
        loss.backward()
        optimizer.step()
        sumloss += loss.item()
        sumacc += acc.item()
          
    return sumloss / len(data), sumacc / len(data)

In [12]:
def test(model, data, criterion):
    
    sumloss = 0
    sumacc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for i in data:
            text, text_lengths = i.text
            
            pred = model(text, text_lengths).squeeze(0)
           
            loss = criterion(pred, i.label)
            
            acc = find_accuracy(pred, i.label)

            sumloss += loss.item()
            sumacc += acc.item()
        
    return sumloss / len(data), sumacc / len(data)

In [13]:
def process_test(model,numEpochs,data_tr,data_val,data_test,lr,loss_file,acc_file,best_model):

    optimizer,criterion=Loss_Optimizer (model,lr)
    criterion = criterion.to(device)
    start_time = time.time()
    fileout=open(loss_file,"w")
    fileout2=open(acc_file,"w")
    
    for i in range(numEpochs):
 
        train_loss, train_acc = train(model, data_tr, lr,optimizer,criterion)
        valid_loss, valid_acc = test(model, data_val, criterion)
        fileout.write(str(valid_loss)+"\n")
        fileout2.write(str(valid_acc)+"\n")
        if i==0:
            max=valid_acc
            torch.save(model.state_dict(), best_model)
        elif valid_acc > max:
            max = valid_acc
            torch.save(model.state_dict(), best_model)

        if (i+1)%5==0:
          print("Epoch : ",i+1," Train Loss : ",train_loss,"  Train Acc : ",train_acc,"  Valid Loss : ",valid_loss, " Val Acc : ",valid_acc)
    
    end_time = time.time()
    timeHelp=(end_time-start_time)/60.0
    print("\nTime needed for Training : ",timeHelp)

    fileout.close()
    fileout2.close()  
    
   
    model.load_state_dict(torch.load(best_model))
    
    test_loss, test_acc = test(model, data_test, criterion)
    print("\nLoss in Testset : ",test_loss,"  Accuracy in Testset : ",test_acc,"\n")

In [14]:
def process_test2(model,numEpochs,data_tr,data_val,data_test,lr,loss_file,acc_file,best_model):

    optimizer,criterion=Loss_Optimizer (model,lr)
    criterion = criterion.to(device)
    start_time = time.time()
    fileout=open(loss_file,"w")
    fileout2=open(acc_file,"w")
    
    for i in range(numEpochs):
 
        train_loss, train_acc = train(model, data_tr, lr,optimizer,criterion)
        valid_loss, valid_acc = test(model, data_val, criterion)
        fileout.write(str(valid_loss)+"\n")
        fileout2.write(str(valid_acc)+"\n")
        if i==0:
            minloss=valid_loss
            torch.save(model.state_dict(), best_model)
        elif valid_loss < minloss:
            minloss = valid_loss
            torch.save(model.state_dict(), best_model)

        if (i+1)%5==0:
          print("Epoch : ",i+1," Train Loss : ",train_loss,"  Train Acc : ",train_acc,"  Valid Loss : ",valid_loss, " Val Acc : ",valid_acc)
    
    end_time = time.time()
    timeHelp=(end_time-start_time)/60.0
    print("\nTime needed for Training : ",timeHelp)

    fileout.close()
    fileout2.close()  
    
   
    model.load_state_dict(torch.load(best_model))
    
    test_loss, test_acc = test(model, data_test, criterion)
    print("\nLoss in Testset : ",test_loss,"  Accuracy in Testset : ",test_acc,"\n")

In [15]:
print("GPU Model   ",torch.cuda.get_device_name(0))

GPU Model    Tesla T4


In [16]:
class myLSTM(nn.Module):
    def __init__(self, voc, embed_d, hid_d, out_d,dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(voc, embed_d)
        self.rnn = nn.LSTM(embed_d,hid_d,batch_first=True,num_layers=2,bidirectional=True,dropout=dropout)
        self.fc = nn.Linear(hid_d * 2, out_d)
        self.dropout = nn.Dropout(dropout)
    def forward(self, input_text, text_lengths):
                
        embedded = self.dropout(self.embedding(input_text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        rnn_out,(rnn_hid,rnn_cell) = self.rnn(packed_embedded)
        hidden = self.dropout(torch.cat((rnn_hid[-2,:,:], rnn_hid[-1,:,:]), dim=1))
        return self.fc(hidden)

In [17]:
class myGRU(nn.Module):
    def __init__(self, voc, embed_d, hid_d, out_d,dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(voc, embed_d)
        self.rnn = nn.GRU(embed_d,hid_d,num_layers=2,bidirectional=True,dropout=dropout)
        self.fc = nn.Linear(hid_d * 2, out_d)
        self.dropout = nn.Dropout(dropout)
    def forward(self, input_text,text_lengths):
                
        embedded = self.dropout(self.embedding(input_text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        rnn_out,hidden = self.rnn(packed_embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        return self.fc(hidden)

In [21]:
myInput = len(TEXT.vocab)
myEmbed = 300
myHid = 256
myOut = 3
dropout = 0.4

new_model2 = myLSTM(myInput, myEmbed, myHid, myOut,dropout)
new_model2.to(torch.device(device))
new_model3 = myGRU(myInput, myEmbed, myHid, myOut,dropout)
new_model3.to(torch.device(device))

print(new_model2)
print(new_model3)
#torch.save(new_model2.state_dict(), 'model_LSTM_1.pt')

myLSTM(
  (embedding): Embedding(7604, 300)
  (rnn): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)
myGRU(
  (embedding): Embedding(7604, 300)
  (rnn): GRU(300, 256, num_layers=2, dropout=0.4, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)


In [22]:
process_test(new_model2,30,train_it,val_it,test_it,0.001,"lstm1_loss.txt","lstm1_acc.txt","best_LSTM_1_model.pt")

Epoch :  5  Train Loss :  0.2576934501857442   Train Acc :  0.8417968732469222   Valid Loss :  0.43743111689885456  Val Acc :  0.8064236111111112
Epoch :  10  Train Loss :  0.10272306690732126   Train Acc :  0.9443933805998634   Valid Loss :  0.5944082545013063  Val Acc :  0.8093378014034696
Epoch :  15  Train Loss :  0.0417797258718672   Train Acc :  0.9772518364822164   Valid Loss :  0.83327941223979  Val Acc :  0.8097718291812472
Epoch :  20  Train Loss :  0.032135021476064785   Train Acc :  0.9828814320704516   Valid Loss :  0.9649006779719558  Val Acc :  0.8058655791812472
Epoch :  25  Train Loss :  0.01857244676177609   Train Acc :  0.990808821776334   Valid Loss :  1.0699947975162003  Val Acc :  0.818018356959025
Epoch :  30  Train Loss :  0.022855999606476603   Train Acc :  0.9889705864822164   Valid Loss :  0.9904275880609121  Val Acc :  0.8089037736256918

Time needed for Training :  0.7764581680297852

Loss in Testset :  1.0194450191369182   Accuracy in Testset :  0.83333956

In [23]:
process_test(new_model3,30,train_it,val_it,test_it,0.001,"gru1_loss.txt","gru1_acc.txt","best_gru_1_model.pt")

Epoch :  5  Train Loss :  0.2738383040379952   Train Acc :  0.8347886011880987   Valid Loss :  0.39098928175452685  Val Acc :  0.8033854166666666
Epoch :  10  Train Loss :  0.12625998173675992   Train Acc :  0.9303768364822164   Valid Loss :  0.5452661157921992  Val Acc :  0.8090277777777778
Epoch :  15  Train Loss :  0.05979759281720309   Train Acc :  0.9657261494327994   Valid Loss :  0.7841192494185331  Val Acc :  0.8229166666666666
Epoch :  20  Train Loss :  0.04976228163029779   Train Acc :  0.975183821776334   Valid Loss :  0.7608811949224522  Val Acc :  0.8198784722222222
Epoch :  25  Train Loss :  0.028087836438156526   Train Acc :  0.9840303291292751   Valid Loss :  0.9980696097546671  Val Acc :  0.8203125
Epoch :  30  Train Loss :  0.02240768766034093   Train Acc :  0.9883961379528046   Valid Loss :  1.0199691245729303  Val Acc :  0.8203125

Time needed for Training :  0.6738803466161092

Loss in Testset :  0.8052080297646554   Accuracy in Testset :  0.8248355263157895 

