In [1]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import model_selection,metrics,naive_bayes,preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
stemmer = PorterStemmer()
import warnings
warnings.filterwarnings('ignore')

In [2]:
def clean_data(X):
    """Normalise raw article text and tokenise it.

    Steps, in order:
      1. lower-case the text;
      2. replace bracket/punctuation characters with a space;
      3. replace every character outside ``0-9a-z #+_`` with a space;
      4. delete runs of digits (removed outright, not replaced by a space,
         so a token such as "covid19" becomes "covid");
      5. split on whitespace and keep only tokens longer than two characters
         that are not NLTK English stop words.

    Parameters
    ----------
    X : pandas.Series of str
        One document per element.

    Returns
    -------
    pandas.Series of list of str
        The surviving tokens of each document, in original order.
    """
    stop_words = set(stopwords.words('english'))

    X = X.str.lower()
    # regex=True is spelled out: pandas >= 2.0 defaults to literal matching,
    # which would silently leave the text uncleaned. Raw strings avoid
    # invalid-escape warnings for "\[" and "\|".
    X = X.str.replace(r"[/(){}\[\]\|@,;]", " ", regex=True)
    X = X.str.replace(r"[^0-9a-z #+_]", " ", regex=True)
    X = X.str.replace(r"\d+", "", regex=True)

    # Filtering happens after digit removal so that tokens shortened by it
    # (e.g. "q2" -> "q") are dropped by the length check.
    return X.apply(
        lambda text: [w for w in text.split() if len(w) > 2 and w not in stop_words]
    )

def target_arrange(y):
    """Encode sentiment labels as a 1-D float array.

    Mapping: ``"Negative"`` -> 0.0, ``"Positive"`` -> 1.0, and every other
    value -> 2.0.

    The encoding is purely positional, so it works for any index on ``y``
    (the previous implementation mixed positional ``y.values[i]`` with
    label-based ``y[i]``). The input is not modified; the previous
    implementation overwrote the caller's labels in place.

    Parameters
    ----------
    y : pandas.Series (or any iterable) of labels

    Returns
    -------
    numpy.ndarray
        Shape ``(len(y),)``, dtype float.
    """
    codes = {"Negative": 0.0, "Positive": 1.0}
    # The isinstance guard keeps non-string entries (NaN, None, ...) out of
    # the dict lookup and sends them to the catch-all class 2.0.
    return np.array(
        [codes.get(label, 2.0) if isinstance(label, str) else 2.0
         for label in pd.Series(y)],
        dtype=float,
    )

In [3]:
# Load the raw news feed. The encoding is pinned to UTF-8 so that parsing does
# not depend on the platform's default text encoding.
# (Alternative input used previously: "General_Market.json".)
with open("All_Tickers.json", "r", encoding="utf-8") as fp:
    json_d = json.load(fp)

ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

# One row per article: timestamp, headline + body text, sentiment label.
X = pd.DataFrame(columns=['Date', 'Article', 'Target'])
X['Date'] = pd.to_datetime(df['date'])
X['Article'] = df['title'] + " " + df['text']
X['Target'] = df['sentiment']

X = X.sort_values("Date")

print("Number of Examples : ", len(X), "\n")
# Drop exact duplicate rows and renumber 0..n-1 in one non-mutating step.
X = X.drop_duplicates().reset_index(drop=True)
print("Number of Examples after removing duplicates: ", len(X), "\n")

X.to_csv(r'General.csv', index=False, header=True)

print('Number of words before cleaning : ', X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article'] = clean_data(X['Article'])
# clean_data returns one token list per article, so count the list items
# directly instead of splitting the list's string representation (which
# reports 1 for an empty list).
print('Number of words after cleaning : ', X['Article'].apply(len).sum())

print(X.groupby(['Target']).count(), "\n")
print(X['Date'])
y = target_arrange(X['Target'])



Number of Examples :  24030 

Number of Examples after removing duplicates:  23867 

Number of words before cleaning :  910682
Number of words after cleaning :  612873
           Date  Article
Target                  
Negative   2640     2640
Neutral   12762    12762
Positive   8465     8465 

0       2020-08-07 08:00:00-04:00
1       2020-08-07 08:00:00-04:00
2       2020-08-07 08:00:00-04:00
3       2020-08-07 08:00:00-04:00
4       2020-08-07 08:00:00-04:00
                   ...           
23862   2020-09-01 14:51:53-04:00
23863   2020-09-01 14:52:59-04:00
23864   2020-09-01 14:54:43-04:00
23865   2020-09-01 14:58:39-04:00
23866   2020-09-01 14:59:18-04:00
Name: Date, Length: 23867, dtype: datetime64[ns, pytz.FixedOffset(-240)]


In [4]:
# Stem every token, then join the tokens of each article back into a single
# space-separated string. Done in one vectorised pass instead of assigning
# row by row through chained indexing (X['Article'][i] = ...), which raises
# SettingWithCopyWarning and does not write back under pandas copy-on-write.
X['Article'] = X['Article'].apply(
    lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
)

print(X['Article'])

0        vianet group inc announc unaudit second quarte...
1        bionano genom report second quarter financi re...
2        pyxi tanker announc date releas second quarter...
3        intellig system announc new board member norcr...
4        krato present canaccord virtual growth confer ...
                               ...                        
23862    grab slice pie resurg emerg market etf emerg m...
23863    chipotl stock jump toward straight record wedb...
23864    seattl base big fish game lay peopl read memo ...
23865    molson coor steal stock valu illog take advant...
23866    sylvania still look cheap palladium mine resta...
Name: Article, Length: 23867, dtype: object


In [5]:
# TF-IDF features; terms appearing in more than 90% of the articles are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9)
Xv = tfidf_vectorizer.fit_transform(X['Article'])
# NOTE(review): the vectorizer is fitted on all articles before the split, so
# IDF weights are influenced by the test rows - consider fitting on the
# training part only (e.g. via a Pipeline).
# NOTE(review): densifying costs n_articles x vocabulary floats of memory; the
# matrix is kept as a DataFrame here so existing downstream code keeps working.
# toarray() yields a plain ndarray rather than the discouraged np.matrix.
Xv = pd.DataFrame(Xv.toarray())
# Fixed random_state so the stratified 70/30 split (and every metric computed
# from it) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    Xv, y, test_size=0.3, stratify=y, random_state=42
)

In [6]:
# Multinomial Naive Bayes baseline.
alpha = 0.1  # additive (Laplace/Lidstone) smoothing strength
model = naive_bayes.MultinomialNB(alpha=alpha)
model.fit(X_train, y_train)

y_predicted = model.predict(X_test)

# Macro-averaged scores weight the three sentiment classes equally.
recall, precision, f1 = (
    scorer(y_test, y_predicted, average='macro')
    for scorer in (metrics.recall_score, metrics.precision_score, metrics.f1_score)
)
Accur = metrics.accuracy_score(y_test, y_predicted)

for label, value in (
    (' Recall metric:', recall),
    (' F1 metric:', f1),
    (' Precision metric:', precision),
    (' Accuracy metric:', Accur),
):
    print(label, value)


 Recall metric: 0.6803532643151754
 F1 metric: 0.6921713760677429
 Precision metric: 0.7291425271175639
 Accuracy metric: 0.7644183773216031


In [7]:
# 5-fold stratified cross-validation of a random forest on the full feature matrix.
skfold = model_selection.StratifiedKFold(n_splits=5)
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3;
# "sqrt" is the value it stood for on classifiers. random_state makes the
# forest (and therefore the reported mean accuracy) reproducible.
model2 = RandomForestClassifier(
    n_estimators=100, max_features="sqrt", n_jobs=-1, random_state=42
)
results = model_selection.cross_val_score(model2, Xv, y, cv=skfold)
print(results.mean())

0.7991770143308028


In [8]:
# Random forest evaluated on the hold-out split.
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3;
# "sqrt" is the value it stood for on classifiers. random_state makes the
# fitted forest and the metrics below reproducible.
model2 = RandomForestClassifier(
    n_estimators=100, max_features="sqrt", n_jobs=-1, random_state=42
)
model2.fit(X_train, y_train)
y_predicted = model2.predict(X_test)

# Macro averaging weights the three sentiment classes equally.
recall = metrics.recall_score(y_test, y_predicted, average='macro')
precision = metrics.precision_score(y_test, y_predicted, average='macro')
f1 = metrics.f1_score(y_test, y_predicted, average='macro')
Accur = metrics.accuracy_score(y_test, y_predicted)

print(' Recall metric:', recall)
print(' F1 metric:', f1)
print(' Precision metric:', precision)
print(' Accuracy metric:', Accur)

 Recall metric: 0.7011104090909471
 F1 metric: 0.7218918934432083
 Precision metric: 0.8110436742344805
 Accuracy metric: 0.807149839407904
