In [2]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import model_selection,metrics,naive_bayes,preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
stemmer = PorterStemmer()
import warnings
warnings.filterwarnings('ignore')

In [3]:
def clean_data(X):
    STOPWORDS = set(stopwords.words('english'))
    X=X.str.lower()
    X=X.str.replace("[/(){}\[\]\|@,;]", " ")
    X=X.str.replace("[^0-9a-z #+_]", " ")
    X = X.str.replace(r'\d+','')
    X = X.apply(lambda x: ' '.join([w for w in str(x).split() if (len(w)>2 and w not in STOPWORDS) ] ))
    X = X.apply(lambda x: x.split()) 
    return X

def target_arrange(y):
    
    for i in range(len(y)):
        if y.values[i]=="Negative":
            y.values[i]=0.0
        elif y[i]=="Positive":
            y.values[i]=1.0
        else:
            y.values[i]=2.0
            
    y=y.to_numpy()  
    y=y.reshape(y.shape[0],1)
    y= pd.DataFrame(data=y)
    y=np.ravel(y)
    y=y.astype('float')
    return y

In [4]:
with open("Tech_news.json","r") as fp:
#with open("General_Market.json",encoding='utf8') as fp:
    json_d = json.load(fp)
 
ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

X= pd.DataFrame(columns=['Date', 'Article','Target'])
X['Date']=pd.to_datetime(df['date'])
X['Article']=df['title']+" "+df['text']
X['Target']=df['sentiment']

X=X.sort_values("Date")
print("Number of Examples : ",len(X),"\n")
X.drop_duplicates(inplace=True)
X.index = range(len(X))
print("Number of Examples after removing duplicates: ",len(X),"\n")
X.to_csv (r'Tech.csv', index = False, header=True)

print('Number of words before cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article']=clean_data(X['Article'])
print('Number of words after cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())

print(X.groupby(['Target']).count())

y=target_arrange(X['Target'])



Number of Examples :  15641 

Number of Examples after removing duplicates:  15432 

Number of words before cleaning :  585553
Number of words after cleaning :  393507
          Date  Article
Target                 
Negative  1892     1892
Neutral   7702     7702
Positive  5838     5838


In [5]:
X['Article']= X['Article'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
for i in range(len(X['Article'])): #φέρνω τα tokens ξανά μαζί διαχωριζόμενα με κενά
    X['Article'][i] = ' '.join(X['Article'][i])

print(X['Date'],"  ",X['Article'])

0       2020-06-25 16:15:00-04:00
1       2020-06-25 16:15:00-04:00
2       2020-06-25 16:15:00-04:00
3       2020-06-25 16:18:00-04:00
4       2020-06-25 16:21:36-04:00
                   ...           
15427   2020-09-01 14:23:00-04:00
15428   2020-09-01 14:27:40-04:00
15429   2020-09-01 14:35:33-04:00
15430   2020-09-01 14:36:34-04:00
15431   2020-09-01 14:40:38-04:00
Name: Date, Length: 15432, dtype: datetime64[ns, pytz.FixedOffset(-240)]    0        mercuri system receiv contract award base new ...
1        progress second quarter revenu exce guidanc in...
2        acuiti brand declar quarterli dividend atlanta...
3         share factset soar today earn came better expect
4        stifel say inseego leader inseego corp nasdaq ...
                               ...                        
15427    lead plaintiff deadlin alert faruqi faruqi llp...
15428    taiwan semiconductor stock best chip make game...
15429    twilio stock rock star may done yet twilio sto...
15430    incred sta

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.9)
Xv = tfidf_vectorizer.fit_transform(X['Article'])
Xv = pd.DataFrame(Xv.todense())
X_train,X_test,y_train,y_test = train_test_split(Xv,y, test_size=0.3,stratify=y)

In [6]:
alpha = 0.1 # This is the smoothing parameter for Laplace/Lidstone smoothing
model = naive_bayes.MultinomialNB(alpha=alpha)

model.fit(X_train,y_train)
y_predicted = model.predict(X_test)


recall = metrics.recall_score(y_test,y_predicted,average='macro')
precision = metrics.precision_score(y_test,y_predicted,average='macro')
f1 = metrics.f1_score(y_test,y_predicted,average='macro')
Accur=metrics.accuracy_score(y_test,y_predicted)

print(' Recall metric:',recall)
print(' F1 metric:',f1)
print(' Precision metric:',precision)
print(' Accuracy metric:',Accur)


 Recall metric: 0.6949830569513687
 F1 metric: 0.7023892187545657
 Precision metric: 0.7208625956083868
 Accuracy metric: 0.7587473002159827


In [7]:
skfold = model_selection.StratifiedKFold(n_splits=5)
model2 = RandomForestClassifier(n_estimators=100, max_features="auto", n_jobs=-1)
results = model_selection.cross_val_score(model2, Xv,y, cv=skfold)
print(results.mean())

0.7857074416348028


In [10]:
model2 = RandomForestClassifier(n_estimators=100, max_features="auto", n_jobs=-1)
model2.fit(X_train,y_train)
y_predicted = model2.predict(X_test)


recall = metrics.recall_score(y_test,y_predicted,average='macro')
precision = metrics.precision_score(y_test,y_predicted,average='macro')
f1 = metrics.f1_score(y_test,y_predicted,average='macro')
Accur=metrics.accuracy_score(y_test,y_predicted)

print(' Recall metric:',recall)
print(' F1 metric:',f1)
print(' Precision metric:',precision)
print(' Accuracy metric:',Accur)

 Recall metric: 0.7137764603299449
 F1 metric: 0.737260315385479
 Precision metric: 0.8085507532283159
 Accuracy metric: 0.8010799136069114
