In [1]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import model_selection,metrics,naive_bayes,preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# One Porter stemmer instance shared by the whole notebook (used by the
# stemming cell further down). PorterStemmer comes from the wildcard import of
# nltk.stem.porter above.
stemmer = PorterStemmer()
import warnings
# Installs an "ignore" filter with no category or module restriction, i.e. it
# applies to every warning raised after this point.
# NOTE(review): this also hides deprecation / FutureWarning messages from
# pandas and scikit-learn - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
def clean_data(X, stop_words=None):
    """Normalise raw article text and split it into filtered tokens.

    The following steps are applied, in order, to every element of ``X``:

    1. lower-case the text;
    2. replace each of ``/ ( ) { } [ ] | @ , ;`` with a space;
    3. replace every remaining character outside ``0-9 a-z space # + _``
       with a space;
    4. delete runs of digits;
    5. split on whitespace and keep only tokens longer than two characters
       that are not stop words.

    Parameters
    ----------
    X : pandas.Series
        Series of text values.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, NLTK's English stop-word list is
        used (``stopwords.words('english')``).

    Returns
    -------
    pandas.Series
        Series with the same index as ``X`` whose values are lists of tokens.

    Notes
    -----
    Missing values survive the ``.str`` steps as NaN and are then passed
    through ``str()``, so they end up as the single token ``'nan'``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # regex=True must be explicit: since pandas 2.0 ``Series.str.replace``
    # treats the pattern as a literal string by default, which would turn all
    # three substitutions below into no-ops. Raw strings keep the backslash
    # escapes intact for the regex engine.
    X = X.str.lower()
    X = X.str.replace(r"[/(){}\[\]\|@,;]", " ", regex=True)
    X = X.str.replace(r"[^0-9a-z #+_]", " ", regex=True)
    X = X.str.replace(r"\d+", "", regex=True)

    # Tokens never contain whitespace, so filtering the split result directly
    # yields the same lists as joining and splitting again.
    return X.apply(
        lambda text: [w for w in str(text).split() if len(w) > 2 and w not in stop_words]
    )

def target_arrange(y):
    """Encode sentiment labels as a one-dimensional float array.

    Mapping: ``"Negative"`` -> 0.0, ``"Positive"`` -> 1.0, any other value
    -> 2.0.

    The encoding is positional and vectorised, so it does not depend on the
    index labels of ``y``, and the input is left unmodified (the labels are
    not overwritten with their numeric codes).

    Parameters
    ----------
    y : pandas.Series or array-like
        Sentiment labels.

    Returns
    -------
    numpy.ndarray
        1-D ``float`` array with one code per element of ``y``.
    """
    # dtype=object keeps the comparison element-wise whatever the input dtype.
    labels = np.asarray(y, dtype=object)
    codes = np.select(
        [labels == "Negative", labels == "Positive"],
        [0.0, 1.0],
        default=2.0,
    )
    return codes.astype('float')

In [3]:
# Input data set. Use "General_Market.json" instead for the general-market
# articles.
NEWS_JSON = "Tech_news.json"

# JSON is read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(NEWS_JSON, "r", encoding="utf8") as fp:
    json_d = json.load(fp)

ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

# One row per article: timestamp, title + body text, sentiment label.
X = pd.DataFrame({
    'Date': pd.to_datetime(df['date']),
    'Article': df['title'] + " " + df['text'],
    'Target': df['sentiment'],
})

X = X.sort_values("Date")
print("Number of Examples : ",len(X),"\n")
# Drop exact duplicate rows and renumber the index 0..n-1.
X = X.drop_duplicates().reset_index(drop=True)
print("Number of Examples after removing duplicates: ",len(X),"\n")
X.to_csv(r'Tech.csv', index=False, header=True)

print('Number of words before cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article'] = clean_data(X['Article'])
# After cleaning each value is a list of tokens, so count the tokens directly
# (splitting the list's repr on spaces counted an empty list as one word).
print('Number of words after cleaning : ',X['Article'].apply(len).sum())

print(X.groupby(['Target']).count())

y = target_arrange(X['Target'])



Number of Examples :  16651 

Number of Examples after removing duplicates:  16413 

Number of words before cleaning :  619298
Number of words after cleaning :  416411
          Date  Article
Target                 
Negative  2040     2040
Neutral   8178     8178
Positive  6195     6195


In [4]:
# Stem every token, then join each article's tokens back into one
# space-separated string (the form the TF-IDF vectoriser below consumes).
# This is done as a single column-level assignment: the earlier per-row
# ``X['Article'][i] = ...`` was chained-indexing assignment, which pandas warns
# about and which does not write back to ``X`` when copy-on-write is enabled.
# It also required the index to be exactly 0..n-1.
X['Article'] = X['Article'].apply(
    lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
)

print(X['Date'],"  ",X['Article'])

0       2020-06-25 16:15:00-04:00
1       2020-06-25 16:15:00-04:00
2       2020-06-25 16:15:00-04:00
3       2020-06-25 16:18:00-04:00
4       2020-06-25 16:21:36-04:00
                   ...           
16408   2020-09-04 14:03:00-04:00
16409   2020-09-04 14:03:00-04:00
16410   2020-09-04 14:19:00-04:00
16411   2020-09-04 14:33:00-04:00
16412   2020-09-04 14:33:55-04:00
Name: Date, Length: 16413, dtype: datetime64[ns, pytz.FixedOffset(-240)]    0        acuiti brand declar quarterli dividend atlanta...
1        progress second quarter revenu exce guidanc in...
2        mercuri system receiv contract award base new ...
3         share factset soar today earn came better expect
4        stifel say inseego leader inseego corp nasdaq ...
                               ...                        
16408    adob stock fall today investor take profit mad...
16409    dow jone drop point appl salesforc stock head ...
16410    solar power stock crash friday solar hot suddenli
16411    rosen top 

In [5]:
# TF-IDF features from the stemmed articles; max_df=0.9 drops terms that occur
# in more than 90% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9)
Xv = tfidf_vectorizer.fit_transform(X['Article'])
# Dense frame kept for the cells below. toarray() returns a plain ndarray,
# whereas todense() returns the legacy np.matrix type.
Xv = pd.DataFrame(Xv.toarray())
# NOTE(review): the vectoriser is fitted on all articles before the split, so
# the held-out documents contribute to the vocabulary and IDF weights.
# random_state pins the stratified 75/25 split so the metrics reported below
# are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Xv, y, test_size=0.25, stratify=y, random_state=42
)

In [6]:
# Multinomial Naive Bayes: fit on the training split, score on the held-out
# split.
alpha = 0.1  # additive (Laplace/Lidstone) smoothing parameter
model = naive_bayes.MultinomialNB(alpha=alpha)
model.fit(X_train, y_train)

y_predicted = model.predict(X_test)

# Recall, precision and F1 are macro-averaged, i.e. the unweighted mean of the
# per-class scores.
macro = {'average': 'macro'}
recall = metrics.recall_score(y_test, y_predicted, **macro)
precision = metrics.precision_score(y_test, y_predicted, **macro)
f1 = metrics.f1_score(y_test, y_predicted, **macro)
Accur = metrics.accuracy_score(y_test, y_predicted)

for caption, score in (
    (' Recall metric:', recall),
    (' F1 metric:', f1),
    (' Precision metric:', precision),
    (' Accuracy metric:', Accur),
):
    print(caption, score)


 Recall metric: 0.704794073043521
 F1 metric: 0.7104724016741927
 Precision metric: 0.727225350061271
 Accuracy metric: 0.7655945419103314


In [7]:
# 5-fold stratified cross-validation of a random forest on the full TF-IDF
# matrix; prints the mean fold score.
skfold = model_selection.StratifiedKFold(n_splits=5)
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
# random_state makes the forest (and therefore the score) reproducible.
model2 = RandomForestClassifier(
    n_estimators=100, max_features="sqrt", n_jobs=-1, random_state=42
)
results = model_selection.cross_val_score(model2, Xv, y, cv=skfold)
print(results.mean())

0.7847433540798786


In [8]:
# Random forest: fit on the training split, score on the held-out split.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
# random_state makes the forest (and therefore the metrics) reproducible.
model2 = RandomForestClassifier(
    n_estimators=100, max_features="sqrt", n_jobs=-1, random_state=42
)
model2.fit(X_train, y_train)
y_predicted = model2.predict(X_test)

# Recall, precision and F1 are macro-averaged (unweighted mean over classes).
recall = metrics.recall_score(y_test, y_predicted, average='macro')
precision = metrics.precision_score(y_test, y_predicted, average='macro')
f1 = metrics.f1_score(y_test, y_predicted, average='macro')
Accur = metrics.accuracy_score(y_test, y_predicted)

print(' Recall metric:', recall)
print(' F1 metric:', f1)
print(' Precision metric:', precision)
print(' Accuracy metric:', Accur)

 Recall metric: 0.7194772234280703
 F1 metric: 0.7447079035266074
 Precision metric: 0.8272809286265966
 Accuracy metric: 0.8070175438596491
