In [10]:
import pandas as pd
import json
import numpy as np
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import model_selection,metrics,naive_bayes,preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
stemmer = PorterStemmer()
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
mypca = PCA(0.95)
lda = LinearDiscriminantAnalysis(n_components=2)

In [11]:
def clean_data(X):
    STOPWORDS = set(stopwords.words('english'))
    X=X.str.lower()
    X=X.str.replace("[/(){}\[\]\|@,;]", " ")
    X=X.str.replace("[^0-9a-z #+_]", " ")
    X = X.str.replace(r'\d+','')
    X = X.apply(lambda x: ' '.join([w for w in str(x).split() if (len(w)>2 and w not in STOPWORDS) ] ))
    X = X.apply(lambda x: x.split()) 
    return X

def target_arrange(y):
    
    for i in range(len(y)):
        if y.values[i]=="Negative":
            y.values[i]=0.0
        elif y[i]=="Positive":
            y.values[i]=1.0
        else:
            y.values[i]=2.0
            
    y=y.to_numpy()  
    y=y.reshape(y.shape[0],1)
    y= pd.DataFrame(data=y)
    y=np.ravel(y)
    y=y.astype('float')
    return y

def select_n_components(ratio, goal):
        # Set initial variance explained so far
        s=0.0 
        # Set initial number of features
        num_components = 0

        for i in ratio:
            s += i
            num_components += 1
            if s >= goal:
                break

        # Return the number of components
        return num_components

In [12]:
with open("All_Tickers.json","r") as fp:
#with open("General_Market.json",encoding='utf8') as fp:
    json_d = json.load(fp)
 
ticks_d = json_d['data']
df = pd.DataFrame(ticks_d)

X= pd.DataFrame(columns=['Date', 'Article','Target'])
X['Date']=pd.to_datetime(df['date'])
X['Article']=df['title']+" "+df['text']
X['Target']=df['sentiment']

X=X.sort_values("Date")
print("Number of Examples : ",len(X),"\n")
X.drop_duplicates(inplace=True)
X.index = range(len(X))
print("Number of Examples after removing duplicates: ",len(X),"\n")
X.to_csv (r'General_pca.csv', index = False, header=True)

print('Number of words before cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())
X['Article']=clean_data(X['Article'])
print('Number of words after cleaning : ',X['Article'].apply(lambda x: len(str(x).split(' '))).sum())

print(X.groupby(['Target']).count())
print(X['Date'])
y=target_arrange(X['Target'])



Number of Examples :  28130 

Number of Examples after removing duplicates:  27937 

Number of words before cleaning :  1067277
Number of words after cleaning :  718391
           Date  Article
Target                  
Negative   3143     3143
Neutral   14893    14893
Positive   9901     9901
0       2020-08-07 08:00:00-04:00
1       2020-08-07 08:00:00-04:00
2       2020-08-07 08:00:00-04:00
3       2020-08-07 08:00:00-04:00
4       2020-08-07 08:00:00-04:00
                   ...           
27932   2020-09-04 14:33:55-04:00
27933   2020-09-04 14:34:56-04:00
27934   2020-09-04 14:35:00-04:00
27935   2020-09-04 14:35:12-04:00
27936   2020-09-04 14:39:51-04:00
Name: Date, Length: 27937, dtype: datetime64[ns, pytz.FixedOffset(-240)]


In [13]:
X['Article']= X['Article'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
for i in range(len(X['Article'])): #φέρνω τα tokens ξανά μαζί διαχωριζόμενα με κενά
    X['Article'][i] = ' '.join(X['Article'][i])

print(X['Article'])

0        vianet group inc announc unaudit second quarte...
1        krato present canaccord virtual growth confer ...
2        rewalk robot report second quarter financi res...
3        pyxi tanker announc date releas second quarter...
4        bionano genom report second quarter financi re...
                               ...                        
27932    facebook block new polit ad may fall short sti...
27933    amazon growth problem buy good good enough amz...
27934    pyrogenesi sign contract navi two ship build p...
27935    guardant health high price lot potenti guardan...
27936    oil futur post first weekli fall week oil futu...
Name: Article, Length: 27937, dtype: object


In [14]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.9)
Xv = tfidf_vectorizer.fit_transform(X['Article'])
Xv = pd.DataFrame(Xv.todense())
X_train,X_test,y_train,y_test = train_test_split(Xv,y, test_size=0.25,stratify=y)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = mypca.fit_transform(X_train)
X_test = mypca.transform(X_test)

X_lda = lda.fit(X_train, y_train)

X_train=lda.transform(X_train)
X_test=lda.transform(X_test)


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.decomposition import KernelPCA

#mykpca=KernelPCA(kernel="rbf")
#X_train_rbf1=mykpca.fit_transform(X_train)
#X_test_rbf1=mykpca.transform(X_test)

tuned_parameters = [
  {'C': [0.01,0.03, 0.1, 1, 10, 100 ], 'gamma': [5, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001], 'kernel': ['rbf']},
  
 ]

grid = GridSearchCV(SVC(class_weight='balanced', decision_function_shape='ovo'), tuned_parameters, n_jobs=-1, refit = True, cv=5) 
grid.fit(X_train, y_train) 
print(grid.best_params_) 
grid_predictions = grid.predict(X_test) 
recall = metrics.recall_score(y_test,grid_predictions,average='macro')
precision = metrics.precision_score(y_test,grid_predictions,average='macro')
f1 = metrics.f1_score(y_test,grid_predictions,average='macro')
Accur=metrics.accuracy_score(y_test,grid_predictions)

print(' Recall metric:',recall)
print(' F1 metric:',f1)
print(' Precision metric:',precision)
print(' Accuracy metric:',Accur)

{'C': 1, 'gamma': 1e-05, 'kernel': 'rbf'}
 Recall metric: 0.7574434759557872
 F1 metric: 0.7499986320958186
 Precision metric: 0.7444729742826985
 Accuracy metric: 0.7909806728704366
