In [None]:
#libraries
import pandas as pd
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D,Dropout,Flatten, MaxPooling2D, Dense, Softmax, Input, concatenate
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


from numpy.random import seed
seed(0)
#from tensorflow.random import set_seed
#set_seed(0)

In [None]:
# Open a TF1-compatibility session configured with log_device_placement=True,
# which asks TensorFlow to log the device each operation is placed on.
# NOTE(review): `sess` is not referenced by any other cell visible in this
# notebook -- confirm whether this cell is still needed.
import tensorflow.compat.v1 as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [None]:
#loading data
# Positive and negative examples are stored in separate .npy files and are
# stacked positives-first into a single array.
pos = np.load("../processed_final_texts/pos_lemma.npy")
neg = np.load("../processed_final_texts/neg_lemma.npy")
data = np.concatenate([pos,neg])
# Labels follow the concatenation order: 1 for every row of `pos`, 0 for every
# row of `neg`. The counts are taken from the arrays themselves instead of the
# previously hard-coded 719 total / 595 positive, so the labels always line up
# with `data` even if the input files change.
targets = np.concatenate([np.ones(len(pos), dtype=int), np.zeros(len(neg), dtype=int)])

In [None]:
# Load the pickled lemmatized texts.
# NOTE(review): `texts_array` is not referenced by any other cell visible here;
# the runs below re-read their texts from a different pickle path.
# Unpickling executes arbitrary code, so only load files from a trusted source.
texts_array = pd.read_pickle("../processed_final_texts/_lemma.pkl")

In [None]:
#Traditional Kim 2014 (CNN)
def create_CNN(seq_len=397, embed_dim=50, n_filters=100, kernel_heights=(3, 4, 5)):
    """Build the multi-branch sentence CNN (after Kim, 2014).

    One convolution branch is created per entry of ``kernel_heights``. Each
    branch convolves over ``height`` consecutive tokens across the full
    embedding width and is then max-pooled over the whole time axis, so it
    contributes exactly ``n_filters`` features. The branches are concatenated,
    flattened and fed through a 100-unit ReLU layer and a 2-way softmax.

    Parameters
    ----------
    seq_len : int
        Number of token positions per input (default 397).
    embed_dim : int
        Embedding dimensionality per token (default 50).
    n_filters : int
        Number of filters per convolution branch (default 100).
    kernel_heights : sequence of int
        Window sizes, in tokens, of the convolution branches (default 3, 4, 5).

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model mapping ``(seq_len, embed_dim, 1)`` inputs to 2 class
        probabilities.
    """
    inputs = Input(shape=(seq_len, embed_dim, 1))

    branches = []
    for height in kernel_heights:
        # The kernel spans the whole embedding width, so the feature map has
        # shape (seq_len - height + 1, 1, n_filters).
        feature_map = Conv2D(n_filters, kernel_size=(height, embed_dim))(inputs)
        # Max-over-time pooling. The window covers the entire time axis and
        # padding must be 'valid': with padding='same' and stride 1 the output
        # keeps its full length and nothing is pooled away.
        pooled = MaxPooling2D((seq_len - height + 1, 1), strides=(1, 1), padding='valid')(feature_map)
        branches.append(pooled)

    # Guard the single-branch case so the merge never receives a one-element list.
    merged = concatenate(branches, axis=1) if len(branches) > 1 else branches[0]
    merged = Flatten()(merged)

    out = Dense(100, activation='relu')(merged)
    out = Dense(2, activation='softmax')(out)

    return Model(inputs, out)

def process_embedding_data(input_data):
    """Summarise each observation by the max, min and mean of all its values.

    Every observation (one entry along the first axis) is flattened, so the
    statistics are taken over all of its values at once rather than per
    embedding dimension.
    TODO(review): the original author left a note to confirm that this is the
    intended way of aggregating the embeddings.

    Parameters
    ----------
    input_data : numpy.ndarray
        Array whose first axis indexes observations.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_observations, 3)`` with the columns
        ``[max, min, mean]``.
    """
    n_obs = input_data.shape[0]
    values = np.zeros([n_obs, 3])
    if n_obs == 0:
        # reshape(0, -1) is ambiguous for NumPy; nothing to aggregate anyway.
        return values

    # One row per observation; the reductions below run in compiled NumPy code
    # instead of looping over every scalar in Python.
    flat = input_data.reshape(n_obs, -1)
    values[:, 0] = flat.max(axis=1)
    values[:, 1] = flat.min(axis=1)
    # Accumulate in float64 so low-precision inputs do not lose accuracy.
    values[:, 2] = flat.mean(axis=1, dtype=np.float64)

    return values



def KNN_transform(input_data,aggregate_embedding = False):
    """Turn embedding tensors into a 2-D feature matrix.

    Parameters
    ----------
    input_data : numpy.ndarray
        Array whose first axis indexes observations.
    aggregate_embedding : bool
        If true, each observation is reduced to its ``[max, min, mean]`` via
        ``process_embedding_data``. Otherwise each observation is flattened
        into a single row.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per observation.
    """
    if aggregate_embedding:
        return process_embedding_data(input_data)

    n_obs = input_data.shape[0]
    # Derive the row width from the data instead of hard-coding 19850
    # (= 397 * 50), so other sequence lengths / embedding sizes also work.
    n_features = int(np.prod(input_data.shape[1:], dtype=np.int64))
    return input_data.reshape([n_obs, n_features])

def tokenize_texts(x_train,x_test,n_words = 1000):
    """Convert raw texts into dense TF-IDF feature matrices.

    The vectorizer uses unigrams and bigrams, ``min_df = 3`` and at most
    ``n_words`` features. It is fitted on ``x_train`` only and then applied
    unchanged to ``x_test``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_matrix, test_matrix)`` as dense arrays.
    """
    tfidf_model = TfidfVectorizer(max_features = n_words, min_df = 3, ngram_range=(1, 2))

    train_sparse = tfidf_model.fit_transform(x_train)
    test_sparse = tfidf_model.transform(x_test)

    return train_sparse.toarray(), test_sparse.toarray()

def CNN(x_train,x_test,y_train,y_test):
    """Train the CNN on one fold and score the held-out sample.

    Inputs are reshaped to ``(n, 397, 50, 1)``. The model is trained full-batch
    with SGD for at most 30 epochs. Early stopping watches the *training*
    accuracy, because no validation split is used.

    Parameters
    ----------
    x_train, x_test : numpy.ndarray
        Embedding arrays reshapeable to ``(n, 397, 50, 1)``.
    y_train : numpy.ndarray
        Integer class labels (0 or 1) for the training samples.
    y_test : numpy.ndarray
        Label of the held-out sample. It must hold exactly one element because
        ``.item()`` is called on it and on the prediction.

    Returns
    -------
    tuple
        ``(accuracy, actual_label, predicted_label)``.
    """
    opt = SGD(learning_rate=0.0001)
    x_train = x_train.reshape([x_train.shape[0],397,50,1])
    x_test = x_test.reshape([x_test.shape[0],397,50,1])
    # Pin the one-hot width to the 2 output units of the network. Without
    # num_classes the width is inferred from the largest label in this fold
    # and would not match the model if class 1 were absent.
    y_train = to_categorical(y_train, num_classes=2)
    model = create_CNN()
    model.compile(loss="binary_crossentropy",
              optimizer=opt,
              metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size = len(x_train), epochs=30, verbose=2, validation_split = 0.0,
          callbacks = [EarlyStopping(monitor="accuracy",patience = 3, restore_best_weights=True)])
    # argmax over the two softmax outputs gives the predicted class index.
    predicted = model.predict(x_test).argmax(axis = 1)
    accuracy = accuracy_score(y_test, predicted)
    return accuracy, y_test.item(), predicted.item()

def SVM(x_train,x_test,y_train,y_test, tfidf):
    """Fit a linear SVM on one fold and score the held-out sample.

    The ``tfidf`` flag selects the feature pipeline: TF-IDF features built
    from raw texts when true, otherwise the per-observation max/min/mean
    summary of the embedding arrays.

    Returns
    -------
    tuple
        ``(accuracy, actual_label, predicted_label)``. ``.item()`` is called
        on ``y_test`` and on the prediction, so exactly one test sample is
        expected.
    """
    if not tfidf:
        # Embedding input: reduce every observation to three summary statistics.
        x_train = process_embedding_data(x_train)
        x_test = process_embedding_data(x_test)
    else:
        # Text input: vocabulary is learned on the training texts only.
        x_train, x_test = tokenize_texts(x_train,x_test)

    linear_svc = LinearSVC(random_state = 0)
    linear_svc.fit(x_train,y_train)
    predicted = linear_svc.predict(x_test)
    accuracy = accuracy_score(y_test,predicted)

    print("y_test",y_test,"predicted",predicted,"accuracy",accuracy)
    actual_label = y_test.item()
    print(y_test, actual_label)
    return accuracy, actual_label, predicted.item()

def KNN(x_train,x_test,y_train,y_test):
    """Fit a k-nearest-neighbours classifier on one fold and score the held-out sample.

    Both splits are passed through ``KNN_transform`` with aggregation enabled
    before fitting a ``KNeighborsClassifier`` with default settings.

    Returns
    -------
    tuple
        ``(accuracy, actual_label, predicted_label)``. ``.item()`` is called
        on ``y_test`` and on the prediction, so exactly one test sample is
        expected.
    """
    train_features = KNN_transform(x_train, aggregate_embedding = True)
    test_features = KNN_transform(x_test, aggregate_embedding = True)

    neighbours = KNeighborsClassifier()
    neighbours.fit(train_features, y_train)

    predicted = neighbours.predict(test_features)
    accuracy = accuracy_score(y_test,predicted)
    print("y_test",y_test,"predicted",predicted,"accuracy",accuracy)

    return accuracy, y_test.item(), predicted.item()

def NB(x_train,x_test,y_train,y_test,tfidf = True):
    """Fit multinomial naive Bayes on TF-IDF features and score the held-out sample.

    ``tfidf`` is accepted so the signature lines up with the other model
    functions, but it is not read: the texts are always vectorised with
    ``tokenize_texts``.

    Returns
    -------
    tuple
        ``(accuracy, actual_label, predicted_label)``. ``.item()`` is called
        on ``y_test`` and on the prediction, so exactly one test sample is
        expected.
    """
    train_tfidf, test_tfidf = tokenize_texts(x_train,x_test)

    bayes = MultinomialNB()
    fitted = bayes.fit(train_tfidf, y_train)

    predicted = fitted.predict(test_tfidf)
    score = accuracy_score(y_test, predicted)
    return score, y_test.item(), predicted.item()

##
def choose_model(x_train,x_test,y_train,y_test,model,tfidf = False):
    """Run one train/test fold with the model selected by name.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Fold data, forwarded unchanged to the selected model function.
    model : str
        One of ``"CNN"``, ``"SVM"``, ``"KNN"`` or ``"NB"``.
    tfidf : bool
        Forwarded to ``SVM`` and ``NB``; not used by ``CNN`` or ``KNN``.

    Returns
    -------
    tuple
        ``(accuracy, actual_label, predicted_label)`` from the selected model.

    Raises
    ------
    ValueError
        If ``model`` is not a known name. Previously this printed a message
        and returned ``None``, which made the caller fail with an unrelated
        unpacking error.
    """
    if model == "CNN":
        return CNN(x_train,x_test,y_train,y_test)
    if model == "SVM":
        return SVM(x_train,x_test,y_train,y_test,tfidf = tfidf)
    if model == "KNN":
        return KNN(x_train,x_test,y_train,y_test)
    if model == "NB":
        return NB(x_train,x_test,y_train,y_test,tfidf = tfidf)
    raise ValueError(
        "model not known: {!r} (expected one of 'CNN', 'SVM', 'KNN', 'NB')".format(model)
    )

In [None]:
#Function for leave-one-out testing
def robust_accuracy(data,targets,model,texts = None,upsample = False, tfidf = False):
    """Evaluate a model with leave-one-out cross-validation.

    Parameters
    ----------
    data : numpy.ndarray
        Feature array indexed by observation. Ignored when ``texts`` is given.
    targets : numpy.ndarray
        Integer labels aligned with the observations.
    model : str
        Model name understood by ``choose_model``.
    texts : sequence, optional
        Raw texts. When provided and non-empty they replace ``data`` and are
        handed to the model as Python lists.
    upsample : bool
        If true, two extra copies of every class-0 training sample are
        appended to each training fold.
    tfidf : bool
        Forwarded to ``choose_model``.

    Returns
    -------
    tuple
        ``(test_accuracy, all_accuracies, actual_list, predicted_list)``:
        the mean accuracy over folds, then the per-fold accuracies, actual
        labels and predicted labels.
    """
    import statistics as stat

    # A bare `if texts:` raises for pandas / NumPy containers because their
    # truth value is ambiguous. Check for presence and length explicitly.
    use_texts = texts is not None and len(texts) > 0
    if use_texts:
        data = np.array(texts)

    all_accuracies = []
    actual_list = []
    predicted_list = []
    crossVal = LeaveOneOut()

    for train_index, test_index in crossVal.split(data):
        x_train, x_test = data[train_index], data[test_index]
        y_train, y_test = targets[train_index], targets[test_index]
        if upsample:
            # The mask is taken once from the original fold, so exactly two
            # extra copies of each class-0 sample are added.
            is_class_zero = y_train == 0
            x_to_add = x_train[is_class_zero]
            y_to_add = y_train[is_class_zero]
            x_train = np.concatenate([x_train, x_to_add, x_to_add], axis = 0)
            y_train = np.concatenate([y_train, y_to_add, y_to_add], axis = 0)
        if use_texts:
            x_train = x_train.tolist()
            x_test = x_test.tolist()
        accuracy, actual, predicted = choose_model(x_train,x_test,y_train,y_test,model,tfidf = tfidf)

        all_accuracies.append(accuracy)
        actual_list.append(actual)
        predicted_list.append(predicted)
        print("index",test_index,"accuracy",accuracy)

    # Computed once after the loop; recomputing the running mean on every fold
    # made the evaluation quadratic in the number of folds.
    test_accuracy = stat.mean(all_accuracies)
    return test_accuracy, all_accuracies, actual_list, predicted_list

# Leave-one-out evaluation of the KNN model on the embedding arrays, without
# upsampling. `results` is the tuple returned by robust_accuracy:
# (test_accuracy, all_accuracies, actual_list, predicted_list).
results = robust_accuracy(data,
                          targets,
                          #texts = pd.read_pickle("../preprocessed_texts_lemma.pkl"), 
                          model = "KNN",
                          tfidf = False,
                          upsample = False)

# results[2] holds the actual labels and results[3] the predicted labels.
print(classification_report(results[2], results[3], digits=3))

In [None]:
import pickle
# Persist the KNN results tuple. The context manager guarantees the file
# handle is flushed and closed even if pickling fails part-way.
with open("../results/bech_res_KNN_lemma_sent.pkl","wb") as results_file:
    pickle.dump(results, results_file)

In [None]:
# Leave-one-out evaluation of the linear SVM on TF-IDF features with class-0
# upsampling. Because `texts` is supplied, robust_accuracy replaces `data`
# with the texts internally; `data` is passed only to fill the positional slot.
# Unpickling executes arbitrary code, so only load files from a trusted source.
results1 = robust_accuracy(data,
                          targets,
                          texts = pd.read_pickle("../preprocessed_texts_lemma.pkl"), 
                          model = "SVM",
                          tfidf = True,
                          upsample = True)


In [None]:
# Persist the SVM results tuple. The context manager guarantees the file
# handle is flushed and closed even if pickling fails part-way.
with open("../results/SVM1000_lemma_upsample.pkl","wb") as results_file:
    pickle.dump(results1, results_file)