In [None]:
# Load the pre-processed train and test spreadsheets as pandas DataFrames
import pandas as pd
df_train, df_test = (
    pd.read_excel(workbook)
    for workbook in ('relevance_train_processed.xlsx', 'relevance_test_processed.xlsx')
)

In [None]:
#Import other relevant libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statistics
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [None]:
#Swap every missing (NaN/None) cell for an empty string, leaving other cells as they are
df_train_str = df_train.mask(df_train.isna(), '')
df_test_str = df_test.mask(df_test.isna(), '')

In [None]:
#Features (x) are author, topic, body and title; the label (y) is the judgement column
y_train = df_train_str['judgement']
x_train = df_train_str.loc[:, ['author', 'topic_id', 'body', 'title']]
x_test = df_test_str.loc[:, ['author', 'topic_id', 'body', 'title']]

#Hold out 20% of the training rows as a trial set for evaluating the model (fixed seed for repeatability)
x_learn, x_trial, y_learn, y_trial_real = train_test_split(
    x_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

In [None]:
#Define the function that creates the neural network and performs training
def _as_documents(data):
    """Collapse each row of `data` into one whitespace-joined string.

    TfidfVectorizer expects an iterable of strings. Iterating a DataFrame
    directly yields its column labels rather than its rows, so multi-column
    input has to be flattened to one document per row first.
    """
    if isinstance(data, pd.DataFrame):
        # object cast first so numeric columns (e.g. topic_id) can hold '' for missing cells
        filled = data.astype(object).where(data.notna(), '')
        return filled.astype(str).agg(' '.join, axis=1).tolist()
    series = pd.Series(data).astype(object)
    return series.where(series.notna(), '').astype(str).tolist()


def threeneural(X, Y, tbd, dim=64, optimiser='adam', loss='binary_crossentropy', epochs=10,
                max_features=1000):
    """Train a small feed-forward network on TF-IDF features and label `tbd`.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or iterable
        Training rows. All columns of a DataFrame are cast to str and joined
        with spaces into a single document per row.
    Y : array-like
        One numeric binary label (0/1) per row of `X`.
    tbd : same layout as `X`
        Rows "to be determined", i.e. the rows to predict labels for.
    dim : int, default 64
        Number of units in the hidden Dense layer.
        NOTE(review): this parameter used to be required but unused and no
        caller passed it; confirm that hidden-layer width is the intended meaning.
    optimiser : str or Keras optimizer, default 'adam'
        Passed to `model.compile(optimizer=...)`.
    loss : str or Keras loss, default 'binary_crossentropy'
        Passed to `model.compile(loss=...)`.
    epochs : int, default 10
        Number of training epochs.
    max_features : int or None, default 1000
        Vocabulary cap handed to TfidfVectorizer. The features are converted
        to a dense array for Keras, so this bounds the memory used.

    Returns
    -------
    numpy.ndarray of shape (len(tbd),), dtype int
        Predicted class per row: 1 where the sigmoid output exceeds 0.5, else 0.
    """
    #Perform token vectorisation on data (one document per row)
    learn_docs = _as_documents(X)
    tbd_docs = _as_documents(tbd)
    # min_df must be >= 1 when given as an int
    feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True,
                                         max_features=max_features)
    # Keras does not consume scipy sparse matrices, hence the dense float32 arrays
    X_features = feature_extraction.fit_transform(learn_docs).astype('float32').toarray()
    tbd_features = feature_extraction.transform(tbd_docs).astype('float32').toarray()
    labels = np.asarray(Y, dtype='float32').ravel()

    #Model architecture <-- Need to experiment with this
    # TF-IDF rows are real-valued vectors, so they feed Dense layers directly;
    # an Embedding layer would require integer token indices instead.
    model = Sequential([
        tf.keras.Input(shape=(X_features.shape[1],)),
        Dense(dim, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

    #Model compilation
    model.compile(optimizer=optimiser, loss=loss)

    #Train neural network model
    model.fit(X_features, labels, epochs=epochs, batch_size=64)

    # predict() yields (n, 1) probabilities; callers need flat 0/1 labels
    probabilities = model.predict(tbd_features)
    result = (np.ravel(probabilities) > 0.5).astype(int)
    return result
    

In [None]:
#Fit on the learning split and predict the held-out trial split, for evaluation
y_trial_predicted = threeneural(X=x_learn, Y=y_learn, tbd=x_trial)

In [None]:
#Evaluation of the model: fraction of trial rows whose predicted label matches the real one
# accuracy_score's signature is (y_true, y_pred); keywords keep the roles explicit
accuracy = accuracy_score(y_true=y_trial_real, y_pred=y_trial_predicted)
print(accuracy)


In [None]:
#Fit on the full training set and predict labels for the test set
prediction = threeneural(X=x_train, Y=y_train, tbd=x_test)


In [None]:
#Pair each test id with its predicted judgement and write the result to CSV (no index column)
prediction_pd = df_test_str[['id']].assign(judgement=prediction)
prediction_pd.to_csv('ThreeNeural.csv', index=False)


In [None]:
#For report purposes run model with multiple seeds
cycles = 100 #number of random learn/trial splits to evaluate (any positive int)
accuracy_arr = []
for seed in range(cycles):
    # Loop-local names so the seed-42 split and its predictions defined in the
    # earlier cells are not silently overwritten by the last iteration here.
    x_learn_i, x_trial_i, y_learn_i, y_trial_real_i = train_test_split(
        x_train, y_train, test_size=0.2, random_state=seed)
    y_trial_predicted_i = threeneural(x_learn_i, y_learn_i, x_trial_i)
    # accuracy_score's signature is (y_true, y_pred)
    accuracy_arr.append(float(accuracy_score(y_trial_real_i, y_trial_predicted_i)))

#Print accuracy average and standard deviation (population std over the evaluated seeds)
print('Average Accuracy:', statistics.mean(accuracy_arr))
print('Standard Deviation:', statistics.pstdev(accuracy_arr))
