In [None]:
# Import data into pandas dataframe
import pandas as pd
# Load the pre-processed train and test workbooks (train is read first, then test).
df_train, df_test = map(
    pd.read_excel,
    ('relevance_train_processed.xlsx', 'relevance_test_processed.xlsx'),
)

In [None]:
#Import other relevant libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statistics

In [None]:
#Replace every missing (NaN/None) cell with an empty string so text columns are always str
df_train_str = df_train.mask(pd.isnull(df_train), '')
df_test_str = df_test.mask(pd.isnull(df_test), '')

In [None]:
#This simple benchmark model uses only the 'body' text as its feature
#x = content/features, y = judgement/label
x_train, y_train = df_train_str['body'], df_train_str['judgement']
x_test = df_test_str['body']

#Hold out 20% of the training rows as a trial set; the seed is fixed so the split is repeatable
x_learn, x_trial, y_learn, y_trial_real = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Define function that performs learning
def standard_benchmark (X, Y, tbd):
    """Fit a TF-IDF + logistic-regression benchmark and predict labels for new texts.

    Parameters
    ----------
    X : iterable of str
        Training documents; the TF-IDF vocabulary is learned from these only.
    Y : array-like
        Labels aligned with ``X``.
    tbd : iterable of str
        Documents to label. They are transformed with the vocabulary fitted
        on ``X`` (never re-fitted), so terms unseen in training are ignored.

    Returns
    -------
    The output of ``LogisticRegression.predict`` for ``tbd``: one predicted
    label per input document.
    """
    #Perform token vectorisation on body data
    #min_df must be an int >= 1 (or a float in [0, 1]): an integer 0 is rejected by
    #scikit-learn's parameter validation. Every vocabulary term occurs in at least
    #one document, so min_df=1 keeps exactly the same features min_df=0 used to.
    feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)
    X_features = feature_extraction.fit_transform(X)
    tbd_features = feature_extraction.transform(tbd)

    #Train a logistic regression model (library defaults) and label the unseen documents
    model = LogisticRegression()
    model.fit(X_features, Y)
    result = model.predict(tbd_features)
    return result

In [None]:
#Fit on the learning split and predict labels for the held-out trial split
y_trial_predicted = standard_benchmark(X=x_learn, Y=y_learn, tbd=x_trial)

In [None]:
#Evaluation of the model on the held-out trial split
#accuracy_score is documented as (y_true, y_pred), so the real labels go first
accuracy = accuracy_score(y_trial_real, y_trial_predicted)
print(accuracy)

In [None]:
#Refit on the complete training set and predict labels for the test set
prediction = standard_benchmark(X=x_train, Y=y_train, tbd=x_test)

In [None]:
#Write the test-set predictions to CSV: one row per test id, without the DataFrame index
submission_columns = {'id': df_test_str['id'], 'judgement': prediction}
prediction_pd = pd.DataFrame(submission_columns)
prediction_pd.to_csv('StandardBenchmark.csv', index=False)

In [None]:
#For report purposes run model with multiple seeds
cycles = 10 #number of seeded train/trial splits to evaluate; must be an int >= 1
#NOTE(review): the original comment mentioned "a max iter for sklearn" - unclear which limit was meant; confirm
accuracy_arr = []
for i in range(cycles):
    #Use loop-local names so the random_state=42 split (x_learn, x_trial, ...) and its
    #predictions from the earlier cells are not overwritten by the last seed's split
    x_learn_i, x_trial_i, y_learn_i, y_trial_real_i = train_test_split(x_train, y_train, test_size = 0.2, random_state=i)
    y_trial_predicted_i = standard_benchmark(x_learn_i, y_learn_i, x_trial_i)
    #accuracy_score is documented as (y_true, y_pred), so the real labels go first
    accuracy_arr.append(accuracy_score(y_trial_real_i, y_trial_predicted_i))
    print(i) #progress indicator: seed just completed

#Print accuracy average and (population) standard deviation across the seeds
print('Average Accuracy:', sum(accuracy_arr)/len(accuracy_arr))
print('Standard Deviation:', statistics.pstdev(accuracy_arr))