In [1]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [2]:
def read_csv(file_name, encoding=None):
    """Read a text file and return its lines as a list of strings.

    Despite the name, no CSV parsing is performed: every element of the
    returned list is one raw line of the file, including its trailing
    newline character.

    Parameters
    ----------
    file_name : str
        Path of the file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's default encoding; pass e.g. ``'utf-8'`` to make the
        result independent of the machine's locale.

    Returns
    -------
    list of str
        The lines of the file, in file order.
    """
    # errors='ignore' drops bytes that cannot be decoded instead of raising.
    with open(file_name, encoding=encoding, errors='ignore') as file:
        return file.readlines()

In [3]:
# Load the training file; read_csv returns the file's raw lines (one string per line).
data = read_csv('agr_en_train.csv')

In [4]:
# Shuffle in place with a dedicated, fixed-seed generator so the fold
# assignment (and therefore every score computed from it) is identical on
# each fresh run of the notebook. The global `random` state is left untouched.
random.Random(42).shuffle(data)

In [67]:
def preprocess_data_SVM(data):
    """Separate each raw line into its label and its text.

    For every string in ``data`` the first character is taken as the label
    and everything from index 2 onward as the text; the character at index 1
    is skipped (presumably the delimiter -- verify against the input file).

    Returns a ``(labels, texts)`` tuple of two lists, both in input order.
    """
    pairs = [(line[0], line[2:]) for line in data]
    labels = [first_char for first_char, _ in pairs]
    texts = [remainder for _, remainder in pairs]
    return (labels, texts)

In [68]:
def createTestAndTrainPortion(x, split_len, data, k=10):
    """Split ``data`` into the test and training portions for fold ``x``.

    Parameters
    ----------
    x : int
        1-based number of the current fold.
    split_len : int
        Number of items in a regular fold.
    data : list
        The full dataset; it is only sliced, never modified.
    k : int, optional
        Total number of folds (default 10). Fold ``k`` is the last fold.

    Returns
    -------
    (test_set, train_set) : tuple of list
        ``test_set`` holds the items of fold ``x``; ``train_set`` holds every
        other item, in their original order.
    """
    # Fold x starts right after the (x - 1) folds that precede it.
    start = split_len * (x - 1)

    # The last fold runs to the end of the data so that the remainder left
    # over when len(data) is not an exact multiple of split_len is not lost.
    end = len(data) if x == k else split_len * x

    test_set = data[start:end]
    # Everything before and after the test slice forms the training set.
    train_set = data[:start] + data[end:]

    return test_set, train_set

In [69]:
# Number of cross-validation folds.
k = 10

# Size of one fold: the dataset length divided by k, fractional part discarded.
split_len = len(data) // k


In [70]:
# One entry is appended to each list per fold; they are averaged afterwards.
precision, recall, f1, accuracy = [], [], [], []

In [71]:
# Empty the score lists first so that re-running this cell does not stack a
# second set of fold results on top of the previous run.
for _scores in (precision, recall, f1, accuracy):
    _scores.clear()

# Fix the class order once for all folds. Without an explicit `labels`
# argument the per-class arrays follow whichever labels happen to occur in
# that fold, so the same index could refer to different classes across folds.
class_labels = sorted(set(preprocess_data_SVM(data)[0]))

for i in range(k):
    x=i+1#i is 0..k-1, the split function expects 1..k

    #create different test & train portions for each fold
    test_set, train_set = createTestAndTrainPortion(x, split_len,data)
    
    
    train_labels, train_Text = preprocess_data_SVM(train_set)
    test_labels, test_Text = preprocess_data_SVM(test_set)

    #Feature Extraction/Vectorisation
    #the vectorizer is fitted on the training text only and then reused,
    #unchanged, to transform the test text
    vectorizer = TfidfVectorizer(min_df=1,sublinear_tf=True)
    
    train_vectors = vectorizer.fit_transform(train_Text)
    test_vectors = vectorizer.transform(test_Text)
    
    #SVM Linear Kernel 
    classifier = svm.SVC(kernel = 'linear')
    classifier.fit(train_vectors, train_labels)#train classifier on train data
    prediction = classifier.predict(test_vectors)#predict test data
    
    #get classification results/evaluate model
    #report is (precision, recall, f-score, support), each with one value per
    #entry of class_labels, in that order
    report = metrics.precision_recall_fscore_support(test_labels, prediction, labels=class_labels)
    report_accuracy = accuracy_score(test_labels, prediction)

    #save the per-fold results to lists to later get the k-fold average
    precision.append(report[0])
    recall.append(report[1])
    f1.append(report[2])
    accuracy.append(report_accuracy)


In [72]:
def getAverageResults(report_list):
    """Average a per-class metric over all folds and over three classes.

    Each entry of ``report_list`` is indexed at positions 0, 1 and 2 (one
    score per class for a single fold); any further positions are ignored.
    Every class is first averaged across the folds, and the three class
    averages are then averaged with equal weight.

    Raises ``ZeroDivisionError`` when ``report_list`` is empty.
    """
    class_totals = [0, 0, 0]
    for fold_scores in report_list:
        for position in range(3):
            class_totals[position] += fold_scores[position]

    fold_count = len(report_list)
    class_means = [total / fold_count for total in class_totals]

    return (class_means[0] + class_means[1] + class_means[2]) / 3

def getAccuracyAverage(report_list):
    """Return the mean of the per-fold accuracy scores.

    Accuracy is a single number per fold rather than one value per label, so
    the entries of ``report_list`` are added up directly and divided by the
    number of entries.

    Raises ``ZeroDivisionError`` when ``report_list`` is empty.
    """
    running_sum = 0
    for fold_accuracy in report_list:
        running_sum += fold_accuracy

    return running_sum / len(report_list)


In [73]:
# Collapse the per-fold results into one number per metric.
# getAverageResults averages each of the first three classes across folds and
# then averages those three class means; getAccuracyAverage is a plain mean
# of the per-fold accuracies.
precision_result = getAverageResults(precision)
recall_result = getAverageResults(recall)
f_score_result = getAverageResults(f1)
accuracy_result = getAccuracyAverage(accuracy)
# print() separates its arguments with a space, hence the double space after
# each colon in the output.
print("Precision: ", precision_result, "\nRecall: ", recall_result, "\nF-score: ", f_score_result)
print("Accuracy: ", accuracy_result)

Precision:  0.5848482184532858 
Recall:  0.5450336657929572 
F-score:  0.5253640150052398
Accuracy:  0.8335845862090636
