## This notebook generates the QWK scores for sets 1 to 8 using the feature CSV files found in the 'features' directory

Note that these QWK scores do not yet include domain adaptation.

All implementations are taken from maes.ipynb.

## Importing Libraries

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm #SVR is in SVM
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

## Run Naive Bayes Model

In [129]:
def qwk_nb(x_train, x_test, y_train, y_test):
    """Fit a multinomial Naive Bayes classifier and score it with QWK.

    The feature matrices are handed to the model exactly as received; no
    scaling or other preprocessing is applied here.

    Parameters
    ----------
    x_train, x_test
        Feature matrices used for fitting and for prediction respectively.
    y_train
        Training targets; flattened with ``.ravel()`` before fitting.
    y_test
        Reference targets the predictions are compared against.

    Returns
    -------
    tuple
        ``(score, predictions)`` where ``score`` is the quadratic-weighted
        Cohen's kappa between ``y_test`` and the model's predictions.
    """
    # Train the classifier on the untouched training data
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(x_train, y_train.ravel())

    # Predict labels for the held-out rows
    predictions = classifier.predict(x_test)

    # Agreement between reference and predicted scores
    kappa = cohen_kappa_score(y_test, predictions, weights="quadratic")
    return kappa, predictions

## Run SVM Model

In [136]:
def qwk_svm(x_train, x_test, y_train, y_test):
    """Fit an RBF-kernel SVR on standardised data and score it with QWK.

    Features and targets are standardised with scalers fitted on the
    training split only. Predictions are mapped back to the original target
    scale and rounded before scoring.

    Parameters
    ----------
    x_train, x_test
        Feature matrices used for fitting and for prediction respectively.
    y_train
        Training targets as a 2-D column (it is passed straight to
        ``StandardScaler.fit_transform``).
    y_test
        Reference targets the rounded predictions are compared against.

    Returns
    -------
    tuple
        ``(score, predictions)`` where ``predictions`` is a 1-D array of
        rounded scores and ``score`` is the quadratic-weighted Cohen's kappa.
    """
    # Preprocess: scalers are fitted on the training split only
    sc_Xsvm = StandardScaler()
    sc_ysvm = StandardScaler()
    x_trainSVM = sc_Xsvm.fit_transform(x_train)
    y_trainSVM = sc_ysvm.fit_transform(y_train)
    x_testSVM = sc_Xsvm.transform(x_test)

    # Fit the model (verbose=True keeps the LibSVM progress output)
    model_svm = svm.SVR(kernel='rbf', gamma='auto', verbose=True)
    model_svm.fit(x_trainSVM, y_trainSVM.ravel())

    # Get predicted scores.
    # predict() returns a 1-D array, but the scaler was fitted on a 2-D
    # column and recent scikit-learn rejects 1-D input to inverse_transform,
    # so reshape to a column first and flatten the result again.
    y_predSVM = model_svm.predict(x_testSVM)
    y_predSVM = sc_ysvm.inverse_transform(y_predSVM.reshape(-1, 1)).ravel().round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predSVM, weights="quadratic")
    return score, y_predSVM

## Run BLRR Model

In [143]:
def qwk_blrr(x_train, x_test, y_train, y_test):
    """Fit a Bayesian ridge regressor on standardised data and score it with QWK.

    Features and targets are standardised with scalers fitted on the
    training split only. Predictions are mapped back to the original target
    scale and rounded before scoring.

    Parameters
    ----------
    x_train, x_test
        Feature matrices used for fitting and for prediction respectively.
    y_train
        Training targets as a 2-D column (it is passed straight to
        ``StandardScaler.fit_transform``).
    y_test
        Reference targets the rounded predictions are compared against.

    Returns
    -------
    tuple
        ``(score, predictions)`` where ``predictions`` is a 1-D array of
        rounded scores and ``score`` is the quadratic-weighted Cohen's kappa.
    """
    # Preprocess: scalers are fitted on the training split only
    sc_Xblrr = StandardScaler()
    sc_yblrr = StandardScaler()
    x_trainBLRR = sc_Xblrr.fit_transform(x_train)
    y_trainBLRR = sc_yblrr.fit_transform(y_train)
    x_testBLRR = sc_Xblrr.transform(x_test)

    # Fit the model
    model_blrr = linear_model.BayesianRidge()
    model_blrr.fit(x_trainBLRR, y_trainBLRR.ravel())

    # Get predicted scores.
    # predict() returns a 1-D array, but the scaler was fitted on a 2-D
    # column and recent scikit-learn rejects 1-D input to inverse_transform,
    # so reshape to a column first and flatten the result again.
    y_predBLRR = model_blrr.predict(x_testBLRR)
    y_predBLRR = sc_yblrr.inverse_transform(y_predBLRR.reshape(-1, 1)).ravel().round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predBLRR, weights="quadratic")
    return score, y_predBLRR

## Run Ensemble Model

In [138]:
def qwk_ensemble(y_test, y_predNB, y_predSVM, y_predBLRR):
    """Combine the three models' predictions by voting and score with QWK.

    For each row the NB prediction is kept when it equals the BLRR or the
    SVM prediction; otherwise the BLRR prediction is used (which covers both
    "SVM and BLRR agree" and "all three disagree").

    Parameters
    ----------
    y_test
        Reference targets; flattened with ``.ravel()`` for the results table
        and passed as-is to the kappa computation.
    y_predNB, y_predSVM, y_predBLRR
        Per-row predictions from the three individual models.

    Returns
    -------
    tuple
        ``(score, ensemble)`` where ``ensemble`` is the ``'Ensemble'`` column
        of the results table and ``score`` is the quadratic-weighted Cohen's
        kappa between ``y_test`` and that column.
    """
    # Line the reference scores and the three predictions up side by side
    columns = {
        "Actual": pd.Series(y_test.ravel()),
        "NB": pd.Series(y_predNB),
        "SVM": pd.Series(y_predSVM),
        "BLRR": pd.Series(y_predBLRR),
    }
    votes = pd.concat(columns, axis=1)

    # NB wins whenever at least one other model backs it; BLRR otherwise
    nb_vote, svm_vote, blrr_vote = votes['NB'], votes['SVM'], votes['BLRR']
    nb_is_backed = (nb_vote == blrr_vote) | (nb_vote == svm_vote)
    votes['Ensemble'] = np.where(nb_is_backed, nb_vote, blrr_vote)

    # Agreement between reference and ensemble scores
    kappa = cohen_kappa_score(y_test, votes['Ensemble'], weights="quadratic")
    return kappa, votes['Ensemble']

## Getting QWK scores for all sets

In [157]:
def get_qwk_score(set_no):
    """Compute the QWK score of every model for one feature set.

    Reads ``features/features_set<set_no>.csv``, takes the columns at
    positions 1-15 as features and the column at position 18 as the target
    (presumably the essay score; confirm against the CSV layout), splits
    80/20 with a fixed seed, and evaluates NB, SVM, BLRR and the ensemble on
    the held-out part.

    Parameters
    ----------
    set_no
        Identifier inserted into the features file name.

    Returns
    -------
    tuple
        ``(set_no, blrr_score, svm_score, nb_score, ensemble_score)``.
    """
    # Load the pre-computed features for this set
    features_path = f'features/features_set{set_no}.csv'
    frame = pd.read_csv(features_path)

    # Positional column selection: 1..15 are inputs, 18 is the target column
    features = frame.iloc[:, 1:16].values.astype(float)
    targets = frame.iloc[:, 18].values.astype(float).reshape(-1, 1)

    # Hold out 20% of the rows; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=0
    )

    # Evaluate each individual model, then the voting ensemble built on them
    nb_score, nb_pred = qwk_nb(X_train, X_test, y_train, y_test)
    svm_score, svm_pred = qwk_svm(X_train, X_test, y_train, y_test)
    blrr_score, blrr_pred = qwk_blrr(X_train, X_test, y_train, y_test)
    ensemble_score, _ = qwk_ensemble(y_test, nb_pred, svm_pred, blrr_pred)

    return set_no, blrr_score, svm_score, nb_score, ensemble_score

In [165]:
# One result tuple per feature set, for sets 1 through 8
scores = [get_qwk_score(set_no) for set_no in range(1, 9)]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [166]:
# Column order mirrors the tuple returned by get_qwk_score
score_columns = ['Set', 'BLRR', 'SVM', 'NB', 'Ensemble']
scores_df = pd.DataFrame(data=scores, columns=score_columns)
scores_df

Unnamed: 0,Set,BLRR,SVM,NB,Ensemble
0,1,0.817064,0.799501,0.800366,0.826096
1,2,0.586017,0.547739,0.640441,0.610858
2,3,0.64555,0.668943,0.633179,0.680847
3,4,0.692826,0.706158,0.710284,0.720952
4,5,0.789499,0.802743,0.78026,0.809514
5,6,0.668089,0.639169,0.639359,0.649767
6,7,0.715227,0.743958,0.669716,0.717442
7,8,0.620724,0.640928,0.577474,0.634011
