# Reference 
https://www.kaggle.com/aroraaman/quadratic-kappa-metric-explained-in-5-simple-steps

http://kagglesolutions.com/r/evaluation-metrics--quadratic-weighted-kappa

In [199]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [208]:
train_set = pd.read_csv("./asap-aes/training_set_rel3.tsv", sep='\t', encoding="latin-1")

In [209]:
train_set = train_set[train_set['essay_set'] == 2]  # filter for set 2
train_set = train_set.reset_index() # resets index

In [210]:
train_set['essay'] = [entry.lower() for entry in train_set['essay']] # lower case for all words in essay
train_set["essay"] = [word_tokenize(entry) for entry in train_set["essay"]] # break paragraphs string into tokens

In [211]:
train_set.drop(train_set.columns[0], axis=1, inplace=True) # removes old index
train_set.drop(train_set.columns[10:], axis=1, inplace=True) # filter only domain 1 scores
train_set.drop(train_set.columns[5], axis=1, inplace=True) # removes rater3_domain1 (NaN for all)

In [212]:
train_set['avg_score'] = (train_set['rater1_domain1'] + train_set['rater2_domain1']) / 2 # calculate average score
train_set['avg_score'] = train_set['avg_score'].apply(np.ceil).astype(int) # round off average score

In [54]:
train_set['avg_score2'] = (train_set['rater1_domain2'] + train_set['rater2_domain2']) / 2 # calculate average score
train_set['avg_score2'] = train_set['avg_score2'].apply(np.ceil).astype(int) # round off average score

In [55]:
train_set['avg_score3'] = (train_set['domain1_score'] + train_set['domain2_score']) / 2 # calculate average score
train_set['avg_score3'] = train_set['avg_score3'].apply(np.ceil).astype(int) # round off average score

# Description

avg_score: average score of rater1_domain1 and rater2_domain1 (i.e., average scores of domain1)

avg_score2: average score of rater1_domain2 and rater2_domain2 (i.e., average scores of domain2) 

avg_score3: average score of domain1_score and domain2_score (i.e., average scores of the resolved scores of both domain) 

In [213]:
np.random.seed(500)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(train_set['essay']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    
    for word, tag in pos_tag(entry):
        if word not in stopwords.words("english") and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)

    train_set.loc[index, "essay_final"] = str(final_words)
train_set.drop(train_set.columns[2], axis=1, inplace=True) # removes the original essay column

## To be tested:
the model selection code run as follow: model_selection.train_test_split(train_set['essay_final'], train_set['domain1_score'], test_size=0.3)

Replace 'domain1_score' with "avg_score", "avg_score2", "avg_score3" for further testing
    

In [274]:
# splitting into training and testing set
train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set['avg_score3'], test_size=0.3)

# transform the avg score into label of 0,1,2,3....
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_label)
test_label = Encoder.transform(test_label)

# transform essay into matrix
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set["essay_final"])
train_essay_vect = Tfidf_vect.transform(train_essay)
test_essay_vect = Tfidf_vect.transform(test_essay)

In [275]:
# fitting training set into naive bayes
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, train_label)

# fitting testing set on NB classifier
predictions_NB = naive.predict(test_essay_vect)

# producing confusion matrix
O = confusion_matrix(test_label, predictions_NB)
print(O)

[[  0   0   0   2   0]
 [  0   0   2  28   0]
 [  0   0   2 180   0]
 [  0   0   0 294   0]
 [  0   0   0  32   0]]


In [276]:
N = len(O)

In [277]:
w = np.zeros((N,N)); w

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [278]:
d = (N-1)**2

In [279]:
for i in range(len(w)):
    for j in range(len(w)):
        w[i][j] = float(((i-j)**2)/d) 

In [281]:
act_hist=np.zeros([N])
for item in test_label: 
    act_hist[item]+=1
    
pred_hist=np.zeros([N])
for item in predictions_NB: 
    pred_hist[item]+=1

In [282]:
E = np.outer(act_hist, pred_hist); E

array([[0.00000e+00, 0.00000e+00, 8.00000e+00, 1.07200e+03, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 1.20000e+02, 1.60800e+04, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 7.28000e+02, 9.75520e+04, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 1.17600e+03, 1.57584e+05, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 1.28000e+02, 1.71520e+04, 0.00000e+00]])

In [283]:
E = E/E.sum(); E.sum()

1.0

In [284]:
O = O/O.sum(); O.sum()

0.9999999999999999

In [285]:
num=0
den=0
for i in range(len(w)):
    for j in range(len(w)):
        num+=w[i][j]*O[i][j]
        den+=w[i][j]*E[i][j]
        
weighted_kappa = (1 - (num/den))
weighted_kappa

0.024943310657596474

##### Ignore these at the moment

# Set 2
Domain 1 score:<br>
0.17861887795156706<br>
0.8213811220484329

Domain 2 score: <br>
0.1041177197972053<br>
0.8958822802027947

Avarage of domain 1 and domain 2 score: <br>
0.010827339513058365<br>
0.9891726604869416


# Set 5

Domain 1 score:<br>
0.354743571530047<br>
0.645256428469953