# Reference 
https://www.kaggle.com/aroraaman/quadratic-kappa-metric-explained-in-5-simple-steps

http://kagglesolutions.com/r/evaluation-metrics--quadratic-weighted-kappa

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
train_set = pd.read_csv("./asap-aes/training_set_rel3.tsv", sep='\t', encoding="latin-1")

In [3]:
train_set = train_set[train_set['essay_set'] == 2]  # filter for set 2
train_set = train_set.reset_index() # resets index

In [4]:
train_set['essay'] = [entry.lower() for entry in train_set['essay']] # lower case for all words in essay
train_set["essay"] = [word_tokenize(entry) for entry in train_set["essay"]] # break paragraphs string into tokens

In [5]:
train_set.drop(train_set.columns[0], axis=1, inplace=True) # removes old index
train_set.drop(train_set.columns[10:], axis=1, inplace=True) # filter only domain 1 scores
train_set.drop(train_set.columns[5], axis=1, inplace=True) # removes rater3_domain1 (NaN for all)

In [6]:
train_set['avg_score'] = (train_set['rater1_domain1'] + train_set['rater2_domain1']) / 2 # calculate average score
train_set['avg_score'] = train_set['avg_score'].apply(np.ceil).astype(int) # round off average score

In [7]:
train_set['avg_score2'] = (train_set['rater1_domain2'] + train_set['rater2_domain2']) / 2 # calculate average score
train_set['avg_score2'] = train_set['avg_score2'].apply(np.ceil).astype(int) # round off average score

In [8]:
train_set['avg_score3'] = (train_set['domain1_score'] + train_set['domain2_score']) / 2 # calculate average score
train_set['avg_score3'] = train_set['avg_score3'].apply(np.ceil).astype(int) # round off average score

# Description

avg_score: average score of rater1_domain1 and rater2_domain1 (i.e., average scores of domain1)

avg_score2: average score of rater1_domain2 and rater2_domain2 (i.e., average scores of domain2) 

avg_score3: average score of domain1_score and domain2_score (i.e., average scores of the resolved scores of both domain) 

In [9]:
np.random.seed(500)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(train_set['essay']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    
    for word, tag in pos_tag(entry):
        if word not in stopwords.words("english") and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)

    train_set.loc[index, "essay_final"] = str(final_words)
train_set.drop(train_set.columns[2], axis=1, inplace=True) # removes the original essay column

## To be tested:
the model selection code run as follow: model_selection.train_test_split(train_set['essay_final'], train_set['domain1_score'], test_size=0.3)

Replace 'domain1_score' with "avg_score", "avg_score2", "avg_score3" for further testing
    

In [19]:
Encoder = LabelEncoder()
real_label = Encoder.fit_transform(train_set['domain1_score'])

In [20]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set['essay_final'])
train_essay_vect = Tfidf_vect.transform(train_set['essay_final'])
test_essay_vect = Tfidf_vect.transform(train_set['essay_final'])

In [21]:
train_essay_vect

<1800x11252 sparse matrix of type '<class 'numpy.float64'>'
	with 168788 stored elements in Compressed Sparse Row format>

In [293]:
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, real_label)

predictions_NB = naive.predict(test_essay_vect)

In [294]:
real_label.min()

0

In [295]:
real_label.max()

10

In [296]:
O = confusion_matrix(real_label, predictions_NB)
print(O)

[[  0   0   0   0   0   0  10   0   0   0   0]
 [  0   0   0   0   0   0   1   0   0   0   0]
 [  0   0   0   0   0   0  17   0   0   0   0]
 [  0   0   0   0   0   0  17   0   0   0   0]
 [  0   0   0   0   0   0 110   0   0   0   0]
 [  0   0   0   0   0   0 135   0   0   0   0]
 [  0   0   0   0   0   0 687   0   0   0   0]
 [  0   0   0   0   0   0 334   0   0   0   0]
 [  0   0   0   0   0   0 316   0   0   0   0]
 [  0   0   0   0   0   0 109   0   0   0   0]
 [  0   0   0   0   0   0  47   0   0   0   0]]


In [297]:
N = len(O)

In [298]:
N

11

In [299]:
w = np.zeros((N,N)); w

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [300]:
d = (N-1)**2

In [301]:
for i in range(len(w)):
    for j in range(len(w)):
        w[i][j] = float(((i-j)**2)/d) 

In [302]:
real_label.max()

10

In [303]:
predictions_NB.max()

6

In [304]:
predictions_NB.max()

6

In [305]:
act_hist=np.zeros([N])
for item in real_label: 
    act_hist[item]+=1
    
pred_hist=np.zeros([N])
for item in predictions_NB: 
    pred_hist[item]+=1

In [306]:
E = np.outer(act_hist, pred_hist); E

array([[      0.,       0.,       0.,       0.,       0.,       0.,
          17830.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
           1783.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
          30311.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
          30311.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
         196130.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
         240705.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
        1224921.,       0.,       0.,       0.,       0.],
       [      0.,       0.,       0.,       0.,       0.,       0.,
         595522.,       0.,       0.,      

In [307]:
E = E/E.sum(); E.sum()

1.0

In [308]:
O = O/O.sum(); O.sum()

1.0

In [309]:
num=0
den=0
for i in range(len(w)):
    for j in range(len(w)):
        num+=w[i][j]*O[i][j]
        den+=w[i][j]*E[i][j]
            
weighted_kappa = (1 - (num/den))
weighted_kappa

0.0

In [23]:
# splitting into training and testing set
train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set['domain1_score'], test_size=0.3)

# transform the avg score into label of 0,1,2,3....
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_label)
test_label = Encoder.transform(test_label)

# transform essay into matrix
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set["essay_final"])
train_essay_vect = Tfidf_vect.transform(train_essay)
test_essay_vect = Tfidf_vect.transform(test_essay)

In [26]:
test_label.max()

5

In [27]:
# fitting training set into naive bayes
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, train_label)

# fitting testing set on NB classifier
predictions_NB = naive.predict(test_essay_vect)

# producing confusion matrix
O = confusion_matrix(test_label, predictions_NB)
print(O)

[[  0   0   4   2   0   0]
 [  0   0  25  25   0   0]
 [  0   0  52 187   0   0]
 [  0   0   7 209   0   0]
 [  0   0   1  27   0   0]
 [  0   0   0   1   0   0]]


In [12]:
N = len(O)

In [13]:
w = np.zeros((N,N)); w

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [14]:
d = (N-1)**2

In [15]:
for i in range(len(w)):
    for j in range(len(w)):
        w[i][j] = float(((i-j)**2)/d) 

In [16]:
act_hist=np.zeros([N])
for item in test_label: 
    act_hist[item]+=1
    
pred_hist=np.zeros([N])
for item in predictions_NB: 
    pred_hist[item]+=1

In [17]:
E = np.outer(act_hist, pred_hist); E

array([[     0.,      0.,    497.,   3283.,      0.,      0.],
       [     0.,      0.,   2627.,  17353.,      0.,      0.],
       [     0.,      0.,  17111., 113029.,      0.,      0.],
       [     0.,      0.,  15691., 103649.,      0.,      0.],
       [     0.,      0.,   2130.,  14070.,      0.,      0.],
       [     0.,      0.,    284.,   1876.,      0.,      0.]])

In [18]:
E = E/E.sum(); E.sum()

1.0

In [19]:
O = O/O.sum(); O.sum()

1.0

In [20]:
num=0
den=0
for i in range(len(w)):
    for j in range(len(w)):
        num+=w[i][j]*O[i][j]
        den+=w[i][j]*E[i][j]
        
weighted_kappa = (1 - (num/den))
weighted_kappa

0.17861887795156706

##### Ignore these at the moment

# Set 2
Domain 1 score:<br>
0.17861887795156706<br>
0.8213811220484329

Domain 2 score: <br>
0.1041177197972053<br>
0.8958822802027947

Avarage of domain 1 and domain 2 score: <br>
0.010827339513058365<br>
0.9891726604869416


# Set 5

Domain 1 score:<br>
0.354743571530047<br>
0.645256428469953