In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
train_set = pd.read_csv("./asap-aes/training_set_rel3.tsv", sep='\t', encoding="latin-1")

In [3]:
train_set = train_set[train_set['essay_set'] == 2]
train_set = train_set.reset_index()

In [4]:
train_set.drop(train_set.columns[0], axis=1, inplace=True)
train_set.drop(train_set.columns[7:], axis=1, inplace=True)
train_set.drop(train_set.columns[5], axis=1, inplace=True)

In [5]:
train_set['essay'] = [entry.lower() for entry in train_set['essay']]
train_set["essay"] = [word_tokenize(entry) for entry in train_set["essay"]]

In [6]:
train_set['avg_score'] = (train_set['rater1_domain1'] + train_set['rater2_domain1']) / 2
train_set['avg_score'] = train_set['avg_score'].apply(np.ceil).astype(int)

In [7]:
np.random.seed(500)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
count = 0
for index, entry in enumerate(train_set['essay']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    
    for word, tag in pos_tag(entry):
        if word not in stopwords.words("english") and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)

    train_set.loc[index, "essay_final"] = str(final_words)
train_set.drop(train_set.columns[2], axis=1, inplace=True)

In [8]:
train_set

Unnamed: 0,essay_id,essay_set,rater1_domain1,rater2_domain1,domain1_score,avg_score,essay_final
0,2978,2,4,4,4,4,"['certain', 'material', 'remove', 'library', '..."
1,2979,2,1,2,1,2,"['write', 'persuasive', 'essay', 'newspaper', ..."
2,2980,2,2,3,2,3,"['think', 'library', 'remove', 'certain', 'mat..."
3,2981,2,4,4,4,4,"['world', 'many', 'thing', 'find', 'offensive'..."
4,2982,2,4,4,4,4,"['life', 'thing', 'little', 'stuff', 'get', 's..."
...,...,...,...,...,...,...,...
1795,4773,2,3,2,3,3,"['author', 'writting', 'take', 'book', 'adult'..."
1796,4774,2,3,3,3,3,"['think', 'material', 'book', 'music', 'movie'..."
1797,4775,2,2,2,2,2,"['yes', 'keep', 'book', 'music', 'movie', 'mag..."
1798,4776,2,3,4,3,4,"['believe', 'book', 'magazine', 'music', 'movi..."


In [9]:
# splitting into training and testing set
train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set['avg_score'], test_size=0.3)

In [10]:
# transform the avg score into label of 0,1,2,3....
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_label)
test_label = Encoder.fit_transform(test_label)

In [401]:
train_label

array([3, 3, 3, ..., 2, 2, 3])

In [402]:
test_label

array([3, 2, 3, 2, 3, 3, 1, 3, 3, 2, 1, 1, 1, 1, 2, 4, 3, 3, 4, 4, 2, 2,
       3, 2, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 3, 3, 3, 0, 2, 2,
       3, 3, 3, 4, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 3, 2, 3, 3, 3, 2, 3, 2,
       2, 4, 2, 3, 3, 2, 2, 3, 2, 2, 2, 3, 2, 1, 3, 3, 3, 3, 3, 2, 3, 2,
       2, 3, 3, 2, 5, 3, 3, 3, 2, 2, 4, 2, 2, 3, 2, 4, 2, 3, 3, 3, 2, 2,
       2, 3, 2, 1, 3, 2, 3, 4, 2, 2, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3,
       1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 1, 2, 2, 2, 3, 3, 3, 3, 1, 1, 2, 3, 3, 2, 3, 2, 1,
       2, 2, 3, 3, 2, 3, 2, 3, 2, 2, 2, 3, 3, 2, 2, 1, 3, 2, 2, 3, 3, 2,
       2, 2, 3, 4, 3, 3, 2, 2, 2, 4, 1, 3, 4, 2, 3, 3, 1, 2, 3, 3, 2, 3,
       3, 2, 2, 3, 3, 3, 3, 3, 2, 1, 3, 3, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2,
       3, 3, 3, 5, 3, 2, 3, 2, 4, 2, 2, 3, 2, 1, 4, 3, 4, 2, 2, 2, 3, 2,
       3, 4, 3, 2, 2, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 2, 3, 0, 3, 2, 2, 2,
       3, 3, 2, 3, 3, 1, 2, 3, 3, 4, 2, 0, 4, 3, 2,

In [13]:
## NOTE: Not too sure if this section is done correctly / do not understand the what's going on at the back
# transform essay into matrix
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set["essay_final"])
train_essay_vect = Tfidf_vect.transform(train_essay)
test_essay_vect = Tfidf_vect.transform(test_essay)

In [14]:
test_essay

1740    ['find', 'book', 'think', 'innapropiate', 'oth...
69      ['many', 'people', 'stern', 'thought', 'believ...
919     ['believe', 'find', 'book', 'offensive', 'read...
654     ['cause', 'case', 'evry', 'book', 'read', 'cou...
413     ['say', 'word', 'exactly', 'say', 'brother', '...
                              ...                        
851     ['much', 'censorship', 'occur', 'world', 'musi...
1663    ['think', 'offensive', 'avavible', 'kid', 'int...
656     ['yes', 'think', 'material', 'library', 'offen...
645     ['think', 'anything', 'get', 'check', 'library...
178     ['citizen', 'believe', 'censorship', 'material...
Name: essay_final, Length: 540, dtype: object

In [391]:
test_label

array([3, 3, 3, 2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 5, 3, 3, 3, 1, 2, 3, 3, 3,
       3, 2, 2, 3, 3, 2, 2, 1, 3, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3,
       2, 3, 4, 2, 3, 3, 3, 3, 3, 4, 1, 3, 2, 3, 2, 4, 2, 1, 2, 4, 2, 3,
       3, 3, 3, 3, 0, 3, 2, 2, 2, 2, 2, 3, 3, 2, 2, 4, 3, 3, 3, 4, 2, 2,
       2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2,
       3, 2, 3, 2, 3, 3, 2, 4, 2, 3, 3, 3, 1, 2, 3, 2, 2, 2, 2, 3, 2, 3,
       2, 3, 3, 3, 2, 3, 2, 3, 4, 2, 2, 3, 3, 3, 2, 2, 2, 3, 0, 3, 3, 4,
       3, 3, 2, 3, 3, 2, 2, 4, 4, 2, 2, 3, 3, 3, 3, 3, 2, 4, 1, 2, 3, 3,
       3, 3, 2, 3, 5, 3, 3, 3, 1, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2,
       3, 2, 2, 3, 2, 2, 3, 2, 3, 3, 3, 2, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3,
       3, 2, 3, 1, 3, 2, 3, 4, 3, 1, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 2, 2,
       3, 1, 2, 1, 2, 3, 2, 3, 2, 2, 3, 2, 2, 3, 0, 3, 1, 2, 3, 3, 2, 3,
       3, 3, 3, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 4, 3, 4, 2, 3,
       2, 3, 1, 3, 2, 3, 3, 2, 2, 2, 3, 2, 3, 4, 3,

In [385]:
train_essay

1066    ['author', 'katherine', 'paterson', 'once', 'c...
59      ['when', 'you', 'be', 'a', 'child', 'how', 'mu...
303     ['at', 'a', 'local', 'library', 'you', 'can', ...
759     ['should', 'book', 'that', 'be', 'offensive', ...
411     ['people', 'should', 'not', 'take', 'all', 'th...
                              ...                        
1345    ['should', 'there', 'be', 'cenorship', 'in', '...
284     ['how', 'many', 'of', 'you', 'have', 'ever', '...
1202    ['i', 'believe', 'that', 'many', 'book', 'in',...
1250    ['a', 'a', 'student', 'at', 'a', 'public', 'sc...
114     ['do', 'you', 'think', 'there', 'should', 'be'...
Name: essay_final, Length: 1260, dtype: object

In [15]:
# fitting training set into naive bayes
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, train_label)

# fitting testing set on NB classifier
predictions_NB = naive.predict(test_essay_vect)
print(accuracy_score(predictions_NB, test_label)*100)
# predictions_NB

47.592592592592595


In [16]:
predictions_NB

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

In [389]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=2, gamma='auto')
SVM.fit(train_essay_vect, train_label)
predictions_SVM = SVM.predict(test_essay_vect)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_label)*100)

SVM Accuracy Score ->  48.7037037037037


In [369]:
predictions_SVM

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,