In [219]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [220]:
train_set = pd.read_csv("./asap-aes/training_set_rel3.tsv", sep='\t', encoding="latin-1")

In [221]:
train_set = train_set[train_set['essay_set'] == 2]  # filter for set 2
train_set = train_set.reset_index() # resets index

In [222]:
train_set.drop(train_set.columns[0], axis=1, inplace=True) # removes old index
train_set.drop(train_set.columns[7:], axis=1, inplace=True) # filter only domain 1 scores
train_set.drop(train_set.columns[5], axis=1, inplace=True) # removes rater3_domain1 (NaN for all)

In [223]:
train_set['essay'] = [entry.lower() for entry in train_set['essay']] # lower case for all words in essay
train_set["essay"] = [word_tokenize(entry) for entry in train_set["essay"]] # break paragraphs string into tokens

In [224]:
train_set['avg_score'] = (train_set['rater1_domain1'] + train_set['rater2_domain1']) / 2 # calculate average score
train_set['avg_score'] = train_set['avg_score'].apply(np.ceil).astype(int) # round off average score

In [225]:
np.random.seed(500)
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
count = 0
for index, entry in enumerate(train_set['essay']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    
    for word, tag in pos_tag(entry):
        if word not in stopwords.words("english") and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)

    train_set.loc[index, "essay_final"] = str(final_words)
train_set.drop(train_set.columns[2], axis=1, inplace=True) # removes the original essay column

In [226]:
train_set['type1'] = train_set.apply(lambda x: 0 if x['avg_score'] <= 3 else 1, axis=1)
train_set['type2'] = train_set.apply(lambda x: 0 if x['avg_score'] <= 4 else 1, axis=1)
train_set['type3'] = train_set.apply(lambda x: 0 if x['avg_score'] <= 5 else 1, axis=1)

In [227]:
train_set['type4'] = train_set.apply(lambda x: 0 if x['avg_score'] <= 1 else 1, axis=1)
train_set['type5'] = train_set.apply(lambda x: 0 if x['avg_score'] <= 2 else 1, axis=1)

In [228]:
train_set['avg_score'] = (train_set['rater1_domain1'] + train_set['rater2_domain1']) / 2 # calculate average score
train_set['avg_score'] = train_set['avg_score'].apply(np.ceil).astype(int) # round off average score

In [229]:
train_set

Unnamed: 0,essay_id,essay_set,rater1_domain1,rater2_domain1,domain1_score,avg_score,essay_final,type1,type2,type3,type4,type5
0,2978,2,4,4,4,4,"['certain', 'material', 'remove', 'library', '...",1,0,0,1,1
1,2979,2,1,2,1,2,"['write', 'persuasive', 'essay', 'newspaper', ...",0,0,0,1,0
2,2980,2,2,3,2,3,"['think', 'library', 'remove', 'certain', 'mat...",0,0,0,1,1
3,2981,2,4,4,4,4,"['world', 'many', 'thing', 'find', 'offensive'...",1,0,0,1,1
4,2982,2,4,4,4,4,"['life', 'thing', 'little', 'stuff', 'get', 's...",1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1795,4773,2,3,2,3,3,"['author', 'writting', 'take', 'book', 'adult'...",0,0,0,1,1
1796,4774,2,3,3,3,3,"['think', 'material', 'book', 'music', 'movie'...",0,0,0,1,1
1797,4775,2,2,2,2,2,"['yes', 'keep', 'book', 'music', 'movie', 'mag...",0,0,0,1,0
1798,4776,2,3,4,3,4,"['believe', 'book', 'magazine', 'music', 'movi...",1,0,0,1,1


## Classification with Average Score (6 labels from 1-6)

In [234]:
# splitting into training and testing set
train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set['avg_score'], test_size=0.3)

# transform the avg score into label of 0,1,2,3....
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_label)
test_label = Encoder.transform(test_label)

# transform essay into matrix
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set["essay_final"])
train_essay_vect = Tfidf_vect.transform(train_essay)
test_essay_vect = Tfidf_vect.transform(test_essay)


In [235]:
# fitting training set into naive bayes
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, train_label)

# fitting testing set on NB classifier
predictions_NB = naive.predict(test_essay_vect)
print("accuracy score:", accuracy_score(predictions_NB, test_label)*100)

# producing confusion matrix
con_matrix = confusion_matrix(test_label, predictions_NB)
print(con_matrix)

accuracy score: 50.37037037037037
[[  0   0   1   6   0   0]
 [  0   0   5  29   0   0]
 [  0   0   3 197   0   0]
 [  0   0   0 269   0   0]
 [  0   0   0  29   0   0]
 [  0   0   0   1   0   0]]


In [236]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=2, gamma='auto')
SVM.fit(train_essay_vect, train_label)
predictions_SVM = SVM.predict(test_essay_vect)
print("accuracy score:", accuracy_score(predictions_SVM, test_label)*100)

# producing confusion matrix
con_matrix = confusion_matrix(test_label, predictions_SVM)
print(con_matrix)

accuracy score: 49.81481481481482
[[  0   0   0   7   0   0]
 [  0   0   0  34   0   0]
 [  0   0   0 200   0   0]
 [  0   0   0 269   0   0]
 [  0   0   0  29   0   0]
 [  0   0   0   1   0   0]]


In [237]:
from sklearn.metrics import classification_report

rep = classification_report(test_label, predictions_NB)
print(rep)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.00      0.00      0.00        34
           2       0.33      0.01      0.03       200
           3       0.51      1.00      0.67       269
           4       0.00      0.00      0.00        29
           5       0.00      0.00      0.00         1

    accuracy                           0.50       540
   macro avg       0.14      0.17      0.12       540
weighted avg       0.38      0.50      0.35       540



  _warn_prf(average, modifier, msg_start, len(result))


### Type 1: 1-3, 4-6<br/>Type 2: 1-4, 5-6<br/>Type 3: 1-5, 6<br/>Type 4: 1, 5-6<br/>Type 5: 1-2, 3-6

In [233]:
types = ['type1', 'type2', 'type3', 'type4', 'type5']
for i in range(len(types)) :
    print()
    print("===============================")
    print("TYPE:", i+1, )
    typ = types[i]
    
    # splitting into training and testing set
    train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set[typ], test_size=0.3)
    
    # transform the avg score into label of 0,1,2,3....
    Encoder = LabelEncoder()
    train_label = Encoder.fit_transform(train_label)
    test_label = Encoder.transform(test_label)

    # transform essay into matrix
    Tfidf_vect = TfidfVectorizer()
    Tfidf_vect.fit(train_set["essay_final"])
    train_essay_vect = Tfidf_vect.transform(train_essay)
    test_essay_vect = Tfidf_vect.transform(test_essay)
    
    print("NAIVE BAYES: ")
    ## NAIVE BAYES
    # fitting training set into naive bayes
    naive = naive_bayes.MultinomialNB()
    naive.fit(train_essay_vect, train_label)
    
    # fitting testing set on NB classifier
    predictions_NB = naive.predict(test_essay_vect)
    print("accuracy score:", accuracy_score(predictions_NB, test_label)*100)

    # producing confusion matrix
    con_matrix = confusion_matrix(test_label, predictions_NB)
    print(con_matrix)

    print("------------------------")
    print("SVM: ")
    ## SVM 
    # fit the training dataset on the classifier
    SVM = svm.SVC(C=1.0, kernel='rbf', degree=2, gamma='auto')
    SVM.fit(train_essay_vect, train_label)
    predictions_SVM = SVM.predict(test_essay_vect)
    print("accuracy score:", accuracy_score(predictions_SVM, test_label)*100)

    # producing confusion matrix
    con_matrix = confusion_matrix(test_label, predictions_SVM)
    print(con_matrix)
    print()


TYPE: 1
NAIVE BAYES: 
accuracy score: 56.851851851851855
[[ 18 232]
 [  1 289]]
------------------------
SVM: 
accuracy score: 53.70370370370371
[[  0 250]
 [  0 290]]


TYPE: 2
NAIVE BAYES: 
accuracy score: 93.88888888888889
[[507   0]
 [ 33   0]]
------------------------
SVM: 
accuracy score: 93.88888888888889
[[507   0]
 [ 33   0]]


TYPE: 3
NAIVE BAYES: 
accuracy score: 99.44444444444444
[[537   0]
 [  3   0]]
------------------------
SVM: 
accuracy score: 99.44444444444444
[[537   0]
 [  3   0]]


TYPE: 4
NAIVE BAYES: 
accuracy score: 99.44444444444444
[[  0   3]
 [  0 537]]
------------------------
SVM: 
accuracy score: 99.44444444444444
[[  0   3]
 [  0 537]]


TYPE: 5
NAIVE BAYES: 
accuracy score: 92.4074074074074
[[  0  41]
 [  0 499]]
------------------------
SVM: 
accuracy score: 92.4074074074074
[[  0  41]
 [  0 499]]



#### Type 1
Issue: Most Class 0 is classified as Class 1
<br>Type 1 is 1-3, 4-6
<br>This implies that most of score 1-3 is classified as 4-6

#### Type 2
Issue: Most Class 1 is classified as Class 0
<br>Type 2 is 1-4, 5-6
<br>This implies that most of score 5-6 is classified as 1-4

#### Type 3
Issue: "same as type 2"
<br>Type 3 is 1-5, 6
<br>This implies that most of score 6 is classified as 1-5

#### Type 4
Issue: "same as type 1"
<br>Type 4 is 1, 5-6
<br>This implies that most of score 1 is classified as 5-6

#### Type 5
Issue: "same as type 1"
<br>Type 5 is 1-2, 3-6
<br>This implies that most of score 1-2 is classified as 3-6

Type 1, Type 4 and 5 have same issue: Class 0 classified as Class 1
<br>Type 2 and Type 3 have same issue: Class 1 classified as Class 0

## Classification with Type 1 (2 labels: Label 0: Score 1-3, Label 1: Score 4-6)

In [196]:
# splitting into training and testing set
train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set['type1'], test_size=0.3)

# transform the avg score into label of 0,1,2,3....
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_label)
test_label = Encoder.transform(test_label)

# transform essay into matrix
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set["essay_final"])
train_essay_vect = Tfidf_vect.transform(train_essay)
test_essay_vect = Tfidf_vect.transform(test_essay)


In [197]:
# fitting training set into naive bayes
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, train_label)

# fitting testing set on NB classifier
predictions_NB = naive.predict(test_essay_vect)
print("accuracy score:", accuracy_score(predictions_NB, test_label)*100)

# producing confusion matrix
con_matrix = confusion_matrix(test_label, predictions_NB)
print(con_matrix)

accuracy score: 63.51851851851852
[[ 33 195]
 [  2 310]]


In [198]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=2, gamma='auto')
SVM.fit(train_essay_vect, train_label)
predictions_SVM = SVM.predict(test_essay_vect)
print("accuracy score:", accuracy_score(predictions_SVM, test_label)*100)

# producing confusion matrix
con_matrix = confusion_matrix(test_label, predictions_SVM)
print(con_matrix)

accuracy score: 57.77777777777777
[[  0 228]
 [  0 312]]


In [167]:
test_essay

1005    ['censorship', 'touchy', 'subject', 'society',...
1471    ['would', 'like', 'someone', 'take', 'book', '...
967     ['find', 'thing', 'offensive', 'wether', 'book...
1268    ['every', 'parent', 'want', 'protect', 'child'...
1622    ['censorship', 'method', 'limit', 'certain', '...
                              ...                        
1362    ['censorship', 'future', 'kid', 'america', 'pe...
205     ['consitered', 'offensive', 'take', 'book', 'm...
1496    ['censorship', 'library', 'question', 'side', ...
369     ['censorship', 'world', 'get', 'bad', 'everyda...
707     ['think', 'library', 'book', 'make', 'start', ...
Name: essay_final, Length: 540, dtype: object

In [176]:
te = test_essay.reset_index()
te.iloc[99,:]

index                                                       1654
essay_final    ['happen', 'use', 'book', 'film', 'simple', 'e...
Name: 99, dtype: object

In [114]:
# splitting into training and testing set
train_essay, test_essay, train_label, test_label = model_selection.train_test_split(train_set['essay_final'], train_set['type1'], test_size=0.3)

In [115]:
# transform the avg score into label of 0,1,2,3....
Encoder = LabelEncoder()
train_label = Encoder.fit_transform(train_label)
test_label = Encoder.transform(test_label)

In [119]:
# transform essay into matrix
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_set["essay_final"])
train_essay_vect = Tfidf_vect.transform(train_essay)
test_essay_vect = Tfidf_vect.transform(test_essay)

In [121]:
# fitting training set into naive bayes
naive = naive_bayes.MultinomialNB()
naive.fit(train_essay_vect, train_label)

# fitting testing set on NB classifier
predictions_NB = naive.predict(test_essay_vect)
print(accuracy_score(predictions_NB, test_label)*100)
# predictions_NB

63.70370370370371


In [122]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=2, gamma='auto')
SVM.fit(train_essay_vect, train_label)
predictions_SVM = SVM.predict(test_essay_vect)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_label)*100)

SVM Accuracy Score ->  59.81481481481481
