In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
import create
import grade 
import model_creator 
import predictor_extractor 
import predictor_set 
import util_functions
import essay_set
import feature_extractor

from essay_set import EssaySet
from feature_extractor import FeatureExtractor

In [3]:
"""
Load the ASAP training data and keep only the essays of essay set 2
"""

train_set = pd.read_csv(
    "../asap-aes/training_set_rel3.tsv",
    sep='\t',
    encoding="latin-1",
)

# keep the set-2 rows; reset_index() also keeps the old row labels in an 'index' column
train_set = train_set.loc[train_set['essay_set'] == 2].reset_index()

# normalise every essay to lower case
train_set['essay'] = [text.lower() for text in train_set['essay']]

In [4]:
"""
Filtering unwanted columns for readability
"""

# The set-2 filter, index reset and lower-casing are already applied when the
# training set is loaded. Repeating reset_index() here only added a stray
# 'level_0' column and made the cell raise when it was run a second time.
# Instead, drop the columns that hold no value for any remaining essay; this
# loses no data and gives the same result however often the cell is run.
# NOTE(review): assumes the "unwanted" columns are the all-empty ones — confirm.
train_set = train_set.dropna(axis=1, how='all')

In [5]:
"""
Shorthand handles for the essay texts and their domain-1 scores
"""

essays, scores = train_set['essay'], train_set['domain1_score']

In [9]:
"""
Helper section for reading the complete essay prompts of each essay set
"""

# essay_prompts[k] holds the prompt text read from set<k + 1>.txt (sets 1..8)
essay_prompts = []

for set_number in range(1, 9):
    prompt_path = "../prompts/set" + str(set_number) + ".txt"
    # the context manager closes each file as soon as it has been read,
    # so no file handles are left open in the kernel
    with open(prompt_path, "r", encoding="utf-8") as prompt_file:
        essay_prompts.append(prompt_file.read())
    
def get_essay_prompt(essay_set):
    """
    Return the prompt text belonging to an essay set.

    essay_set -- 1-based essay set number (set 1 is stored at list position 0)

    Raises IndexError when no prompt was loaded for that number.
    """
    # Without this check 0 or a negative number would wrap around through
    # negative list indexing and silently return the prompt of another set.
    if not 1 <= essay_set <= len(essay_prompts):
        raise IndexError("no prompt loaded for essay set " + str(essay_set))
    return essay_prompts[essay_set-1]

In [10]:
"""
Builds the EssaySet from the training essays and creates the FeatureExtractor
"""

e_set = EssaySet()
f_extractor = FeatureExtractor()

# essays and scores come from the same frame and share its integer row labels,
# so both are looked up by label for every row
for row in range(len(essays)):
    e_set.add_essay(essays.loc[row], scores.loc[row])

In [29]:
"""
Length and POS features, collected in a DataFrame
"""

length = f_extractor.gen_length_feats(e_set)
length_df = pd.DataFrame(
    length,
    columns=[
        'chars', 'words', 'commas', 'apostrophes',
        'punctuations', 'avg_word_length',
        'POS', 'POS/total_words',
    ],
)

In [30]:
"""
Prompt features (the set-2 prompt is attached to the essay set first)
"""

e_set.update_prompt(get_essay_prompt(2))

prompts = f_extractor.gen_prompt_feats(e_set)
prompts_df = pd.DataFrame(
    prompts,
    columns=[
        'prompt_words', 'prompt_words/total_words',
        'synonym_words', 'synonym_words/total_words',
    ],
)

In [31]:
"""
BoW features: one value from the unstemmed and one from the stemmed essay texts
"""

unstemmed = util_functions.get_vocab_essays_count(e_set._text, e_set._score)
stemmed = util_functions.get_vocab_essays_count(e_set._clean_stem_text, e_set._score)

# pair the two results element by element: one [unstemmed, stemmed] row each
bow = [[plain_count, stem_count] for plain_count, stem_count in zip(unstemmed, stemmed)]
bow_df = pd.DataFrame(bow, columns=['unstemmed', 'stemmed'])

In [32]:
"""
Joins the length/POS, prompt and BoW feature tables side by side
"""

features = pd.concat(
    [length_df, prompts_df, bow_df],
    axis='columns',
    sort=False,
)

In [33]:
# show the merged feature table (the rendered output abbreviates it to its first and last rows)
features

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed
0,2639.0,527.0,15.0,13.0,21.0,5.007590,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.662900,0.992572,82.0,0.455556,66.0,0.366667,210,210
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.653920,0.989856,245.0,0.464896,131.0,0.248577,547,528
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,1339.0,286.0,0.0,34.0,11.0,4.681818,280.643026,0.981269,114.0,0.398601,78.0,0.272727,303,293
1796,1373.0,282.0,15.0,8.0,15.0,4.868794,277.649880,0.984574,152.0,0.539007,80.0,0.283688,339,333
1797,605.0,132.0,7.0,2.0,5.0,4.583333,125.322581,0.949413,71.0,0.537879,34.0,0.257576,164,160
1798,2737.0,561.0,22.0,5.0,27.0,4.878788,551.978417,0.983919,301.0,0.536542,168.0,0.299465,542,512
