## This notebook generates the features for essay sets 1 to 8 and outputs them in CSV format, using the modified version of feature_extractor.py

### The feature output files for all sets are stored in the same directory as this file

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm 
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
import sys
sys.path.append('../ease')
import create
import grade 
import model_creator 
import predictor_extractor 
import predictor_set 
import util_functions
import essay_set
import feature_extractor

from essay_set import EssaySet
from feature_extractor import FeatureExtractor

## Determine Essay Prompts

In [3]:
# Prompt text for essay sets 1..8, stored in order so that the prompt of
# set N lives at index N-1.
essay_prompts = []

for i in range(1, 9):
    prompt_path = "../../prompts/set" + str(i) + ".txt"
    # Use a context manager so each file handle is closed as soon as it has
    # been read instead of being left open for the life of the kernel.
    with open(prompt_path, "r", encoding="latin-1") as prompt_file:
        essay_prompts.append(prompt_file.read())
    
def get_essay_prompt(essay_set):
    """Return the prompt text for a 1-based essay set number.

    Parameters
    ----------
    essay_set : int
        Essay set number, starting at 1. The prompt is looked up in the
        module-level ``essay_prompts`` list at index ``essay_set - 1``.

    Returns
    -------
    The entry of ``essay_prompts`` for that set.

    Raises
    ------
    IndexError
        If ``essay_set`` is smaller than 1 or larger than the number of
        loaded prompts.
    """
    # Without this guard, 0 or a negative number would wrap around through
    # Python's negative indexing and silently return another set's prompt.
    if not 1 <= essay_set <= len(essay_prompts):
        raise IndexError(
            "essay_set must be between 1 and " + str(len(essay_prompts))
            + ", got " + str(essay_set)
        )
    return essay_prompts[essay_set - 1]

## Read Dataset

There are 8 different essay sets.  As an overview:
- Sets 1 & 2 are persuasive/narrative essays written in the form of letters
- Sets 3, 4, 5 & 6 are source-dependent responses to a given essay
- Sets 7 & 8 are persuasive/narrative essays written in the form of stories

These formats make the dataset a good fit for transfer learning.

In [4]:
# Load the tab-separated training file into a DataFrame.
data_set = pd.read_csv(
    "../../asap-aes/training_set_rel3.tsv",
    sep='\t',
    encoding="latin-1",
)
# Lower-case the text of every essay.
data_set['essay'] = data_set['essay'].map(lambda text: text.lower())

## Generate csv file for features

In [5]:
def generate_features_csv(set_no):
    """Build the feature table for one essay set and write it to a CSV file.

    Rows of the module-level ``data_set`` whose ``essay_set`` equals
    ``set_no`` are added to an ``EssaySet``. Length and prompt features come
    from ``FeatureExtractor``; two further columns come from
    ``util_functions.get_vocab_essays_count`` applied to the set's raw and
    cleaned/stemmed text. All features are joined column-wise with
    ``domain1_score`` and saved as ``features_set<set_no>.csv`` (relative
    path, no index column).

    Parameters
    ----------
    set_no : int
        Essay set number to filter on; also passed to ``get_essay_prompt``.

    Returns
    -------
    None
    """
    # Column names, defined once and reused for the partial frames and the
    # final export.
    length_cols = ['chars', 'words', 'commas', 'apostrophes', 'punctuations',
                   'avg_word_length', 'sentences', 'questions', 'avg_word_sentence',
                   'POS', 'POS/total_words']
    prompt_cols = ['prompt_words', 'prompt_words/total_words',
                   'synonym_words', 'synonym_words/total_words']
    bow_cols = ['unstemmed', 'stemmed']

    # Keep only this set's rows and renumber them from 0 so that the integer
    # labels used below coincide with row positions.
    subset = data_set[data_set['essay_set'] == set_no].reset_index()
    essays = subset['essay']
    scores = subset['domain1_score']

    # Feed every (essay, score) pair into a fresh essay set.
    e_set = EssaySet()
    for row in range(len(essays)):
        e_set.add_essay(essays[row], scores[row])

    f_extractor = FeatureExtractor()

    # Length features are generated before the prompt is attached.
    length_df = pd.DataFrame(f_extractor.gen_length_feats(e_set), columns=length_cols)

    # Attach this set's prompt, then generate the prompt features.
    e_set.update_prompt(get_essay_prompt(set_no))
    prompts_df = pd.DataFrame(f_extractor.gen_prompt_feats(e_set), columns=prompt_cols)

    # Vocabulary counts on the raw text and on the cleaned/stemmed text,
    # paired element by element into two columns.
    unstemmed = util_functions.get_vocab_essays_count(e_set._text, e_set._score)
    stemmed = util_functions.get_vocab_essays_count(e_set._clean_stem_text, e_set._score)
    bow_df = pd.DataFrame(
        [[raw_count, stem_count] for raw_count, stem_count in zip(unstemmed, stemmed)],
        columns=bow_cols,
    )

    # Place the three feature groups side by side, then join the scores on
    # the shared index.
    features = pd.concat([length_df, prompts_df, bow_df], axis=1, sort=False)
    dataset = features.merge(scores, left_index=True, right_index=True)

    # Final header: all feature names followed by the score column.
    dataset.columns = length_cols + prompt_cols + bow_cols + ['score']
    dataset.to_csv('features_set' + str(set_no) + '.csv', index=False)

## Generate the feature CSV files for sets 1 to 8

In [6]:
# Produce one features CSV per essay set, for set numbers 1 through 8.
all_set_numbers = range(1, 9)
for set_no in all_set_numbers:
    generate_features_csv(set_no)