# AUTOMATIC ESSAY GRADING

## MSCA 31008 DATA MINING PRINCIPLES FINAL PROJECT

This project is implemented in three main phases:
1. PHASE-1: PRE-PROCESSING & EDA
   
2. PHASE-2: FEATURE EXTRACTION

3. PHASE-3: MODELLING

## PHASE-1: PRE-PROCESSING & EDA

This phase is further divided into the following three parts:
1. DATA CLEANING
2. DATA VISUALIZATION
3. INSIGHTS DISCOVERY

Importing required libraries/packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import string
import re
from spellchecker import SpellChecker
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud
from collections import Counter
from nltk import pos_tag
from nltk.corpus import words
import contractions
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Fetch the NLTK resources this notebook downloads, in the original order.
for _nltk_resource in ('words', 'averaged_perceptron_tagger', 'wordnet',
                       'omw-1.4', 'punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# SymSpell settings: maximum edit distance used when the dictionary is
# precalculated, and the prefix length handed to the SymSpell constructor.
max_edit_distance_dictionary = 2
prefix_length = 7

# Build the SymSpell object used by the spelling-correction step.
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

# Locate the English frequency dictionary shipped inside the symspellpy
# package and load it (term in column 0, count in column 1).
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Shared WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# Shared English spell checker instance.
spell = SpellChecker(language='en')

ModuleNotFoundError: No module named 'symspellpy'

### 1.A DATA CLEANING

List of functions created to implement Data Cleaning:
1. text_cleaning(text)
2. words_correction(word_list)
3. lemmatize_with_pos(word)
4. count_spelling_mistakes(essay)
5. count_pos_tags(tokens)
6. assign_score_category(row)
7. process_df(df_train, output_csv_file_path)
  

Let's look at each of them in detail as follows:

1. 'text_cleaning(text)':

 INPUT: raw essay  
OUTPUT: a list of cleaned tokens (words) 

 
This function performs the following operations:
1. Contractions - to expand the shortened words
2. Tokenization - to convert text into a list of words
3. Cleaning - to remove whitespaces, new lines, tabs, stopwords and punctuation
    

In [None]:
def text_cleaning(text):
    """Turn a raw essay into a list of cleaned, lower-cased word tokens.

    Steps, in order:
      1. expand contractions (e.g. "don't" -> "do not") word by word;
      2. tokenize the expanded text with ``word_tokenize``;
      3. lower-case the tokens;
      4. drop English stop words;
      5. strip every ``string.punctuation`` character from the remaining
         tokens;
      6. keep only tokens longer than two characters.

    Args:
        text: the raw essay as a single string.

    Returns:
        A list of cleaned tokens, in their original order.
    """
    # Expand contractions on whitespace-separated words.
    expanded_text = ' '.join(contractions.fix(word) for word in text.split())

    # Tokenize the *expanded* text. Tokenizing the raw ``text`` here would
    # silently throw away the contraction expansion done above.
    tokens = word_tokenize(expanded_text)

    # Lower-case and re-split on whitespace; ``split()`` with no argument
    # also collapses any stray newlines, tabs or repeated spaces.
    lowered_words = ' '.join(tokens).lower().split()

    stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned_tokens = []
    for word in lowered_words:
        # Stop words are matched before punctuation is stripped.
        if word in stop_words:
            continue
        word = word.translate(strip_punctuation)
        # Only keep words with more than two characters; this also discards
        # tokens that were pure punctuation and are now empty.
        if len(word) > 2:
            cleaned_tokens.append(word)

    return cleaned_tokens

2. 'words_correction(word_list)':

 INPUT: cleaned tokenized list of words  
OUTPUT: list of correctly spelled words

If a word is misspelled, its corrected version is added to the list of corrected words; otherwise the word is added to the list unchanged.

In [None]:
def words_correction(word_list):
    """Replace each word with SymSpell's first closest suggestion, if any.

    Args:
        word_list: iterable of word tokens.

    Returns:
        A new list of the same length in which every word is either the
        term of the first suggestion returned by ``sym_spell.lookup`` or,
        when no suggestion is returned, the original word.
    """
    def _first_suggestion_or_self(word):
        # Closest matches within an edit distance of 2.
        candidates = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        return candidates[0].term if candidates else word

    return [_first_suggestion_or_self(word) for word in word_list]

3. 'lemmatize_with_pos(word)':

 INPUT: a single (spelling-corrected) word  
OUTPUT: the word in its base form (lemma)

This function reduces a word to its base form using part-of-speech tagging. It determines the appropriate POS tag for the word using the 'get_wordnet_pos()' function, which maps the POS tag to the first character used by the WordNetLemmatizer. The helper 'lemmatize_text()' applies it to every word in a list.

In [None]:
# Lemmatize a single word, using its part-of-speech tag when one is available
def lemmatize_with_pos(word):
    """Return the lemma of ``word``.

    The WordNet POS value comes from ``get_wordnet_pos``; if that value is
    falsy the lemmatizer is called without a ``pos`` argument.
    """
    wordnet_tag = get_wordnet_pos(word)
    if not wordnet_tag:
        return lemmatizer.lemmatize(word)
    return lemmatizer.lemmatize(word, pos=wordnet_tag)

# Translate the tagger's POS tag for a word into a WordNet POS constant
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tag of ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the tag selects the constant: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to noun.
    return wordnet.NOUN

# Lemmatize every word of a token list
def lemmatize_text(text):
    """Return a list with ``lemmatize_with_pos`` applied to each item of ``text``."""
    return list(map(lemmatize_with_pos, text))

4. 'count_spelling_mistakes(essay)':

 INPUT: list of cleaned, tokenized words  
OUTPUT: list of words with spelling mistakes and the number of mistakes 

In [None]:
# Define a function to count the number of spelling mistakes in a given essay
def count_spelling_mistakes(essay):
    """Collect the tokens of an essay that look misspelled.

    A token is flagged only when it is not contained in the module-level
    ``spell`` checker AND WordNet returns no noun synset for it.

    Args:
        essay: iterable of word tokens.

    Returns:
        A tuple ``(mistakes, count)``: the flagged tokens in their original
        order (repeats are kept) and the length of that list.
    """
    # The original condition line ended with a stray backtick, which made
    # the whole definition a syntax error.
    mistakes = [
        word for word in essay
        if word not in spell and not wn.synsets(word, pos='n')
    ]
    return mistakes, len(mistakes)

5. 'count_pos_tags(tokens)':

 INPUT: list of words in base form (lemmatized words)  
OUTPUT: number of nouns, verbs, adverbs and adjectives in an essay

In [None]:
def count_pos_tags(tokens):
    """Count nouns, verbs, adjectives and adverbs among ``tokens``.

    The tokens are tagged with ``pos_tag``; each tag is bucketed by its
    first letter (N, V, J, R). Tags starting with any other letter are
    ignored.

    Returns:
        A dict with the keys 'noun', 'verb', 'adjective' and 'adverb'.
    """
    category_by_initial = {
        'N': 'noun',
        'V': 'verb',
        'J': 'adjective',
        'R': 'adverb',
    }
    counts = dict.fromkeys(category_by_initial.values(), 0)

    for _, tag in pos_tag(tokens):
        # tag[:1] is the first letter, or '' for an empty tag
        category = category_by_initial.get(tag[:1])
        if category is not None:
            counts[category] += 1

    return counts

6. 'assign_score_category(row)':

 INPUT: each row of target features  
OUTPUT: categorical label of low, high, medium

In [None]:
# Label a row as 'low', 'high' or 'medium' from its six score columns
def assign_score_category(row):
    """Return the score category of one row.

    'low' when every one of the six scores is <= 2.5, 'high' when every one
    is >= 4, and 'medium' in all other cases.
    """
    score_columns = ['cohesion', 'syntax', 'vocabulary',
                     'phraseology', 'grammar', 'conventions']
    scores = row[score_columns]

    if all(scores <= 2.5):
        return 'low'
    if all(scores >= 4):
        return 'high'
    return 'medium'

In [None]:
def list_to_string(lst):
    """Join the strings in ``lst`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(lst)

7. 'process_df(df_train, output_csv_file_path)':

 INPUT: data and location where processed data can be stored  
OUTPUT: processed data, written as a CSV file to the given location

In [None]:
def process_df(df_train, output_csv_file_path):
    """Run the cleaning / feature pipeline on the essays and save the result.

    The feature columns are added to ``df_train`` itself. The frame that is
    written to disk and returned is a separate object created when the
    helper 'pos_counts' column is dropped.

    Args:
        df_train: DataFrame with a 'full_text' column holding the raw
            essays and the six score columns read by
            ``assign_score_category``.
        output_csv_file_path: where the processed data is written as CSV
            (without the index).

    Returns:
        The processed DataFrame that was written to the CSV file. Its
        token-list columns are joined into space-separated strings.
    """
    # 1. clean and tokenize the raw essays
    df_train['cleaned_tokenize_text'] = df_train['full_text'].apply(text_cleaning)

    # 2. spelling-correct the cleaned tokens
    df_train['corrected_text'] = df_train['cleaned_tokenize_text'].apply(words_correction)

    # 3. lemmatize the corrected tokens
    df_train['lemmatized_text'] = df_train['corrected_text'].apply(lemmatize_text)

    # 4./5. sentence statistics -- split each essay into sentences once and
    # reuse the result for both the sentence count and the mean sentence length
    sentences = df_train['full_text'].apply(sent_tokenize)
    df_train['sent_count'] = sentences.apply(len)
    df_train['sent_len'] = sentences.apply(
        lambda sents: np.mean([len(sentence.split()) for sentence in sents])
    )

    # 6. misspelled words and how many there are, stored in two new columns
    df_train[['mistakes', 'num_mistakes']] = df_train['cleaned_tokenize_text'].apply(
        lambda x: pd.Series(count_spelling_mistakes(x))
    )

    # 7. POS counts per essay (a dict per row, expanded in step 8)
    df_train['pos_counts'] = df_train['lemmatized_text'].apply(count_pos_tags)

    # Whitespace-separated word count of the raw essay
    df_train['word_count'] = df_train['full_text'].apply(lambda x: len(x.split()))

    # 8. one column per POS tag count
    for pos_name in ('noun', 'verb', 'adjective', 'adverb'):
        df_train[pos_name + '_count'] = df_train['pos_counts'].apply(
            lambda counts, key=pos_name: counts[key]
        )

    # 9. categorical score label
    df_train['Score_Category'] = df_train.apply(assign_score_category, axis=1)

    # 10. drop the helper pos_counts column (this yields a new DataFrame)
    df_train = df_train.drop(['pos_counts'], axis=1)

    # Store the token lists as space-separated strings
    for column in ('cleaned_tokenize_text', 'corrected_text', 'lemmatized_text', 'mistakes'):
        df_train[column] = df_train[column].apply(list_to_string)

    # Write the processed data to a CSV file
    df_train.to_csv(output_csv_file_path, index=False)

    # Hand the processed data back so callers need not re-read the CSV
    return df_train

In [None]:
# Read the training essays, run the full pipeline and save the result as CSV
output_csv_file_path = 'processed-essay.csv'
df = pd.read_csv('train.csv')
process_df(df, output_csv_file_path)