## PHASE-1 PRE-PROCESSING

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import string
import re
from spellchecker import SpellChecker
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud
from collections import Counter
from nltk import pos_tag
from nltk.corpus import words
import contractions
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Fetch the NLTK resources used by the functions below: the tagger model for
# pos_tag, WordNet data for the lemmatizer and synset lookups, the punkt
# tokenizer models, and the English stop-word list.
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt') 
nltk.download('stopwords')

# SymSpell configuration: maximum edit distance per dictionary precalculation
# and the prefix length, passed positionally to the SymSpell constructor below.
max_edit_distance_dictionary = 2
prefix_length = 7

# Shared SymSpell instance; words_correction() queries it via sym_spell.lookup().
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

# Load the English frequency dictionary that ships inside the symspellpy package
# (term read from column 0, count from column 1).
# NOTE(review): pkg_resources is deprecated in recent setuptools releases;
# consider importlib.resources if this starts emitting warnings or fails.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Shared WordNetLemmatizer instance used by lemmatize_with_pos().
lemmatizer = WordNetLemmatizer()

# Shared English SpellChecker used by count_spelling_mistakes() for membership tests.
spell = SpellChecker(language='en')

[nltk_data] Downloading package words to /Users/ananth/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ananth/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/ananth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ananth/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ananth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ananth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### A. DATA CLEANING

In [3]:
def text_cleaning(text):
    """Normalize a raw essay string into a list of cleaned tokens.

    Pipeline:
      1. Expand contractions word by word (e.g. "don't" -> "do not").
      2. Tokenize the expanded text with ``word_tokenize``.
      3. Lower-case and collapse newlines, tabs and repeated spaces.
      4. Drop English stop words, strip punctuation characters, and keep
         only words longer than two characters.

    Parameters
    ----------
    text : str
        Raw essay text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    # Expand contractions first so the tokenizer sees the full words.
    expanded_text = ' '.join(contractions.fix(word) for word in text.split())

    # Tokenize the *expanded* text. Previously the raw text was tokenized and
    # the expansion above was silently discarded.
    tokens = word_tokenize(expanded_text)

    # Re-join, lower-case, and normalize whitespace.
    text = ' '.join(tokens).lower()
    text = re.sub('\n|\r|\t', ' ', text)
    text = re.sub(' +', ' ', text).strip()

    # Remove stop words before stripping punctuation so that stop-word forms
    # containing apostrophes still match the stop-word list.
    stop_words = set(stopwords.words('english'))
    cleaned_text = ' '.join(word for word in text.split() if word not in stop_words)
    cleaned_text = ''.join(char for char in cleaned_text if char not in string.punctuation)

    # Keep only words longer than two characters.
    return [word for word in cleaned_text.split() if len(word) > 2]

In [4]:
def words_correction(word_list):
    """Return a copy of ``word_list`` with each word spell-corrected.

    Every word is looked up in the shared SymSpell dictionary (closest match,
    edit distance up to 2). The first suggestion replaces the word; a word
    with no suggestion is kept unchanged.
    """
    def _best_match(token):
        candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
        return candidates[0].term if candidates else token

    return [_best_match(token) for token in word_list]

In [5]:
# define a function to apply lemmatization with POS tagging to each word
def lemmatize_with_pos(word):
    """Lemmatize a single word using the POS reported by ``get_wordnet_pos``.

    When no POS is available the lemmatizer is called with its default.
    """
    wordnet_pos = get_wordnet_pos(word)
    if not wordnet_pos:
        return lemmatizer.lemmatize(word)
    return lemmatizer.lemmatize(word, pos=wordnet_pos)

# define a function to get the appropriate POS tag for a word
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective, noun, verb or adverb.
    Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # Unrecognized tag family: treat as noun.
    return wordnet.NOUN

# define a function to apply lemmatization to each word
def lemmatize_text(text):
    """Lemmatize a sequence of tokens using POS tags from the whole sequence.

    The token list is tagged in a single ``pos_tag`` call, so each word is
    tagged with its neighbours as context and the tagger is invoked once per
    essay rather than once per word. This matches how ``count_pos_tags`` tags
    tokens. Tags are mapped to WordNet POS constants by their first letter;
    unknown tags fall back to noun.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    tokens = list(text)
    if not tokens:
        return []

    tag_to_wordnet = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    return [
        lemmatizer.lemmatize(token, pos=tag_to_wordnet.get(tag[:1].upper(), wordnet.NOUN))
        for token, tag in pos_tag(tokens)
    ]

In [6]:
# Define a function to count the number of spelling mistakes in a given essay
def count_spelling_mistakes(essay):
    """Collect the words of ``essay`` that look misspelled.

    A word counts as a mistake when it is neither known to the spell checker
    nor has any WordNet noun synset. Returns a ``(mistakes, count)`` tuple.
    """
    unknown_words = [
        token
        for token in essay
        if not (token in spell or wn.synsets(token, pos='n'))
    ]
    return unknown_words, len(unknown_words)

In [7]:
def count_pos_tags(tokens):
    """Count nouns, verbs, adjectives and adverbs among ``tokens``.

    The tokens are tagged with ``pos_tag`` and grouped by the first letter of
    each tag (N, V, J, R). Tags from any other family are ignored.

    Returns a dict with the keys 'noun', 'verb', 'adjective' and 'adverb'.
    """
    family_by_initial = {'N': 'noun', 'V': 'verb', 'J': 'adjective', 'R': 'adverb'}
    totals = {'noun': 0, 'verb': 0, 'adjective': 0, 'adverb': 0}

    for _, tag in pos_tag(tokens):
        family = family_by_initial.get(tag[:1])
        if family is not None:
            totals[family] += 1

    return totals

In [10]:
#df_train = pd.read_csv('../data/train.csv')

def process_df(csv_file_path, output_csv_file_path):
    """Read essays from a CSV, derive text features, and write them to a new CSV.

    The input must contain a ``full_text`` column. The output keeps every
    input column and appends, in this order: ``cleaned_tokenize_text``,
    ``corrected_text``, ``lemmatized_text``, ``sent_count``, ``sent_len``,
    ``mistakes``, ``num_mistakes``, ``noun_count``, ``verb_count``,
    ``adjective_count`` and ``adverb_count``.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the input CSV.
    output_csv_file_path : str or path-like
        Where the processed CSV is written (without the index).
    """
    essays = pd.read_csv(csv_file_path)

    # 1-3. clean/tokenize -> spell-correct -> lemmatize
    essays['cleaned_tokenize_text'] = essays['full_text'].apply(text_cleaning)
    essays['corrected_text'] = essays['cleaned_tokenize_text'].apply(words_correction)
    essays['lemmatized_text'] = essays['corrected_text'].apply(lemmatize_text)

    # 4-5. Sentence statistics. Split each essay into sentences once and
    # derive both the sentence count and the mean words per sentence from it.
    sentences = essays['full_text'].apply(sent_tokenize)
    essays['sent_count'] = sentences.apply(len)
    # An essay with no sentences gets NaN explicitly, rather than relying on
    # np.mean([]) and its RuntimeWarning.
    essays['sent_len'] = sentences.apply(
        lambda sents: np.mean([len(s.split()) for s in sents]) if sents else np.nan
    )

    # 6. Spelling mistakes: the helper returns (mistakes, count); unpack the
    # tuple into two columns without building a pd.Series per row.
    mistake_results = essays['cleaned_tokenize_text'].apply(count_spelling_mistakes)
    essays['mistakes'] = mistake_results.apply(lambda result: result[0])
    essays['num_mistakes'] = mistake_results.apply(lambda result: result[1])

    # 7-8. POS counts on the lemmatized tokens, one column per tag family.
    # The intermediate dicts are never stored on the frame, so there is
    # nothing to drop afterwards.
    pos_counts = essays['lemmatized_text'].apply(count_pos_tags)
    for tag_name in ('noun', 'verb', 'adjective', 'adverb'):
        essays[tag_name + '_count'] = pos_counts.apply(
            lambda counts, key=tag_name: counts[key]
        )

    # Write the processed data to a CSV file
    essays.to_csv(output_csv_file_path, index=False)

In [None]:
# Run the full preprocessing pipeline and write the processed CSV.
# NOTE(review): csv_file_path and output_csv_file_path are not assigned in any
# visible cell; define them before running this cell, otherwise it raises NameError.
process_df(csv_file_path, output_csv_file_path)