DATA PREPROCESSING

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import MinMaxScaler



In [4]:
import os
import nltk

# Make the local NLTK data directory discoverable without hard-coding a
# specific user's profile path. The NLTK_DATA environment variable wins when
# set; otherwise fall back to the per-user Windows location under the home dir.
nltk_data_dir = os.environ.get(
    "NLTK_DATA",
    os.path.join(os.path.expanduser("~"), "AppData", "Roaming", "nltk_data"),
)
# Guard the append so re-running this cell does not pile up duplicate entries.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)


In [6]:
# Read the tab-separated raw essay file and keep one row per essay_id.
raw_path = r"D:\dlproject\dlproj\data\raw\training_set_rel3.csv"
df = pd.read_csv(raw_path, sep='\t').drop_duplicates(subset=['essay_id'])

# Quick structural sanity check: column names, then (rows, columns).
print(df.columns)
print(df.shape)


Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6'],
      dtype='object')
(12976, 28)


In [7]:
def clean_text(text):
    """Normalise an essay to lowercase alphanumeric words separated by single spaces.

    Parameters
    ----------
    text : object
        Raw essay text. Non-string values are converted with ``str``.
        Missing values (``None``, ``NaN``, ``pd.NA``) produce an empty string
        rather than the literal words "none"/"nan".

    Returns
    -------
    str
        Lowercased text with every character other than ASCII letters, digits
        and whitespace removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped.
    """
    # `text != text` is only true for NaN; checked on floats so that ordinary
    # strings and other objects keep going through the str() conversion below.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()     # Remove extra whitespace
    return text

# Derive the normalised text column, one essay at a time.
df['clean_essay'] = df['essay'].map(clean_text)


In [8]:
def compute_features(text, raw_text=None):
    """Compute simple length statistics for one essay.

    Parameters
    ----------
    text : str
        Cleaned essay text; used for the word and character counts.
    raw_text : str, optional
        Original essay with its punctuation intact; used only for sentence
        splitting. When omitted (or missing), sentences are split on ``text``,
        which is the previous behaviour.

    Returns
    -------
    pandas.Series
        Six values in this order: word count, sentence count, character count
        of ``text`` (spaces included), mean token length, words per sentence,
        characters per word. Ratios are 0 when their denominator is 0.
    """
    words = word_tokenize(text)

    # Sentence boundaries depend on punctuation, which the cleaning step
    # removes; splitting the cleaned text would collapse every essay into a
    # single sentence. Prefer the raw text whenever it is available.
    raw_missing = raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text)
    sentence_source = text if raw_missing else str(raw_text)
    sentences = sent_tokenize(sentence_source)

    num_words = len(words)
    num_sentences = len(sentences)
    num_chars = len(text)

    avg_word_length = np.mean([len(w) for w in words]) if words else 0
    words_per_sentence = num_words / num_sentences if num_sentences else 0
    chars_per_word = num_chars / num_words if num_words else 0

    return pd.Series([num_words, num_sentences, num_chars, avg_word_length, words_per_sentence, chars_per_word])


feature_columns = ['essay_length', 'sentence_count', 'char_length', 'avg_word_length', 'words_per_sentence', 'chars_per_word']

# Pair each cleaned essay with its raw counterpart so sentence counts come from
# punctuated text; the raw 'essay' column is still present at this point.
feature_rows = [
    compute_features(clean, raw).tolist()
    for clean, raw in zip(df['clean_essay'], df['essay'])
]
# Reuse df's index so the new columns align row-for-row on assignment.
df[feature_columns] = pd.DataFrame(feature_rows, index=df.index, columns=feature_columns)


In [9]:
# Keep only essays that have a resolved domain-1 score. The explicit copy makes
# `df` an independent frame, so the column assignment below cannot raise
# pandas' SettingWithCopyWarning or write into a temporary slice.
df = df[df['domain1_score'].notnull()].copy()
# The target is the resolved score as-is; despite its name, `mean_score` is not
# an average of the individual rater columns.
df['mean_score'] = df['domain1_score']


In [10]:
# Columns not needed downstream: the raw essay text, the per-rater domain
# scores, the domain-2 score, and every per-rater trait score.
cols_to_drop = [
    'essay',
    'rater1_domain1', 'rater2_domain1', 'rater3_domain1',
    'rater1_domain2', 'rater2_domain2',
    'domain2_score',
    'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
    'rater1_trait4', 'rater1_trait5', 'rater1_trait6',
    'rater2_trait1', 'rater2_trait2', 'rater2_trait3',
    'rater2_trait4', 'rater2_trait5', 'rater2_trait6',
    'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
    'rater3_trait4', 'rater3_trait5', 'rater3_trait6',
]
# errors='ignore' skips any listed column that is absent from the frame.
df = df.drop(columns=cols_to_drop, errors='ignore')


In [11]:
# Rescale the engineered length features with a min-max scaler.
# NOTE(review): the scaler is fitted on every row of df; if a train/test split
# happens later, confirm that fitting on the full data is intended.
length_features = [
    'essay_length',
    'sentence_count',
    'char_length',
    'avg_word_length',
    'words_per_sentence',
    'chars_per_word',
]
scaler = MinMaxScaler()

scaled_values = scaler.fit(df[length_features]).transform(df[length_features])
df[length_features] = scaled_values


In [12]:
# Write the processed frame to the working directory without the index column.
output_path = "processed_essays.csv"
df.to_csv(output_path, index=False)
print("✅ Processed data saved to 'processed_essays.csv'")


✅ Processed data saved to 'processed_essays.csv'
