In [1]:
import numpy as np 
import pandas as pd 
from pathlib import Path  
import tensorflow as tf
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import re

from spacy.lang.en import English 
import matplotlib.pyplot as plt
from gensim.models import Word2Vec


Data Loading:

In [2]:
# Location of the training summaries CSV inside the Kaggle input directory.
file_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'

# Read the whole training table into memory; later cells use its
# 'text', 'wording' and 'content' columns.
se = pd.read_csv(filepath_or_buffer=file_path)


Preprocessing:

In [3]:
# Rows without either target score cannot be used for supervised training.
se.dropna(subset=['wording', 'content'], inplace=True)

# Treat a missing summary as an empty string.  Assign the result back instead
# of calling `fillna(..., inplace=True)` on the column selection: that chained
# in-place form operates on a temporary object, triggers a warning in recent
# pandas releases and does not modify `se` at all under Copy-on-Write.
se['text'] = se['text'].fillna('')

def preprocess_text(text, stop_word_set=None):
    """Normalise a raw summary into a space-separated string of content words.

    The text is lowercased, every character other than an ASCII letter or
    digit is replaced by a space (this covers punctuation and newlines), and
    only purely alphabetic tokens that are not stop words are kept.

    Args:
        text: Raw text.  Any non-string value (e.g. None or a NaN from a
            missing CSV cell) is treated as empty text instead of raising.
        stop_word_set: Optional container of words to drop.  When omitted,
            the module-level ``stop_words`` is used, as before.

    Returns:
        str: The kept tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Missing values have no `.lower()`; they simply contribute no words.
        return ''
    if stop_word_set is None:
        stop_word_set = stop_words

    # Newlines are non-alphanumeric, so the substitution also flattens them.
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # `isalpha()` discards tokens containing digits (e.g. "abc123", "42").
    # Joining the filtered tokens already yields single-space separation.
    return ' '.join(
        word for word in cleaned.split()
        if word.isalpha() and word not in stop_word_set
    )

# Stop-word list consulted by preprocess_text (scikit-learn's English list).
stop_words = ENGLISH_STOP_WORDS

# Clean every summary once and keep the result next to the raw text.
preprocessed_texts = list(map(preprocess_text, se['text']))
se['PreprocessedText'] = preprocessed_texts


Tokenization and Vectorization using Word2Vec:

In [4]:
# One token list per summary: the cleaned text is already space-separated.
sentences = [cleaned.split() for cleaned in se['PreprocessedText']]

# Train word embeddings on the summaries themselves.  min_count=1 keeps every
# token that occurs at least once in the vocabulary.
# NOTE(review): with workers > 1 gensim training is presumably not exactly
# reproducible from run to run - confirm whether that matters here.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Set of known words for fast membership tests when averaging vectors.
vocabulary = set(model.wv.index_to_key)


Feature Creation:

In [5]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary words of one document.

    Args:
        words: Iterable of tokens.
        model: Object exposing word vectors through ``model.wv[word]``.
        vocabulary: Container of words for which a vector is available.
        num_features: Length of the zero vector used as the starting sum.

    Returns:
        numpy.ndarray: float64 average of the vectors of all tokens found in
        ``vocabulary`` (repeated tokens count each time); the all-zero vector
        when none of the tokens is known.
    """
    known_tokens = [token for token in words if token in vocabulary]

    # Accumulate in float64 regardless of the dtype of the stored vectors.
    total = np.zeros((num_features,), dtype="float64")
    for token in known_tokens:
        total = total + model.wv[token]

    if not known_tokens:
        return total
    return total / len(known_tokens)

# One averaged embedding per training summary.  The vector length is read from
# the trained model rather than repeated as a literal, so it cannot drift out
# of sync with the `vector_size` used to train Word2Vec.
features = [
    average_word_vectors(tokens, model, vocabulary, model.wv.vector_size)
    for tokens in sentences
]


Model Training:

In [6]:
# Two regression targets per summary; column 0 is 'wording', column 1 is 'content'.
y = se[['wording', 'content']].to_numpy()

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

# A single random forest predicts both targets at once.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)


Evaluation:

In [7]:
# Predict both targets for the held-out rows (column 0: wording, 1: content).
y_pred = regressor.predict(X_test)

# RMSE per target.  The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse_wording = np.sqrt(mean_squared_error(y_test[:, 0], y_pred[:, 0]))
rmse_content = np.sqrt(mean_squared_error(y_test[:, 1], y_pred[:, 1]))
print(f"RMSE for wording: {rmse_wording}")
print(f"RMSE for content: {rmse_content}")


RMSE for wording: 0.8203074912382924
RMSE for content: 0.7703621494024757


Competition Test Set Prediction:

In [8]:
test_df = pd.read_csv(Path("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"))

# Mirror the training pipeline: a missing summary becomes an empty string
# before cleaning, otherwise a NaN cell would reach the text preprocessing.
test_df['PreprocessedText'] = test_df['text'].fillna('').apply(preprocess_text)
test_sentences = test_df['PreprocessedText'].apply(lambda x: x.split()).tolist()

# Embed with the Word2Vec model fitted on the training summaries; words it has
# never seen are skipped by average_word_vectors.  The vector length comes from
# the model so it always matches the features the regressor was trained on.
X_test_competition = [
    average_word_vectors(tokens, model, vocabulary, model.wv.vector_size)
    for tokens in test_sentences
]
predictions = regressor.predict(X_test_competition)


Submission Creation:

In [9]:
# Assemble the submission table.  The regressor was trained with targets
# ordered ['wording', 'content'], hence column 1 -> content, column 0 -> wording.
submission_df = pd.DataFrame({
    'student_id': test_df['student_id'],
    'content': predictions[:, 1],
    'wording': predictions[:, 0],
})

# Bound both scores to the interval [-2, 5] before writing the file.
for score_column in ('content', 'wording'):
    submission_df[score_column] = submission_df[score_column].clip(lower=-2, upper=5)

submission_df.to_csv("submission.csv", index=False)
