In [1]:
import numpy as np 
import pandas as pd 
from pathlib import Path  
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import re
import spacy
from spacy.lang.en import English 
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential



Data Loading:

In [2]:
# Location of the competition's training table inside the Kaggle input mount.
file_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'

# Read the whole training CSV into a DataFrame; later cells refer to it as `se`.
se = pd.read_csv(filepath_or_buffer=file_path)


Preprocessing:

In [3]:
# Rows lacking either target score cannot be used for supervised training.
se.dropna(subset=['wording', 'content'], inplace=True)

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the `se['text']` selection is a chained in-place update: recent pandas
# versions warn about it, and under Copy-on-Write it leaves `se` unchanged.
se['text'] = se['text'].fillna('')

# Preprocess text function
def preprocess_text(text, stopwords=None):
    """Normalise one raw summary into a space-separated string of kept words.

    Steps: replace newlines with spaces, lowercase, replace every character
    outside ``[a-zA-Z0-9]`` with a space, then keep only purely alphabetic
    tokens that are not stop words.

    Parameters
    ----------
    text : str or missing value
        Raw text. Any non-string input (e.g. ``None`` or a float NaN coming
        from an unfilled DataFrame column) is treated as empty text.
    stopwords : collection of str, optional
        Words to drop. When omitted, the module-level ``stop_words`` set is
        used, which preserves the behaviour of existing one-argument calls.

    Returns
    -------
    str
        Kept tokens joined by single spaces; ``''`` when nothing is kept.
    """
    # Missing values are not strings and have no .replace(); return an empty
    # document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    if stopwords is None:
        # Fall back to the notebook-level stop-word set (resolved at call time).
        stopwords = stop_words

    text = text.replace('\n', ' ').lower()  # replace newlines and convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # remove punctuation
    # Tokens that contain digits fail isalpha() and are dropped along with stop words.
    kept_words = [word for word in text.split() if word.isalpha() and word not in stopwords]
    # Joining split() output already yields single-space separation.
    return ' '.join(kept_words)

# spaCy's English stop-word set; preprocess_text looks this name up when called.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Clean every summary in row order and store the result next to the raw text.
preprocessed_texts = list(map(preprocess_text, se['text']))
se['PreprocessedText'] = preprocessed_texts


Tokenization and Vectorization using Word2Vec:

In [4]:
# One token list per summary, in the same row order as `se`.
sentences = [document.split() for document in se['PreprocessedText']]

# Train word embeddings on the cleaned summaries.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Set of every word that received an embedding, for fast membership tests.
vocabulary = set(w2v_model.wv.index_to_key)


Feature Creation: averaged Word2Vec vectors (note: `features` is overwritten by the Doc2Vec vectors in cell In [7])

In [5]:
def average_word_vectors(words, w2v_model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of the document.
    w2v_model : object
        Model whose ``wv[word]`` lookup returns the embedding of ``word``.
    vocabulary : container of str
        Words for which an embedding is available.
    num_features : int
        Length of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when no token of
        ``words`` is found in ``vocabulary``.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    matched = 0.
    for token in words:
        # Tokens without an embedding contribute neither to the sum nor the count.
        if token not in vocabulary:
            continue
        matched = matched + 1.
        running_sum = np.add(running_sum, w2v_model.wv[token])

    # No matches: return the zero vector as-is rather than dividing by zero.
    if not matched:
        return running_sum
    return np.divide(running_sum, matched)

# One averaged 100-dimensional Word2Vec vector per summary.
features = [
    average_word_vectors(
        words=tokens,
        w2v_model=w2v_model,
        vocabulary=vocabulary,
        num_features=100,
    )
    for tokens in sentences
]


In [6]:
# Wrap each tokenised summary in a TaggedDocument tagged with its position
# in `se` (as a string).
tagged_data = [
    TaggedDocument(words=document.split(), tags=[str(position)])
    for position, document in enumerate(se['PreprocessedText'])
]

# Build the vocabulary first, then train for the configured number of epochs.
model_d2v = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=100,
)
model_d2v.build_vocab(tagged_data)
model_d2v.train(
    tagged_data,
    total_examples=model_d2v.corpus_count,
    epochs=model_d2v.epochs,
)

In [7]:
# Infer one Doc2Vec vector per summary; this replaces any earlier `features`.
features = [
    model_d2v.infer_vector(document.split())
    for document in se['PreprocessedText']
]

Split Data:

In [8]:
# Target matrix: column 0 is "wording", column 1 is "content".
y = se[['wording', 'content']].to_numpy()

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)




Model Creation and Training:

In [9]:
# Convert the lists of vectors into 2-D arrays.
X_train_dense = np.array(X_train)
X_test_dense = np.array(X_test)

# Standardise features: statistics come from the training split only and are
# re-applied unchanged to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_dense)
X_test_scaled = scaler.transform(X_test_dense)

# One hidden ReLU layer followed by a two-unit linear head
# (output 0 -> "wording", output 1 -> "content").
model = Sequential()
hidden_layer = Dense(116, input_dim=X_train_scaled.shape[1], activation='relu')
output_layer = Dense(2, activation='linear')
for layer in (hidden_layer, output_layer):
    model.add(layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Fit, keeping 20% of the training rows aside for per-epoch validation.
model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7d7cf495ec20>

Evaluation:

In [10]:
# Predict on the held-out split (features were standardised with `scaler`).
y_pred = model.predict(X_test_scaled)

# RMSE is computed as sqrt(MSE) explicitly. The `squared=False` flag of
# mean_squared_error is deprecated and absent from recent scikit-learn
# releases, whereas this form works on every version.
# Column 0 holds "wording", column 1 holds "content".
rmse_wording = np.sqrt(mean_squared_error(y_test[:, 0], y_pred[:, 0]))
rmse_content = np.sqrt(mean_squared_error(y_test[:, 1], y_pred[:, 1]))

print(f"RMSE for wording: {rmse_wording}")
print(f"RMSE for content: {rmse_content}")


RMSE for wording: 0.9974470652811114
RMSE for content: 0.8361382326234118


Competition Test Set Prediction:

In [11]:
test_df = pd.read_csv(Path("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"))

# Apply the same text pipeline as for training. Missing texts are replaced by
# empty strings first, mirroring the fillna('') done on the training frame.
test_df['PreprocessedText'] = test_df['text'].fillna('').apply(preprocess_text)
test_sentences = test_df['PreprocessedText'].apply(lambda x: x.split()).tolist()

# Infer one Doc2Vec vector per test summary (reusing the token lists above).
X_test_competition = np.array([model_d2v.infer_vector(tokens) for tokens in test_sentences])

# The network was fitted on standardised features, so the competition features
# must go through the already-fitted scaler before prediction. Feeding raw
# vectors would put the inputs on a different scale than the one seen in training.
X_test_competition_scaled = scaler.transform(X_test_competition)

# Predict using the trained model
predictions = model.predict(X_test_competition_scaled)





Submission Creation:

In [12]:
# Assemble the submission table; model output column 0 is "wording" and
# column 1 is "content".
submission_df = pd.DataFrame({
    'student_id': test_df['student_id'],
    'content': predictions[:, 1],
    'wording': predictions[:, 0]
})

# Limit both score columns to the interval [-2, 5].
for score_column in ('content', 'wording'):
    submission_df[score_column] = np.clip(submission_df[score_column], -2, 5)

# Write the file without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)
