In [1]:
# Libraries for data manipulation and linear algebra
import numpy as np 
import pandas as pd 

# For file paths
from pathlib import Path  

# Tensorflow and Keras for Neural Network
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

# Scikit-learn libraries for preprocessing, model evaluation, and text vectorization
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import LatentDirichletAllocation

# Libraries for text processing
import re
import spacy
from spacy.lang.en import English 
from spacy.lang.en.stop_words import STOP_WORDS

# Visualization
import matplotlib.pyplot as plt

%matplotlib inline

# Display input files
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv
/kaggle/input/summary-test/summaries_test.csv


Load and Preprocess Dataset

In [2]:
# Load the dataset
file_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'
se = pd.read_csv(file_path)

# Drop rows with missing 'wording' and 'content'
se.dropna(subset=['wording', 'content'], inplace=True)

# Fill missing 'text' values
se['text'].fillna('', inplace=True)

# Text preprocessing function
def preprocess_text(text):
    text = text.replace('\n', ' ').lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = ' '.join([word for word in text.split() if word.isalpha() and word not in STOP_WORDS])
    return ' '.join(text.split())

# Apply the preprocessing function to texts
se['PreprocessedText'] = se['text'].apply(preprocess_text)


Vectorization of Text Data

In [3]:
# Vectorize preprocessed texts
vectorizer = CountVectorizer(binary=True, max_features=5000)
count_data = vectorizer.fit_transform(se['PreprocessedText'])

lda = LatentDirichletAllocation(n_components=4)  # for 4 topics
X = lda.fit_transform(count_data)

# Split data into training and testing sets
y = se[['wording', 'content']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Data Preparation for Neural Network

In [4]:

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Neural Network Model Building and Training

In [5]:
# Define model architecture
hidden_nodes_l1 = (X_train_scaled.shape[1] + 1) // 2
hidden_nodes_l2 = (hidden_nodes_l1 + 1) // 2
hidden_nodes_l3 = (hidden_nodes_l2 + 1) // 2
hidden_nodes_l4 = (hidden_nodes_l3 + 1) // 2

model = Sequential([
    Dense(hidden_nodes_l1, input_dim=X_train_scaled.shape[1], activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l2, activation='tanh', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l3, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l4, activation='tanh', kernel_regularizer=l2(0.001)),
    # Output layer with two nodes for "wording" and "content"
    Dense(2, activation='linear')  
])

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7abddc0afaf0>

Evaluation

In [6]:
# Predict on test data
y_pred = model.predict(X_test_scaled)

rmse_wording = mean_squared_error(y_test[:, 0], y_pred[:, 0], squared=False)
rmse_content = mean_squared_error(y_test[:, 1], y_pred[:, 1], squared=False)

print(f"RMSE for wording: {rmse_wording}")
print(f"RMSE for content: {rmse_content}")


RMSE for wording: 0.9857846063069426
RMSE for content: 1.0472447835686411


Prepare Submission for Competition

In [7]:
# Load test dataset for competition
test_df = pd.read_csv(Path("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"))


# Preprocess and vectorize the test data
test_df['PreprocessedText'] = test_df['text'].apply(preprocess_text)
count_data_competition = vectorizer.transform(test_df['PreprocessedText'])

# Apply LDA transformation
X_test_competition = lda.transform(count_data_competition)

# Predict using the trained model
predictions = model.predict(X_test_competition)


# Prepare submission DataFrame
submission_df = pd.DataFrame({
    'student_id': test_df['student_id'],
    'content': predictions[:, 1],
    'wording': predictions[:, 0]
})

# Clip values to be within expected bounds
submission_df['content'] = np.clip(submission_df['content'], -2, 5)
submission_df['wording'] = np.clip(submission_df['wording'], -2, 5)

# Save submission to a CSV
submission_df.to_csv("submission.csv", index=False)


