In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd
import os
import re
from pathlib import Path 

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

import matplotlib.pyplot as plt

# Set visualization setting
%matplotlib inline

Load and Explore Data

In [2]:
# Show every file that Kaggle has mounted beneath the input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Read the training summaries into a DataFrame
file_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'
se = pd.read_csv(file_path)


/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv
/kaggle/input/summary-test/summaries_test.csv


Data Preprocessing

In [3]:
# Discard rows that lack a target score; both 'wording' and 'content' are required
se.dropna(subset=['wording', 'content'], inplace=True)
# Replace missing summaries with an empty string. The result is assigned back
# to the column because fillna(..., inplace=True) called on `se['text']` acts
# on a temporary Series: pandas 2.x warns about this chained form, and with
# Copy-on-Write enabled the DataFrame itself is left unchanged.
se['text'] = se['text'].fillna('')

# Text preprocessing utilities
stop_words = ENGLISH_STOP_WORDS

def preprocess_text(text):
    """Normalize a raw summary into a space-separated string of content words.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is turned into a space, and only purely alphabetic tokens that are
    not in ``stop_words`` are kept.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, the float ``NaN`` produced
        by a missing CSV cell, ...) is treated as empty text.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``''`` when none remain.
    """
    # Missing cells arrive as NaN/None; calling str methods on them would
    # raise AttributeError, so treat them as empty text instead.
    if not isinstance(text, str):
        return ''
    # Newlines and punctuation all fall under "non-alphanumeric" and become
    # spaces here, so split() below separates the words around them.
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Tokens containing digits are dropped entirely by isalpha(); joining the
    # survivors with ' ' already yields single-space-separated output.
    return ' '.join(
        word for word in cleaned.split()
        if word.isalpha() and word not in stop_words
    )

# Clean every training summary and store the result alongside the raw text
preprocessed_texts = list(map(preprocess_text, se['text']))
se['PreprocessedText'] = preprocessed_texts


Text Vectorization and Train-Test Split

In [4]:
# Encode each summary as a binary bag-of-words vector (term present / absent)
# over at most 5000 vocabulary terms. Note: the vocabulary is fitted on all
# rows, i.e. before the train/test split performed below.
vectorizer = CountVectorizer(max_features=5000, binary=True)
X = vectorizer.fit_transform(se['PreprocessedText'])
# Targets as a two-column array: column 0 is 'wording', column 1 is 'content'
y = se[['wording', 'content']].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Model Creation and Training

In [5]:
# Densify the sparse bag-of-words matrices and standardize every feature.
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()
X_train_scaled = scaler.fit_transform(X_train_dense)
X_test_scaled = scaler.transform(X_test_dense)

# One hidden ReLU layer followed by a linear head with
# two output nodes for "wording" and "content"
model = Sequential([
    Dense(116, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dense(2, activation='linear'),
])
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

# Fit for 50 epochs; 20% of the training rows are held out for validation
model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=50,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7c6cae9af730>

Model Evaluation

In [6]:
# Predict on the held-out test split
y_pred = model.predict(X_test_scaled)
# Per-target RMSE, computed as the square root of the MSE. The
# `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is avoided here to keep the cell
# working on both old and current releases.
rmse_wording = np.sqrt(mean_squared_error(y_test[:, 0], y_pred[:, 0]))
rmse_content = np.sqrt(mean_squared_error(y_test[:, 1], y_pred[:, 1]))

print(f"RMSE for wording: {rmse_wording}")
print(f"RMSE for content: {rmse_content}")


RMSE for wording: 0.8398471125149718
RMSE for content: 0.7191654818514979


Predict and Prepare Submission

In [7]:
# Load and preprocess competition test data
test_df = pd.read_csv(Path("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"))
# Missing summaries become empty strings so preprocess_text always receives a str
test_df['PreprocessedText'] = test_df['text'].fillna('').apply(preprocess_text)
X_test_competition = vectorizer.transform(test_df['PreprocessedText'])
# The network was trained on dense, standardized features, so the competition
# rows must pass through the same already-fitted scaler before prediction;
# feeding the raw sparse counts would give the model inputs on a different scale.
X_test_competition_scaled = scaler.transform(X_test_competition.toarray())

# Predict using the trained model
predictions = model.predict(X_test_competition_scaled)

# Create and save submission dataframe.
# Output column 0 is "wording" and column 1 is "content" (the order used for `y`).
submission_df = pd.DataFrame({
    'student_id': test_df['student_id'],
    'content': predictions[:, 1],
    'wording': predictions[:, 0]
})
# Limit both scores to the [-2, 5] interval before writing the file
submission_df['content'] = np.clip(submission_df['content'], -2, 5)
submission_df['wording'] = np.clip(submission_df['wording'], -2, 5)
submission_df.to_csv("submission.csv", index=False)


