In [None]:
# Libraries for data manipulation and linear algebra
import numpy as np 
import pandas as pd 

# For file paths
from pathlib import Path  

# Tensorflow and Keras for Neural Network
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

# Scikit-learn libraries for preprocessing, model evaluation, and text vectorization
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import mean_squared_error

# Libraries for text processing
import re
import spacy
from spacy.lang.en import English 
from spacy.lang.en.stop_words import STOP_WORDS

# Visualization
import matplotlib.pyplot as plt

%matplotlib inline

# Display input files
import os

# Lazily build the full path of every file found under the Kaggle input
# directory (recursing into sub-directories), then print them one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)


Load and Preprocess Dataset

In [None]:
# Load the dataset
file_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'
se = pd.read_csv(file_path)

# Drop rows where either target ('wording' or 'content') is missing
se = se.dropna(subset=['wording', 'content'])

# Fill missing 'text' values with an empty string.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `se['text']` mutates a temporary selection, which is deprecated in
# pandas 2.x and leaves `se` unchanged when Copy-on-Write is enabled.
se['text'] = se['text'].fillna('')

# Text preprocessing function
def preprocess_text(text, stop_words=None):
    """Normalize a raw summary into a space-separated string of content words.

    Steps: lowercase the text, replace every character that is not an ASCII
    letter or digit (including newlines) with a space, then keep only the
    purely alphabetic tokens that are not stop words.

    Args:
        text: The raw text. Any non-string value (e.g. None or a NaN read
            from a CSV) is treated as missing text.
        stop_words: Optional collection of lowercase words to discard.
            Defaults to the module-level ``STOP_WORDS`` set.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when the
        input is missing or no token survives the filtering.
    """
    # Missing values arrive as float NaN / None; they have no text to clean.
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = STOP_WORDS

    # Newlines are non-alphanumeric, so this substitution also removes them.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # split() already collapses runs of whitespace, so the joined result
    # needs no further normalization.
    return ' '.join(
        word for word in text.split()
        if word.isalpha() and word not in stop_words
    )

# Clean every summary with preprocess_text and store the result in a new column
cleaned_texts = se['text'].apply(preprocess_text)
se['PreprocessedText'] = cleaned_texts


Vectorization of Text Data

In [None]:
# Binary bag-of-words features: each column flags whether a term occurs in the
# summary, with the vocabulary capped at 5000 terms.
vectorizer = CountVectorizer(max_features=5000, binary=True)
X = vectorizer.fit_transform(se['PreprocessedText'])

# Regression targets, one array per score
y_wording = se['wording'].values
y_content = se['content'].values

# Both targets are split with the same settings (80/20, fixed seed)
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_wording, X_test_wording, y_train_wording, y_test_wording = train_test_split(
    X, y_wording, **split_options
)
X_train_content, X_test_content, y_train_content, y_test_content = train_test_split(
    X, y_content, **split_options
)



Data Preparation for Neural Network

In [None]:
# Densify the sparse feature matrices
X_train_wording_dense, X_test_wording_dense = X_train_wording.toarray(), X_test_wording.toarray()
X_train_content_dense, X_test_content_dense = X_train_content.toarray(), X_test_content.toarray()

# Standardize the features. The scaler statistics are learned from training
# rows only and then applied to the matching test rows. A single scaler object
# is reused: it is refit on the content training data after the wording
# features have been transformed.
scaler = StandardScaler()

scaler.fit(X_train_wording_dense)
X_train_wording_scaled = scaler.transform(X_train_wording_dense)
X_test_wording_scaled = scaler.transform(X_test_wording_dense)

scaler.fit(X_train_content_dense)
X_train_content_scaled = scaler.transform(X_train_content_dense)
X_test_content_scaled = scaler.transform(X_test_content_dense)


Neural Network Model Building and Training

In [None]:
# Define model architecture for wording.
# Each hidden layer has half the units of the previous one (rounded up),
# starting from half the number of input features.
hidden_nodes_l1 = (X_train_wording_scaled.shape[1] + 1) // 2
hidden_nodes_l2 = (hidden_nodes_l1 + 1) // 2
hidden_nodes_l3 = (hidden_nodes_l2 + 1) // 2
hidden_nodes_l4 = (hidden_nodes_l3 + 1) // 2

model_wording = Sequential([
    Dense(hidden_nodes_l1, input_dim=X_train_wording_scaled.shape[1], activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l2, activation='tanh', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l3, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l4, activation='tanh', kernel_regularizer=l2(0.001)),
    # Linear output for regression: a sigmoid would confine predictions to
    # (0, 1), while the score is expected to span roughly [-2, 5] (the range
    # the submission step clips to), so most targets would be unreachable.
    Dense(1, activation='linear')
])

# Train with MSE loss; the last 20% of the training rows are held out for validation.
# The 'mse' metric is reported separately because the loss also includes the L2 penalties.
model_wording.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
model_wording.fit(X_train_wording_scaled, y_train_wording, epochs=50, batch_size=32, validation_split=0.2)


In [None]:
# Define model architecture for content.
# Each hidden layer has half the units of the previous one (rounded up),
# starting from half the number of input features.
hidden_nodes_l1 = (X_train_content_scaled.shape[1] + 1) // 2
hidden_nodes_l2 = (hidden_nodes_l1 + 1) // 2
hidden_nodes_l3 = (hidden_nodes_l2 + 1) // 2
hidden_nodes_l4 = (hidden_nodes_l3 + 1) // 2

model_content = Sequential([
    Dense(hidden_nodes_l1, input_dim=X_train_content_scaled.shape[1], activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l2, activation='tanh', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l3, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.25),
    Dense(hidden_nodes_l4, activation='tanh', kernel_regularizer=l2(0.001)),
    # Linear output for regression: a sigmoid would confine predictions to
    # (0, 1), while the score is expected to span roughly [-2, 5] (the range
    # the submission step clips to), so most targets would be unreachable.
    Dense(1, activation='linear')
])

# Train with MSE loss; the last 20% of the training rows are held out for validation.
# The 'mse' metric is reported separately because the loss also includes the L2 penalties.
model_content.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
model_content.fit(X_train_content_scaled, y_train_content, epochs=50, batch_size=32, validation_split=0.2)

Evaluation

In [None]:
# Predict on the held-out test split and report the RMSE of each model.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_wording = model_wording.predict(X_test_wording_scaled)
rmse_wording = np.sqrt(mean_squared_error(y_test_wording, y_pred_wording))
print(f"RMSE for wording: {rmse_wording}")

y_pred_content = model_content.predict(X_test_content_scaled)
rmse_content = np.sqrt(mean_squared_error(y_test_content, y_pred_content))
print(f"RMSE for content: {rmse_content}")


Prepare Submission for Competition

In [None]:
# Load test dataset for competition
test_df = pd.read_csv(Path("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"))

# Preprocess the test text exactly like the training text
# (missing text becomes an empty string before cleaning).
test_df['PreprocessedText'] = test_df['text'].fillna('').apply(preprocess_text)

# Vectorize with the vocabulary learned on the training data
X_test_competition = vectorizer.transform(test_df['PreprocessedText'])

# The models were trained on dense, standardized features, so the competition
# features must go through the same transformation before prediction.
# `scaler` was last fit on the content training split; the wording split uses
# the same feature matrix, test_size and random_state, so it contains the same
# rows and the scaler is valid for both models.
X_test_competition_scaled = scaler.transform(X_test_competition.toarray())

# Predict using the trained models
predictions_wording = model_wording.predict(X_test_competition_scaled)
predictions_content = model_content.predict(X_test_competition_scaled)

# Prepare submission DataFrame.
# ravel() flattens the (n, 1) predictions to shape (n,) and, unlike squeeze(),
# still yields a 1-D array when the test set has a single row.
submission_df = pd.DataFrame({
    'student_id': test_df['student_id'],
    'content': predictions_content.ravel(),
    'wording': predictions_wording.ravel()
})

# Clip values to be within expected bounds
submission_df['content'] = np.clip(submission_df['content'], -2, 5)
submission_df['wording'] = np.clip(submission_df['wording'], -2, 5)

# Save submission to a CSV
submission_df.to_csv("submission.csv", index=False)
