In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path  # filesystem path handling
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import re
import spacy
from spacy.lang.en import English 

import matplotlib.pyplot as plt


%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, names in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summary-test/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv


In [2]:
# Read the training summaries CSV into the DataFrame `se`
file_path = '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'
se = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Keep only rows that have both target scores, and replace missing summary
# text with an empty string so text preprocessing always receives a str.
# The results are assigned back instead of calling `fillna(..., inplace=True)`
# on the `se['text']` selection: under pandas Copy-on-Write an in-place call on
# a selected column only modifies a temporary object and leaves `se` unchanged.
se = (
    se.dropna(subset=['wording', 'content'])
      .assign(text=lambda frame: frame['text'].fillna(''))
)

In [4]:
# Raw summary text column, one entry per row of `se`
texts = se.loc[:, 'text']

In [5]:
# spaCy's English stop-word collection; preprocess_text drops any token found in it
stop_words = English.Defaults.stop_words

In [6]:
def preprocess_text(text):
    """Normalize raw text into a lowercase, stop-word-free token string.

    The text is lowercased, every character that is not an ASCII letter or
    digit (punctuation, newlines, other symbols) is turned into a space, and
    only the purely alphabetic tokens that are not in the module-level
    ``stop_words`` collection are kept.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example the
            ``None``/``NaN`` pandas uses for a missing value) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when no
        token survives.
    """
    if not isinstance(text, str):
        return ''

    # A single substitution handles punctuation and newlines alike, so no
    # separate newline replacement is needed.
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # str.split() without arguments already collapses whitespace runs, so the
    # joined result never contains repeated or leading/trailing spaces.
    kept = [
        word for word in cleaned.split()
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(kept)


In [7]:
# Clean every summary, preserving the row order of `texts`
preprocessed_texts = list(map(preprocess_text, texts))

In [8]:
# Store the cleaned text next to the raw text, aligned on the existing row index
se['PreprocessedText'] = pd.Series(preprocessed_texts, index=se.index)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams (ngram_range=(1, 2)), with the
# vocabulary capped at 5000 features.
# NOTE(review): the vocabulary is fitted on every row of `se`, i.e. before the
# train/test split performed later, so it also sees the held-out texts.
vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(raw_documents=se['PreprocessedText'])

In [10]:
# Target matrix: column 0 holds 'wording', column 1 holds 'content'
y = se[['wording', 'content']].to_numpy()

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [12]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split; y_train has two columns, so the one model predicts both targets.
regressor = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predictions for the held-out rows, one column per target
y_pred = regressor.predict(X_test)

In [13]:
# RMSE per target on the held-out split (column 0 = wording, column 1 = content).
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError.
rmse_wording = np.sqrt(mean_squared_error(y_test[:, 0], y_pred[:, 0]))
rmse_content = np.sqrt(mean_squared_error(y_test[:, 1], y_pred[:, 1]))

print(f"RMSE for wording: {rmse_wording}")
print(f"RMSE for content: {rmse_content}")

RMSE for wording: 0.7448562440361134
RMSE for content: 0.6109346491748914


In [14]:
# 1. Load the competition test summaries
test_df = pd.read_csv(
    Path("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")
)

# 2. Preprocess and vectorize the test data.
# Missing text is filled with '' first, mirroring the training data, so
# preprocess_text always receives a str. The vectorizer is only applied here
# (transform, not fit), so the features match the ones the model was trained on.
test_df['PreprocessedText'] = test_df['text'].fillna('').apply(preprocess_text)
X_test_competition = vectorizer.transform(test_df['PreprocessedText'])

# 3. Predict using the trained model (column 0 = wording, column 1 = content)
predictions = regressor.predict(X_test_competition)

# 4. Create the submission DataFrame, limiting both scores to [-2, 5]
# NOTE(review): the bounds are presumably the plausible score range — confirm
submission_df = pd.DataFrame({
    'student_id': test_df['student_id'],
    'content': np.clip(predictions[:, 1], -2, 5),
    'wording': np.clip(predictions[:, 0], -2, 5),
})

print(submission_df.shape)

# 5. Save the submission as a CSV without the index column
submission_df.to_csv("submission.csv", index=False)

(4, 3)
