In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
# Load the summaries CSV (path is relative to the notebook's working directory).
train_csv_path = Path("./commonlit-evaluate-student-summaries/summaries_train.csv")
se = pd.read_csv(train_csv_path)

# Show the first rows as the cell's rich output.
se.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [3]:
# Keep a handle on the 'text' column; it is the input to preprocessing.
texts = se['text']

# Quick look at the first five entries.
print(texts.head(5))

0    The third wave was an experimentto see how peo...
1    They would rub it up with soda to make the sme...
2    In Egypt, there were many occupations and soci...
3    The highest class was Pharaohs these people we...
4    The Third Wave developed  rapidly because the ...
Name: text, dtype: object


In [4]:
def preprocess_text(text: str) -> str:
    """Normalize whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The whitespace-normalized string.
    """
    # str.split() with no arguments splits on any whitespace, newlines
    # included, and discards empty pieces, so no separate newline pass is needed.
    words = text.split()
    return ' '.join(words)

In [5]:
# Clean every entry of `texts`, keeping the original order.
preprocessed_texts = list(map(preprocess_text, texts))

In [6]:
# Store the cleaned strings in a new column beside the raw 'text' column.
# This mutates `se` in place.
se['PreprocessedText'] = preprocessed_texts

In [7]:
# Scratch check: fit a throwaway TF-IDF vectorizer on a single sample passage.
# The results get their own names (`sample_vectorizer`, `sample_X`) so that
# re-running this cell can never overwrite the `vectorizer` / `X` that the
# modelling cells build from the full data set.
test_texts = ["Farmers tended the fields, raised animals, kept canals and reservoirs in good order, worked in the stone quarries, and built the royal monuments. Farmers paid taxes that could amount to as much as 60% of their yearly harvest—that’s a lot of hay! Social mobility was not impossible. A small number of peasants and farmers moved up the economic ladder. Families saved money to send their sons to village schools to learn trades. These schools were run by priests or by artisans. Boys who learned to read and write could become scribes, then go on to gain employment in the government. It was possible for a boy born on a farm to work his way up into the higher ranks of the government. Bureaucracy proved lucrative."]

sample_vectorizer = TfidfVectorizer(max_features=5000, stop_words=None)

sample_X = sample_vectorizer.fit_transform(test_texts)

In [8]:
# Peek at the first five rows of the original 'text' column.
print(se['text'].iloc[:5])

0    The third wave was an experimentto see how peo...
1    They would rub it up with soda to make the sme...
2    In Egypt, there were many occupations and soci...
3    The highest class was Pharaohs these people we...
4    The Third Wave developed  rapidly because the ...
Name: text, dtype: object


In [9]:
# Turn the cleaned texts into TF-IDF features: vocabulary capped at 5000
# terms, no stop-word filtering.
vectorizer = TfidfVectorizer(
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary / IDF weights and vectorize in one step.
X = vectorizer.fit_transform(se['PreprocessedText'])

In [10]:
# How strongly do the two target columns move together?
content_scores = se['content']
wording_scores = se['wording']
correlation = content_scores.corr(wording_scores)

print('Correlation between content and wording: ', correlation)


Correlation between content and wording:  0.7513804859701986


In [11]:
# Two-column target array. The order matters downstream:
# column 0 is 'wording', column 1 is 'content'.
target_columns = ['wording', 'content']
y = se[target_columns].values

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): X comes from a vectorizer fitted on every row before this split,
# so the IDF weights have seen the held-out texts. Consider fitting on the
# training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Random forest settings: 100 trees, seeded for repeatable results.
forest_params = {"n_estimators": 100, "random_state": 42}

# Build and train in one expression (fit() returns the fitted estimator).
regressor = RandomForestRegressor(**forest_params).fit(X_train, y_train)

# Predict both target columns for the held-out rows.
y_pred = regressor.predict(X_test)

In [14]:
# Root-mean-squared error for each target. Column 0 of y is 'wording' and
# column 1 is 'content' (the order used when y was built).
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# it raises a TypeError.
rmse_wording = mean_squared_error(y_test[:, 0], y_pred[:, 0]) ** 0.5
rmse_content = mean_squared_error(y_test[:, 1], y_pred[:, 1]) ** 0.5

print(f"RMSE for wording: {rmse_wording}")
print(f"RMSE for content: {rmse_content}")

RMSE for wording: 0.7140610440104301
RMSE for content: 0.5889862654146798
