# Using NLP on Educational Reform Policies to Predict Educational Outcome
### Model Testing

In [2]:
# Import Statements
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler


In [3]:
# Load preprocessed df
final_df = pd.read_csv('../Data/final_pisa_werd_merged.csv')

## Word Embeddings

#### TF-IDF

In [4]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(final_df['reform_description_clean'])

print(tfidf_features.shape)

(91, 12019)


#### Word2Vec

In [7]:
final_df['tokens'] = final_df['reform_description_clean'].apply(lambda x: x.split())

word2vec_model = gensim.models.Word2Vec(final_df['tokens'])

def document_vector(doc):
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if not doc:
        return np.zeros(100)
    return np.mean(word2vec_model.wv[doc], axis=0)

word2vec_features = np.vstack(final_df['tokens'].apply(document_vector))

print("Word2Vec features shape:", word2vec_features.shape)

Word2Vec features shape: (91, 100)


#### Doc2Vec

In [8]:
final_df['tokens'] = final_df['reform_description_clean'].apply(lambda x: x.split())

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(final_df['tokens'])]

doc2vec_model = Doc2Vec(documents)

doc2vec_features = np.array([doc2vec_model.infer_vector(doc.words) for doc in documents])

print("Doc2Vec features shape:", doc2vec_features.shape)

Doc2Vec features shape: (91, 100)


#### BERT - Base

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

bert_base_features = np.vstack(final_df['reform_description_clean'].apply(encode_text))

print("BERT features shape:", bert_base_features.shape)



BERT features shape: (91, 768)


#### BERT - Large

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')

def encode_text_large(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

bert_large_features = np.vstack(final_df['reform_description_clean'].apply(encode_text_large))

print("BERT features shape:", bert_large_features.shape)



BERT features shape: (91, 1024)


## Model Testing

#### TF-IDF Model Testing

In [11]:
X = tfidf_features
y = final_df['Mean_Last_PISA_Score'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Linear Regression TF-IDF

In [12]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_predictions = linear_model.predict(X_test)
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_mse = mean_squared_error(y_test, linear_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_r2 = r2_score(y_test, linear_predictions)

print("Linear Regression")
print(f'Mean Absolute Error: {linear_mae}')
print(f'Mean Squared Error: {linear_mse}')
print(f'Root Mean Squared Error: {linear_rmse}')
print(f'R² Score: {linear_r2}')

Linear Regression
Mean Absolute Error: 37.637880772741376
Mean Squared Error: 2153.7364893378995
Root Mean Squared Error: 46.40836658769515
R² Score: 0.5004186669802189


#### Random Forest TF-IDF

In [13]:
model = RandomForestRegressor(n_estimators=150, random_state=42, n_jobs = -1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Random Forest Regression")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Random Forest Regression
Mean Absolute Error: 41.121871345029184
Mean Squared Error: 2519.2868719947965
Root Mean Squared Error: 50.19249816451455
R² Score: 0.41562549550464734


#### XGBoost TF-IDF

In [14]:
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("XGBoost Regression")
print(f'Mean Absolute Error: {xgb_mae}')
print(f'Mean Squared Error: {xgb_mse}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'R² Score: {xgb_r2}')

XGBoost Regression
Mean Absolute Error: 45.3185028611568
Mean Squared Error: 3064.095806459063
Root Mean Squared Error: 55.354275412645976
R² Score: 0.2892514590019658


#### SVR TF-IDF

In [15]:
svm_pipeline = make_pipeline(MaxAbsScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2))

svm_pipeline.fit(X_train, y_train)

svm_predictions = svm_pipeline.predict(X_test)

svm_mae = mean_absolute_error(y_test, svm_predictions)
svm_mse = mean_squared_error(y_test, svm_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_r2 = r2_score(y_test, svm_predictions)

print("SVM Regression")
print(f'Mean Absolute Error: {svm_mae}')
print(f'Mean Squared Error: {svm_mse}')
print(f'Root Mean Squared Error: {svm_rmse}')
print(f'R² Score: {svm_r2}')

SVM Regression
Mean Absolute Error: 60.831340888540595
Mean Squared Error: 4295.912556668858
Root Mean Squared Error: 65.54321136981967
R² Score: 0.0035188927607316955


### Word2Vec Model Testing

In [16]:
X = word2vec_features
y = final_df['Mean_Last_PISA_Score'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### Word2Vec Random Forest

In [17]:
model = RandomForestRegressor(n_estimators=150, random_state=42, n_jobs= -1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Random Forest Regression")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Random Forest Regression
Mean Absolute Error: 34.797777777777725
Mean Squared Error: 1649.6342094866716
Root Mean Squared Error: 40.61568920363991
R² Score: 0.4547184738577771


#### Word2Vec XGBoost

In [18]:
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("XGBoost Regression")
print(f'Mean Absolute Error: {xgb_mae}')
print(f'Mean Squared Error: {xgb_mse}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'R² Score: {xgb_r2}')

XGBoost Regression
Mean Absolute Error: 33.75865521347313
Mean Squared Error: 1539.7532826125705
Root Mean Squared Error: 39.239690144196736
R² Score: 0.4910392770729798


#### Word2Vec SVR

In [19]:
svm_pipeline = make_pipeline(MaxAbsScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2))

svm_pipeline.fit(X_train, y_train)

svm_predictions = svm_pipeline.predict(X_test)

svm_mae = mean_absolute_error(y_test, svm_predictions)
svm_mse = mean_squared_error(y_test, svm_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_r2 = r2_score(y_test, svm_predictions)

print("SVM Regression")
print(f'Mean Absolute Error: {svm_mae}')
print(f'Mean Squared Error: {svm_mse}')
print(f'Root Mean Squared Error: {svm_rmse}')
print(f'R² Score: {svm_r2}')

SVM Regression
Mean Absolute Error: 48.77801783203722
Mean Squared Error: 2874.3100559791174
Root Mean Squared Error: 53.61259232660847
R² Score: 0.0499056306440554


### Doc2Vec Model Testing

In [20]:
X = doc2vec_features
y = final_df['Mean_Last_PISA_Score'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### Doc2Vec Random Forest

In [21]:
model = RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Random Forest Regression")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Random Forest Regression
Mean Absolute Error: 32.95999999999999
Mean Squared Error: 1553.0002567901245
Root Mean Squared Error: 39.40812424856231
R² Score: 0.4866605304061348


#### Doc2Vec XGBoost

In [22]:
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("XGBoost Regression")
print(f'Mean Absolute Error: {xgb_mae}')
print(f'Mean Squared Error: {xgb_mse}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'R² Score: {xgb_r2}')

XGBoost Regression
Mean Absolute Error: 38.585210967482176
Mean Squared Error: 2231.418471529856
Root Mean Squared Error: 47.23789232734517
R² Score: 0.26241147120950437


#### Doc2Vec SVR

In [23]:
svm_pipeline = make_pipeline(MaxAbsScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2))

svm_pipeline.fit(X_train, y_train)

svm_predictions = svm_pipeline.predict(X_test)

svm_mae = mean_absolute_error(y_test, svm_predictions)
svm_mse = mean_squared_error(y_test, svm_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_r2 = r2_score(y_test, svm_predictions)

print("SVM Regression")
print(f'Mean Absolute Error: {svm_mae}')
print(f'Mean Squared Error: {svm_mse}')
print(f'Root Mean Squared Error: {svm_rmse}')
print(f'R² Score: {svm_r2}')

SVM Regression
Mean Absolute Error: 48.029727667917825
Mean Squared Error: 2784.104226480761
Root Mean Squared Error: 52.76461149748722
R² Score: 0.07972289079356143


### BERT - Base Model Testing

In [24]:
X = bert_base_features
y = final_df['Mean_Last_PISA_Score'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### BERT - Base Random Forest

In [25]:
model = RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Random Forest Regression")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Random Forest Regression
Mean Absolute Error: 44.1190643274854
Mean Squared Error: 2533.906187914234
Root Mean Squared Error: 50.33791998001342
R² Score: 0.16242508472405714


#### BERT - Base XGBoost

In [26]:
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("XGBoost Regression")
print(f'Mean Absolute Error: {xgb_mae}')
print(f'Mean Squared Error: {xgb_mse}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'R² Score: {xgb_r2}')

XGBoost Regression
Mean Absolute Error: 52.099373800712726
Mean Squared Error: 3612.801339904587
Root Mean Squared Error: 60.10658316611074
R² Score: -0.19420039724131222


#### BERT - Base SVR

In [27]:
svm_pipeline = make_pipeline(MaxAbsScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2))

svm_pipeline.fit(X_train, y_train)

svm_predictions = svm_pipeline.predict(X_test)

svm_mae = mean_absolute_error(y_test, svm_predictions)
svm_mse = mean_squared_error(y_test, svm_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_r2 = r2_score(y_test, svm_predictions)

print("SVM Regression")
print(f'Mean Absolute Error: {svm_mae}')
print(f'Mean Squared Error: {svm_mse}')
print(f'Root Mean Squared Error: {svm_rmse}')
print(f'R² Score: {svm_r2}')

SVM Regression
Mean Absolute Error: 50.050225266155856
Mean Squared Error: 2996.630965942066
Root Mean Squared Error: 54.74149217862138
R² Score: 0.009472829190176002


### BERT - Large Model Testing

In [28]:
X = bert_large_features
y = final_df['Mean_Last_PISA_Score'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### BERT - Large Random Forest

In [29]:
model = RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Random Forest Regression")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Random Forest Regression
Mean Absolute Error: 37.760116959064376
Mean Squared Error: 1912.2168647173523
Root Mean Squared Error: 43.72890193816159
R² Score: 0.3679225828903986


#### BERT - Large XGBoost

In [30]:
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("XGBoost Regression")
print(f'Mean Absolute Error: {xgb_mae}')
print(f'Mean Squared Error: {xgb_mse}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'R² Score: {xgb_r2}')

XGBoost Regression
Mean Absolute Error: 39.299570719401046
Mean Squared Error: 2070.0637933716166
Root Mean Squared Error: 45.49795372730093
R² Score: 0.3157467649675616


#### BERT - Large SVR

In [31]:
svm_pipeline = make_pipeline(MaxAbsScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2))

svm_pipeline.fit(X_train, y_train)

svm_predictions = svm_pipeline.predict(X_test)

svm_mae = mean_absolute_error(y_test, svm_predictions)
svm_mse = mean_squared_error(y_test, svm_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_r2 = r2_score(y_test, svm_predictions)

print("SVM Regression")
print(f'Mean Absolute Error: {svm_mae}')
print(f'Mean Squared Error: {svm_mse}')
print(f'Root Mean Squared Error: {svm_rmse}')
print(f'R² Score: {svm_r2}')

SVM Regression
Mean Absolute Error: 50.26318447762949
Mean Squared Error: 3022.6047295165845
Root Mean Squared Error: 54.97822050154574
R² Score: 0.000887281339553958
