In this notebook, we will prepare data for training a first-level XGBoost model and save the files into a separate folder. The preparation will include generating text features and then training FastText discourse embeddings and TF-IDF + UMAP essay embeddings on the training data.

We will then generate the same features for the holdout and test sets using the trained FastText and TF-IDF + UMAP models.

The resulting datasets will be saved and subsequently loaded in a separate notebook for training the first-level XGBoost model.

In [None]:
!pip install patool umap-learn

Collecting patool
  Downloading patool-3.0.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading patool-3.0.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patool, pynndescent, umap-learn
Successfully installed patool-3.0.0 pynndescent-0.5.13 umap-learn-0.5.6


In [None]:
# Mount Google Drive at /content/gdrive (Colab-specific helper); the project files are read from there
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import patoolib
import json
import pickle
import os
import joblib

import pandas as pd
import numpy as np

import re

import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

import spacy

from gensim.models import FastText

from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from scipy.sparse import hstack

from functools import partial
from tqdm import tqdm
from tqdm import trange
import string
from collections import Counter

import umap

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Fetch the NLTK resources used later in this notebook.
# Tokenizer models behind word_tokenize
nltk.download('punkt')
# English stop-word list read via stopwords.words('english')
nltk.download('stopwords')
# WordNet data for WordNetLemmatizer
nltk.download('wordnet')
# Presumably needed alongside WordNet by the lemmatizer in this NLTK version — TODO confirm
nltk.download('omw-1.4')
# Tagger model behind nltk.pos_tag
nltk.download('averaged_perceptron_tagger')

In [None]:
# tqdm pre-configured to draw at position 0 and keep the finished bar on screen.
# NOTE(review): not referenced anywhere else in the visible part of the notebook.
tqdm_iterator = partial(tqdm, position = 0, leave = True)

In [None]:
# Register tqdm with pandas so that Series.progress_apply (used below) is available
tqdm.pandas()

In [None]:
# Create the extraction folder in the current working directory; exist_ok keeps the cell
# re-runnable (a plain `mkdir` errors once the directory already exists)
os.makedirs('data2022', exist_ok = True)

In [None]:
# Project root on the mounted Google Drive; every other path below is built by appending to it
BASIC_PATH = '/content/gdrive/MyDrive/ML/projects/feedback-prize/'
# Sub-folder (relative to BASIC_PATH) for trained first-level models
MODEL_PATH = '1st_level_models/'
# Sub-folder (relative to BASIC_PATH) for the transformed datasets produced by this notebook
SAVE_TRANSFORMED_DATASETS = '1st_level_transformed_data/'

In [None]:
# Unpack the competition archive from Drive into the local /content/data2022 folder
patoolib.extract_archive(BASIC_PATH+'data/feedback-prize-effectiveness.zip', outdir = '/content/data2022')

INFO patool: Extracting /content/gdrive/MyDrive/ML/projects/feedback-prize/feedback-prize-effectiveness.zip ...
INFO:patool:Extracting /content/gdrive/MyDrive/ML/projects/feedback-prize/feedback-prize-effectiveness.zip ...
INFO patool: running /usr/bin/7z x -o/content/data2022 -- /content/gdrive/MyDrive/ML/projects/feedback-prize/feedback-prize-effectiveness.zip
INFO:patool:running /usr/bin/7z x -o/content/data2022 -- /content/gdrive/MyDrive/ML/projects/feedback-prize/feedback-prize-effectiveness.zip
INFO patool:     with input=''
INFO:patool:    with input=''
INFO patool: ... /content/gdrive/MyDrive/ML/projects/feedback-prize/feedback-prize-effectiveness.zip extracted to `/content/data2022'.
INFO:patool:... /content/gdrive/MyDrive/ML/projects/feedback-prize/feedback-prize-effectiveness.zip extracted to `/content/data2022'.


'/content/data2022'

In [None]:
# Folder the archive was extracted into
input_dir = '/content/data2022'

# train.csv holds one row per discourse element
train_csv = os.path.join(input_dir, 'train.csv')

# Loaded with pandas' default 0..n-1 index
data_2022 = pd.read_csv(train_csv)

In [None]:
data_2022.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [None]:
# Distinct effectiveness labels in alphabetical order, so the label -> integer id mapping is stable
class_names = sorted(set(data_2022['discourse_effectiveness']))
label_to_id = dict(zip(class_names, range(len(class_names))))

In [None]:
label_to_id

{'Adequate': 0, 'Effective': 1, 'Ineffective': 2}

In [None]:
# Encode the effectiveness label as its integer class id.
# Series.map is the direct dict lookup for this; the previous Series.replace call on a string
# column made pandas emit a warning (presumably the downcasting deprecation). label_to_id is
# built from this very column, so every value has a mapping and the result is an integer column.
data_2022['target'] = data_2022['discourse_effectiveness'].map(label_to_id)

  data_2022['target'] = data_2022['discourse_effectiveness'].replace(label_to_id)


Prepare discourse texts for FastText by cleaning them to ensure they are suitable for training.

In [None]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw discourse text into a list of lower-cased, stop-word-free tokens.

    Steps: lower-case, collapse whitespace runs to one space, delete every
    character that is neither a word character nor whitespace, tokenize with
    NLTK's ``word_tokenize`` and drop tokens found in ``stop_words``.

    Punctuation (including apostrophes) is removed before the stop-word
    filter runs, so a contraction such as "i'm" becomes the token "im".

    Args:
        text: Raw text string.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    normalized = re.sub(r'\s+', ' ', text.lower())
    normalized = re.sub(r'[^\w\s]', '', normalized)
    return [candidate for candidate in word_tokenize(normalized) if candidate not in stop_words]

In [None]:
data_2022['clean_discourse'] = data_2022['discourse_text'].progress_apply(preprocess_text)

100%|██████████| 36765/36765 [00:13<00:00, 2660.45it/s]


Load and clean the essay texts to prepare them for further processing.

In [None]:
def clean_text(text):
    """Normalise an essay into a single line of plain text.

    Escaped apostrophes (backslash + ') become plain apostrophes, escaped
    newlines (backslash + n) become real newlines, and any remaining
    backslash is removed. Every run of whitespace (newlines included) is then
    collapsed to one space and the result is stripped at both ends.

    Args:
        text: Raw essay text.

    Returns:
        str: The cleaned, single-line text.
    """
    # The three escape patterns are plain literals, so str.replace is sufficient
    unescaped = (
        text
        .replace("\\'", "'")
        .replace("\\n", "\n")
        .replace("\\", "")
    )
    # \s+ also matches newlines, so this both flattens lines and squeezes spaces
    single_line = re.sub(r'\s+', ' ', unescaped)
    return single_line.strip()

In [None]:
essay_ids = data_2022['essay_id'].values

# Each essay appears once per discourse row, so read and clean every file only once
# and then fan the cached text out to the rows (avoids thousands of repeated reads).
essay_cache = {}
for essay_id in pd.unique(essay_ids):
    essay_path = os.path.join('/content/data2022/train', f"{essay_id}.txt")
    with open(essay_path, 'r', encoding='utf-8') as file:
        essay_cache[essay_id] = clean_text(file.read())

# One cleaned essay per row of data_2022, in row order
essay_texts = [essay_cache[essay_id] for essay_id in essay_ids]

In [None]:
data_2022['clean_essay'] = essay_texts

In [None]:
data_2022.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target,clean_discourse,clean_essay
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0,"[hi, im, isaac, im, going, writing, face, mars...","Hi, i'm Isaac, i'm going to be writing about h..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,0,"[perspective, think, face, natural, landform, ...","Hi, i'm Isaac, i'm going to be writing about h..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,0,"[think, face, natural, landform, life, mars, d...","Hi, i'm Isaac, i'm going to be writing about h..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,0,"[life, mars, would, know, reason, think, natur...","Hi, i'm Isaac, i'm going to be writing about h..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,0,"[people, thought, face, formed, alieans, thoug...","Hi, i'm Isaac, i'm going to be writing about h..."


Generate meaningful text features from the essay and discourse texts, as selected during the EDA stage.

In [None]:
spacy_nlp = spacy.load('en_core_web_sm')

In [None]:
def extract_features(text, alias):
    """Compute length, word-size and part-of-speech ratio features for one text.

    The text is parsed with the module-level spaCy pipeline ``spacy_nlp``.
    Counts are taken over the spaCy tokens and divided by the number of NLTK
    ``word_tokenize`` tokens that are not punctuation.

    Args:
        text: Text to analyse.
        alias: Prefix for the returned keys (e.g. ``'discourse'`` or ``'essay'``).

    Returns:
        dict: ``{alias}_len`` (character count) followed by the ratios
        ``{alias}_num_long_words``, ``{alias}_num_short_words``,
        ``{alias}_noun_count``, ``{alias}_adj_count`` and
        ``{alias}_pnoun_count``. All ratios are 0.0 when the text contains no
        countable words (empty or punctuation-only input).
    """
    doc = spacy_nlp(text)

    # Initialize counters for different features
    num_long_words = 0
    num_short_words = 0
    pos_counter = Counter()

    for token in doc:
        # Length is measured on the lemma, not the surface form.
        # NOTE(review): spaCy yields punctuation as tokens too, so they appear to be
        # counted as short words here while being excluded from the denominator
        # below — confirm this is intended.
        word_len = len(token.lemma_)
        if word_len > 6:
            num_long_words += 1
        elif word_len < 4:
            num_short_words += 1

        # Count POS tags
        pos_counter[token.pos_] += 1

    # Denominator: NLTK tokens that are not punctuation (`in string.punctuation`
    # is a substring test against the punctuation string)
    num_words = len([w for w in word_tokenize(text) if w not in string.punctuation])

    def _ratio(count):
        # A text without any words would otherwise raise ZeroDivisionError
        return count / num_words if num_words else 0.0

    return {
        f'{alias}_len': len(text),
        f'{alias}_num_long_words': _ratio(num_long_words),
        f'{alias}_num_short_words': _ratio(num_short_words),
        f'{alias}_noun_count': _ratio(pos_counter['NOUN']),
        f'{alias}_adj_count': _ratio(pos_counter['ADJ']),
        f'{alias}_pnoun_count': _ratio(pos_counter['PROPN'])
    }

In [None]:
# Run extract_features on every raw discourse text; each call returns a dict of 'discourse_*' features
discourse_features = data_2022['discourse_text'].progress_apply(lambda x: extract_features(x, 'discourse'))
# Expand the dicts into columns. The new frame gets a fresh 0..n-1 index, i.e. rows are in
# the same order as data_2022.
discourse_features = pd.DataFrame(list(discourse_features))

100%|██████████| 36765/36765 [11:16<00:00, 54.36it/s]


In [None]:
# Process each essay only once: every discourse row of an essay carries the same
# clean_essay text, so keep the first row per essay_id before running extract_features.
essay_features = data_2022.drop_duplicates('essay_id').reset_index(drop = True)
essay_features = essay_features['clean_essay'].progress_apply(lambda x: extract_features(x, 'essay'))
# Expand the per-essay feature dicts into 'essay_*' columns (fresh 0..n-1 index)
essay_features = pd.DataFrame(list(essay_features))

100%|██████████| 4191/4191 [06:24<00:00, 10.89it/s]


In [None]:
# Rebuild the de-duplicated essay_id column with the same drop_duplicates/reset_index call
# that produced the feature rows, so ids and features line up row by row
essay_features_ids = data_2022.drop_duplicates('essay_id').reset_index(drop = True)['essay_id']
# Put essay_id next to its features so it can serve as the merge key
essay_features = pd.concat([essay_features_ids, essay_features], axis = 1)

In [None]:
# Append the discourse feature columns; axis=1 concat pairs rows by index label.
# NOTE: not idempotent — re-running this cell appends the same columns a second time.
data_2022 = pd.concat([data_2022, discourse_features], axis = 1)

In [None]:
# Left-join on essay_id so every discourse row receives the features of its essay
data_2022 = pd.merge(data_2022, essay_features, on = 'essay_id', how = 'left')

In [None]:
data_2022.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target,clean_discourse,clean_essay,discourse_len,discourse_num_long_words,discourse_num_short_words,discourse_noun_count,discourse_adj_count,discourse_pnoun_count,essay_len,essay_num_long_words,essay_num_short_words,essay_noun_count,essay_adj_count,essay_pnoun_count
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0,"[hi, im, isaac, im, going, writing, face, mars...","Hi, i'm Isaac, i'm going to be writing about h...",317,0.085714,0.657143,0.142857,0.028571,0.114286,1799,0.124654,0.595568,0.132964,0.049861,0.074792
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,0,"[perspective, think, face, natural, landform, ...","Hi, i'm Isaac, i'm going to be writing about h...",210,0.166667,0.571429,0.142857,0.095238,0.02381,1799,0.124654,0.595568,0.132964,0.049861,0.074792
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,0,"[think, face, natural, landform, life, mars, d...","Hi, i'm Isaac, i'm going to be writing about h...",105,0.190476,0.428571,0.142857,0.047619,0.047619,1799,0.124654,0.595568,0.132964,0.049861,0.074792
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,0,"[life, mars, would, know, reason, think, natur...","Hi, i'm Isaac, i'm going to be writing about h...",362,0.131579,0.697368,0.118421,0.065789,0.039474,1799,0.124654,0.595568,0.132964,0.049861,0.074792
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,0,"[people, thought, face, formed, alieans, thoug...","Hi, i'm Isaac, i'm going to be writing about h...",101,0.055556,0.333333,0.222222,0.0,0.055556,1799,0.124654,0.595568,0.132964,0.049861,0.074792


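Split the data into training, holdout, and test sets using the predefined essay-ID splits, then set aside 20% of the training essays as a validation set.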

In [None]:
# Load the stored split definition; the keys read below are 'train_ids', 'holdout_ids' and 'test_ids'
with open(BASIC_PATH+'data_splits.json', 'r') as file:
    split_ids = json.load(file)

In [None]:
# Rows are assigned to a split by the essay they belong to, so an essay never straddles splits
train_ids = split_ids['train_ids']
train_data = data_2022.loc[data_2022['essay_id'].isin(train_ids)].reset_index(drop = True)

# for 2nd level model
holdout_ids = split_ids['holdout_ids']
holdout_data = data_2022.loc[data_2022['essay_id'].isin(holdout_ids)].reset_index(drop = True)

#for final evaluation of blending ensemble
test_ids = split_ids['test_ids']
test_data = data_2022.loc[data_2022['essay_id'].isin(test_ids)].reset_index(drop = True)

In [None]:
# Split at essay level so all discourse rows of an essay land on the same side
unique_essay_ids = train_data['essay_id'].unique()

# 80/20 split of the training essays with a fixed seed.
# NOTE: this rebinds train_ids, which previously held split_ids['train_ids'].
train_ids, val_ids = train_test_split(unique_essay_ids, test_size = 0.2, random_state = 79)

# Boolean row masks over train_data for each side of the split
train_mask = train_data['essay_id'].isin(train_ids)
val_mask = train_data['essay_id'].isin(val_ids)

# Materialise both subsets with a fresh 0..n-1 index
train_df = train_data[train_mask].reset_index(drop = True)
val_df = train_data[val_mask].reset_index(drop = True)

Train a FastText model on cleaned discourse texts in train_df and apply it to val_df, holdout_data, and test_data. Due to space constraints, I'm not saving the model but will save the transformed DataFrames with the new features instead.

In [None]:
# 64-dimensional FastText vectors, context window of 3, min_count of 3.
# NOTE(review): no seed or worker count is set, so vectors may differ between runs — confirm
# whether reproducibility matters here.
custom_ft_model = FastText(vector_size = 64, window = 3, min_count = 3)
# Build the vocabulary from the training split only (no leakage from val/holdout/test)
custom_ft_model.build_vocab(corpus_iterable = train_df['clean_discourse'])

In [None]:
# Keyed vectors of the model and the list of in-vocabulary words, for inspection.
# NOTE(review): neither name is referenced again in the visible part of the notebook.
vocab = custom_ft_model.wv
vocabulary_words = list(vocab.key_to_index.keys())

In [None]:
# Train for 10 epochs on the tokenised training discourses; total_examples is the number of rows
custom_ft_model.train(corpus_iterable = train_df['clean_discourse'], total_examples = len(train_df), epochs = 10)

(3597556, 4169140)

In [None]:
# sanity check
custom_ft_model.wv.most_similar(positive = ['students'])

[('studens', 0.9408352971076965),
 ('studen', 0.9330945014953613),
 ('student', 0.9277641773223877),
 ('studenst', 0.9031679630279541),
 ('studentbased', 0.8910910487174988),
 ('student_name', 0.8725175261497498),
 ('stundents', 0.8716031312942505),
 ('studentdesign', 0.86700040102005),
 ('studentsummerpacketdesigners', 0.8575951457023621),
 ('studentassigned', 0.8431174755096436)]

In [None]:
# custom_ft_model.save(BASIC_PATH + MODEL_PATH + 'custom_fasttext_model.bin')

In [None]:
def get_mean_embedding(tokens, model):
    """Average the word vectors of ``tokens`` into one fixed-size vector.

    Each token is looked up through ``model.wv[token]``. If the lookup raises
    ``KeyError`` a warning is printed and a zero vector of length
    ``model.vector_size`` stands in for that token (it still counts towards
    the mean).

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (indexable by token) and ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the token vectors, or a zero
        vector of length ``model.vector_size`` when ``tokens`` is empty.
    """
    def _vector_for(tok):
        # Single-token lookup with the zero-vector fallback
        try:
            return model.wv[tok]
        except KeyError:
            print(f"Warning: Token '{tok}' not in vocabulary.")
            return np.zeros(model.vector_size)

    vectors = [_vector_for(tok) for tok in tokens]

    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [None]:
# function to apply embeddings to lists of tokens in the clean_discourse column and create a new column containing each discourse's embedding
def add_ft_emb(df, custom_ft_model):
    """Add an ``ft_emb`` column holding the mean embedding of each row's tokens.

    Args:
        df: DataFrame with a ``clean_discourse`` column of token lists.
            It is modified in place.
        custom_ft_model: Model handed to ``get_mean_embedding``.

    Returns:
        The same DataFrame object, now containing ``ft_emb``.
    """
    def _embed(token_list):
        return get_mean_embedding(token_list, custom_ft_model)

    df['ft_emb'] = df['clean_discourse'].progress_apply(_embed)
    return df

In [None]:
# function to create a DataFrame with all dimension embeddings, adding corresponding columns, and concatenate it with the original DataFrame
def join_ft_emb(df):
    """Expand the ``ft_emb`` vectors into one column per dimension.

    Args:
        df: DataFrame with an ``ft_emb`` column whose cells are equal-length
            1-D vectors.

    Returns:
        A new DataFrame: ``df`` followed by ``ft_emb_0 .. ft_emb_{d-1}``.
        The ``ft_emb`` column itself is kept. For an empty ``df`` no columns
        are added.
    """
    # Carry over df's own index: the axis=1 concat aligns on index labels, so a frame
    # that was filtered without reset_index would otherwise gain NaN-padded extra rows.
    # Deriving the width from the matrix also avoids the label lookup df['ft_emb'][0],
    # which fails when the index has no label 0.
    emb_frame = pd.DataFrame(df['ft_emb'].tolist(), index = df.index)
    emb_frame.columns = [f'ft_emb_{i}' for i in range(emb_frame.shape[1])]
    return pd.concat([df, emb_frame], axis = 1)

In [None]:
# Embed every split with the model trained on train_df only
train_df = add_ft_emb(train_df, custom_ft_model)
val_df = add_ft_emb(val_df, custom_ft_model)
holdout_data = add_ft_emb(holdout_data, custom_ft_model)
test_data = add_ft_emb(test_data, custom_ft_model)

100%|██████████| 18707/18707 [00:03<00:00, 5493.78it/s]
100%|██████████| 4755/4755 [00:01<00:00, 4395.73it/s]
100%|██████████| 5921/5921 [00:01<00:00, 4472.10it/s]
100%|██████████| 7382/7382 [00:01<00:00, 5749.88it/s]


In [None]:
# Spread the ft_emb vectors into ft_emb_<i> columns for every split.
# NOTE: not idempotent — re-running appends the same columns again.
train_df = join_ft_emb(train_df)
val_df = join_ft_emb(val_df)
holdout_data = join_ft_emb(holdout_data)
test_data = join_ft_emb(test_data)

Train a TF-IDF + UMAP model on the essay texts in train_df, then apply it to val_df, holdout_data, and test_data.

In [None]:
# write a custom lemmatizer for tf-idf, as it offers significantly better quality than the default tf-idf, as demonstrated during the EDA stage

# Shared lemmatizer instance used by clean_and_tokenize and for the stop-word list below
lemmatizer = WordNetLemmatizer()

# scikit-learn's English stop words plus contraction fragments.
# NOTE(review): clean_and_tokenize strips punctuation before tokenizing, so the
# apostrophe-bearing entries look unreachable there — confirm.
STOP_WORDS = ENGLISH_STOP_WORDS.union({"n't", "'ll", "'m", "'re", "'s", "'ve", "'d", "'t", "wo", "ca"})

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (no sentence context) with ``nltk.pos_tag``
    and the upper-cased first letter of the tag selects the WordNet class:
    J -> adjective, N -> noun, V -> verb, R -> adverb. Any other tag falls
    back to noun.

    Args:
        word: Token to classify.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV``.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag map to noun
    return wordnet.NOUN

def clean_and_tokenize(text):
    """Tokenizer for the TF-IDF vectorizer: strip, tokenize, filter, lemmatize.

    Punctuation and digits are deleted, the text is lower-cased and tokenized
    with ``word_tokenize``; tokens present in ``STOP_WORDS`` are dropped and
    the rest are lemmatized using the POS returned by ``get_wordnet_pos``.

    Args:
        text: Essay text.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    strip_table = str.maketrans("", "", string.punctuation + string.digits)
    words = word_tokenize(text.translate(strip_table).lower())

    lemmas = []
    for word in words:
        # Stop words are filtered on the surface form, before lemmatization
        if word in STOP_WORDS:
            continue
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
    return lemmas

In [None]:
# Lemmatize scikit-learn's stop words the same way tokens are lemmatized, so the list handed
# to TfidfVectorizer is expressed in lemma form.
# NOTE: built from ENGLISH_STOP_WORDS, not from the extended STOP_WORDS set.
lemmatized_stop_words = set([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in ENGLISH_STOP_WORDS])

In [None]:
# Fit on one row per essay: clean_essay is repeated on every discourse row, and the
# NLTK-based custom tokenizer makes each extra document expensive.
train_df_essays = train_df.drop_duplicates('essay_id')[['essay_id', 'clean_essay']].reset_index(drop = True)

In [None]:
# TF-IDF over unigrams and bigrams produced by the custom lemmatizing tokenizer,
# limited to 10000 features; stop words are supplied in lemmatized form
tfidf_vectorizer = TfidfVectorizer(stop_words = list(lemmatized_stop_words),
                                   lowercase = True,
                                   tokenizer = clean_and_tokenize,
                                   ngram_range = (1, 2),
                                   max_features = 10000)

# Learn vocabulary and IDF weights from the training essays only
tfidf_matrix = tfidf_vectorizer.fit_transform(train_df_essays['clean_essay'])



In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Lemmatized Terms (tokens):", feature_names)

Lemmatized Terms (tokens): ['ability' 'ability learn' 'ability student' ... 'zone accident'
 'zygomatic' 'zygomatic major']


In [None]:
# Fit UMAP with the Hellinger metric on the training TF-IDF matrix, reducing it to
# 64 components. random_state is fixed; UMAP warns that this forces n_jobs to 1.
umap_model_hellinger = umap.UMAP(metric='hellinger', n_components = 64, random_state = 77)
umap_embeddings = umap_model_hellinger.fit_transform(tfidf_matrix)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
# Bundle both fitted transformers under the keys that get_add_join_umap_embeddings reads
models = {
    'tfidf_vectorizer': tfidf_vectorizer,
    'umap_model': umap_model_hellinger
}

# Persisting the bundle is currently disabled
#joblib.dump(models, BASIC_PATH+MODEL_PATH+'tfidf_umap_model.pkl')

In [None]:
# apply the trained models using the transform method, create a separate dataset for UMAP embeddings, and then concatenate it with the original dataset

def get_add_join_umap_embeddings(df, models):
    """Append UMAP essay embeddings, computed from TF-IDF vectors, to ``df``.

    Args:
        df: DataFrame with a ``clean_essay`` text column.
        models: Mapping with the fitted transformers under the keys
            ``'tfidf_vectorizer'`` and ``'umap_model'``; only their
            ``transform`` methods are called.

    Returns:
        A new DataFrame: ``df`` followed by ``umap_emb_0 .. umap_emb_{k-1}``,
        where ``k`` is the number of columns returned by the UMAP transform.
    """
    tfidf_matrix = models['tfidf_vectorizer'].transform(df['clean_essay'])
    umap_embeddings = models['umap_model'].transform(tfidf_matrix)

    num_components = umap_embeddings.shape[1]
    umap_columns = [f'umap_emb_{i}' for i in range(num_components)]

    # Give the embedding frame df's own index: the axis=1 concat aligns on index labels,
    # so with a non-default index rows would otherwise be mismatched and padded with NaN.
    umap_df = pd.DataFrame(umap_embeddings, columns = umap_columns, index = df.index)

    return pd.concat([df, umap_df], axis = 1)

In [None]:
# One row per essay for every split (essay_id + clean_essay), each with a fresh 0..n-1 index
train_df_essays = train_df.drop_duplicates('essay_id')[['essay_id', 'clean_essay']].reset_index(drop = True)
val_df_essays = val_df.drop_duplicates('essay_id')[['essay_id', 'clean_essay']].reset_index(drop = True)
holdout_data_essays = holdout_data.drop_duplicates('essay_id')[['essay_id', 'clean_essay']].reset_index(drop = True)
test_data_essays = test_data.drop_duplicates('essay_id')[['essay_id', 'clean_essay']].reset_index(drop = True)

# Project every split through the TF-IDF + UMAP models that were fitted on the training essays
train_df_essays = get_add_join_umap_embeddings(train_df_essays, models)
val_df_essays = get_add_join_umap_embeddings(val_df_essays, models)
holdout_data_essays = get_add_join_umap_embeddings(holdout_data_essays, models)
test_data_essays = get_add_join_umap_embeddings(test_data_essays, models)

In [None]:
# Broadcast the per-essay UMAP columns back onto the discourse rows.
# clean_essay exists on both sides of the merge, so pandas renames the two copies to
# clean_essay_x / clean_essay_y (both are listed in COLS_TO_DROP).
train_df = pd.merge(train_df, train_df_essays, on = 'essay_id', how = 'left')
val_df = pd.merge(val_df, val_df_essays, on = 'essay_id', how = 'left')
holdout_data = pd.merge(holdout_data, holdout_data_essays, on = 'essay_id', how = 'left')
test_data = pd.merge(test_data, test_data_essays, on = 'essay_id', how = 'left')

Remove the columns that are no longer needed, then save the transformed datasets for later use in training the first-level XGBoost model.

In [None]:
# Raw/intermediate text columns, the string label, and the list-valued embedding column.
# clean_essay_x / clean_essay_y are the suffixed duplicates created by the merge on essay_id.
COLS_TO_DROP = ['discourse_text',
                'clean_discourse',
                'clean_essay_x',
                'clean_essay_y',
                'discourse_effectiveness',
                'ft_emb'
                ]

In [None]:
# Remove the columns listed in COLS_TO_DROP from every split
train_df = train_df.drop(columns = COLS_TO_DROP)
val_df = val_df.drop(columns = COLS_TO_DROP)
holdout_data = holdout_data.drop(columns = COLS_TO_DROP)
test_data = test_data.drop(columns = COLS_TO_DROP)

In [None]:
# train_df.to_csv(BASIC_PATH+SAVE_TRANSFORMED_DATASETS+'train_df_transformed.csv', index = False)
# val_df.to_csv(BASIC_PATH+SAVE_TRANSFORMED_DATASETS+'val_df_transformed.csv', index = False)
# holdout_data.to_csv(BASIC_PATH+SAVE_TRANSFORMED_DATASETS+'holdout_data_transformed.csv', index = False)
# test_data.to_csv(BASIC_PATH+SAVE_TRANSFORMED_DATASETS+'test_data_transformed.csv', index = False)