## 1. Baseline code provided by uni

### 1.1 Import modules

In [None]:
import pandas as pd
import logging
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

### 1.2 Baseline function to create [predictions](https://github.com/larshanen/MLChallenge/tree/main/notebooks/predicted.json)

In [None]:
'''
def main():
    # Set the logging level to INFO and set loading message
    logging.getLogger().setLevel(logging.INFO)
    
    # Load train and test sets and change all NA values to empty values
    logging.info("Loading training/test data")
    train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
    test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    
    # Split the train set into train (75%) and validation (25%) sets
    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=123)
    
    # Store a featurizer to transform the 'title' column into a bag-of-words format
    featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")], remainder='drop')
    
    # Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
    dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge = make_pipeline(featurizer, Ridge())
    
    # Drop target variable column and fit both models
    logging.info("Fitting models")
    dummy.fit(train.drop('year', axis=1), train['year'].values)
    ridge.fit(train.drop('year', axis=1), train['year'].values)
    
    # Calculate and report both MAE's
    logging.info("Evaluating on validation data")
    err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
    logging.info(f"Ridge regress MAE: {err}")
    
    # Let the ridge model predict on test set
    logging.info(f"Predicting on test")
    pred = ridge.predict(test)
    test['year'] = pred
    
    # Write JSON prediction file
    logging.info("Writing prediction file")
    test.to_json("predicted.json", orient='records', indent=2)
'''

In [None]:
# main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838
INFO:root:Predicting on test
INFO:root:Writing prediction file


## 2. Team code

Please follow the instructions beneath when writing or adjusting code:

In [None]:
# Describe every piece of code with comments
# Include your name in every header so we can report our individual contributions (this is mandatory)

### 2.1 Explore baseline performance (Lars)

In [None]:
# Set the logging level to INFO and set loading message
logging.getLogger().setLevel(logging.INFO)
    
# Load train and test sets and change all NA values to empty values
logging.info("Loading training/test data")
train_sample = pd.DataFrame.from_records(json.load(open('../../data/train.json')))
test = pd.DataFrame.from_records(json.load(open('../../data/test.json')))

INFO:root:Loading training/test data


In [None]:
"""
# Split the train set into train (75%) and validation (25%) sets
logging.info("Splitting validation")
train, val = train_test_split(train, stratify=train['year'], random_state=123)
    
# Store a featurizer to transform the 'title' column into a bag-of-words format
featurizer = ColumnTransformer(
transformers=[("title", CountVectorizer(), "title")], remainder='drop')
    
# Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

# Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
ridge = make_pipeline(featurizer, Ridge())
    
# Drop target variable column and fit both models
logging.info("Fitting models")
dummy.fit(train.drop('year', axis=1), train['year'].values)
ridge.fit(train.drop('year', axis=1), train['year'].values)
    
# Calculate and report both MAE's
logging.info("Evaluating on validation data")
err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
logging.info(f"Mean baseline MAE: {err}")
err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
logging.info(f"Ridge regress MAE: {err}")
"""

INFO:root:Splitting validation


INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838


### 2.2 Preprocessing (Lars)

In [None]:
# Import extra modules
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

For experimental purposes we start working with a 10% subset of the data:

In [None]:
'''
#Randomly save 10% of the train set for velocity purposes
percentage_to_save = 10

# Calculate the number of rows to save
num_rows_to_save = int(len(train) * (percentage_to_save / 100))

# Use the sample method to randomly select rows
train_sample = train.sample(n=num_rows_to_save, random_state=42)  # Set a random_state for reproducibility

train_sample.head()
'''

'\n#Randomly save 10% of the train set for velocity purposes\npercentage_to_save = 10\n\n# Calculate the number of rows to save\nnum_rows_to_save = int(len(train) * (percentage_to_save / 100))\n\n# Use the sample method to randomly select rows\ntrain_sample = train.sample(n=num_rows_to_save, random_state=42)  # Set a random_state for reproducibility\n\ntrain_sample.head()\n'

#### 2.2.1 Drop all columns with over 75% of missing data

In [None]:
# Set threshold on 75%
threshold = 0.25

# Calculate the threshold for each column
missing_threshold = int(threshold * len(train_sample))

# Drop columns with more than the specified percentage of missing data
train_filtered = train_sample.dropna(axis=1, thresh=missing_threshold)

print(len(train_filtered))
train_filtered.head()

6591


Unnamed: 0,ENTRYTYPE,title,year,publisher,author,abstract
12680,inproceedings,Question-Answering Based on Virtually Integrat...,2003,Association for Computational Linguistics,"[Choi, Key-Sun, Kim, Jae-Ho, Miyazaki, Masaru,...",
17292,inproceedings,AMI&ERIC: How to Learn with Naive Bayes and Pr...,2013,Association for Computational Linguistics,"[Dermouche, Mohamed, Khouas, Leila, Velcin, Ju...",
33265,inproceedings,Inducing Gazetteers for Named Entity Recogniti...,2008,Association for Computational Linguistics,"[Kazama, Jun'ichi, Torisawa, Kentaro]",
52850,inproceedings,Leveraging Explicit Lexico-logical Alignments ...,2022,Association for Computational Linguistics,"[Sun, Runxin, He, Shizhu, Zhu, Chong, He, Yaoh...",Text-to-SQL aims to parse natural language que...
2298,inproceedings,CLAM: Quickly deploy NLP command-line tools on...,2014,Dublin City University and Association for Com...,"[van Gompel, Maarten, Reynaert, Martin]",


#### 2.2.2 Featurize 'author' column (count-vectors, reduced to top X most frequent authors)

In [None]:
# Convert lists of strings, accounting for None values
train_filtered['author_str'] = train_filtered['author'].apply(lambda x: ';'.join(map(str, x)) if x is not None else 'unknown')

# Add a column to store the original row numbers
# train_filtered['original_index'] = train_filtered.index

# Count the number of papers for each author
author_paper_counts = train_filtered['author_str'].str.split(';').explode().value_counts()

# Set the number of most frequent authors you want to include
n_mostfreq_authors = 3  # Adjust this value to the desired number of most frequent authors

# Filter authors based on the X most frequent authors
top_authors = author_paper_counts.head(n_mostfreq_authors).index.tolist()

# Filter only the top authors in 'author_str'
train_filtered['author_str_filtered'] = train_filtered['author_str'].apply(lambda x: ';'.join([author for author in x.split(';') if author in top_authors]))

# Count-vectorize 'author_str_filtered'
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(';'))
count_matrix = count_vectorizer.fit_transform(train_filtered['author_str_filtered'])

# Extract and create columns
feature_names = count_vectorizer.get_feature_names_out()
author_count_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

# Set the original_index column as the index
# author_count_df.set_index(train_filtered['original_index'], inplace=True)

author_count_df = author_count_df.drop(['unknown', ''], axis=1) # See if this approach always works out

print(len(author_count_df))
author_count_df.head()

6591




Unnamed: 0,"bhattacharyya, pushpak","gurevych, iryna"
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


Merge with train_filtered, meaning we drop the author column and then add author_count_df

In [None]:
# Drop all redundant columns
# train_filtered_buffer = train_filtered.drop(['author', 'author_str', 'author_str_filtered'], axis=1)

# Concatenate the original with dropped redundants and the extracted features for author
# train_2 = pd.concat([train_filtered_buffer, author_count_df], axis=1).reindex(train_filtered_buffer.index)

# print(len(train_2))
# train_2.head()

#### 2.2.3 Vectorize 'ENTRYTYPE' column (3-categorical variable one-hot encoded)

In [None]:
# Perform one-hot encoding
train_encoded_entrytype = pd.get_dummies(train_filtered['ENTRYTYPE'], columns=['category'], prefix='category')

# Show count-values for each of the columns
train_encoded_entrytype.apply(lambda x: x.value_counts())

Unnamed: 0,category_article,category_inproceedings,category_proceedings
0,6380,396,6406
1,211,6195,185


Merge with train_filtered, meaning we drop the ENTRYTYPE column and then add train_encoded_entrytype

In [None]:
# Drop all redundant columns
# train_2 = train_2.drop(['ENTRYTYPE'], axis=1)

# Concatenate the original with dropped redundants and the extracted features for ENTRYTYPE
# train_3 = pd.concat([train_2, train_encoded_entrytype], axis=1).reindex(train_2.index)

# print(len(train_3))
# train_3.head()

#### 2.2.4 Vectorize 'Publisher' column (116-categorical variable one-hot encoded, and reduced to X most frequent publishers)

In [None]:
# Set the number of most frequent publishers to include
n_mostfreq_publishers = 2  # Adjust this value as needed

# Get the X most frequent publishers
top_publishers = train_3['publisher'].value_counts().head(n_mostfreq_publishers).index.tolist()

# Create a new DataFrame with one-hot encoding for the X most frequent publishers
train_encoded_publisher = pd.get_dummies(train_3['publisher'][train_3['publisher'].isin(top_publishers)], prefix='publisher')

# Show count-values for each of the columns
train_encoded_publisher.apply(lambda x: x.value_counts())

Unnamed: 0,publisher_Association for Computational Linguistics,publisher_European Language Resources Association (ELRA)
0,430,3771
1,3771,430


Merge with train_filtered, meaning we drop the 'publisher' column and then add train_encoded_entrytype

In [None]:
# Drop all redundant columns
# train_3 = train_3.drop(['publisher'], axis=1)

# Concatenate the original with dropped redundants and the extracted features for ENTRYTYPE
# train_4 = pd.concat([train_3, train_encoded_publisher], axis=1).reindex(train_3.index).fillna(0)

# print(len(train_4))
# train_4.head()

#### 2.2.5.1 Vectorize 'title' and 'abstract' column (English-translated with stop-words removal and/or synonym replacement)

In [None]:
from googletrans import Translator
from langdetect import detect

In [None]:
def det(x):
    try:
        lang = detect(x)
    except:
        lang = 'Other'
    return lang

In [None]:
train_filtered['language_title'] = train_filtered['title'].apply(det)
train_filtered['language_abstract'] = train_filtered['abstract'].apply(det)

In [None]:
translator = Translator(service_urls=['translate.googleapis.com'])

# Function to translate non-English titles to English based on 'translated_title' column
def translate_to_english(dataframe, column, translated_column):
    for i in dataframe[column].index:
        # Check if the value in 'translated_title' is not 'en' or 'Other' before translation
        if dataframe[translated_column][i] not in ['en', 'Other']:
            dataframe[column][i] = translator.translate(dataframe[column][i], dest='en').text

In [None]:
translate_to_english(train_filtered, 'title', 'language_title')
translate_to_english(train_filtered, 'abstract', 'language_abstract')

In [None]:
# train_4['new_language_title'] = train_4['title'].apply(det)
# train_4['new_language_abstract'] = train_4['abstract'].apply(det)

In [None]:
# train_4['new_language_title'].value_counts()

In [None]:
# train_4['new_language_abstract'].value_counts()

In [None]:
train_textcolumns = train_filtered[['title', 'abstract']]
train_textcolumns

Unnamed: 0,title,abstract
12680,Question-Answering Based on Virtually Integrat...,
17292,AMI&ERIC: How to Learn with Naive Bayes and Pr...,
33265,Inducing Gazetteers for Named Entity Recogniti...,
52850,Leveraging Explicit Lexico-logical Alignments ...,Text-to-SQL aims to parse natural language que...
2298,CLAM: Quickly deploy NLP command-line tools on...,
...,...,...
505,XUXEN: A Spelling Checker/Corrector for Basque...,
39223,Towards Building a Spoken Dialogue System for ...,Speech interfaces for argumentative dialogue s...
14117,"Combining Multiple, Large-Scale Resources in a...",
29826,Lying Through One's Teeth: A Study on Verbal L...,Although many studies use the LIWC lexicon to ...


In [None]:
# Specify the number of most frequent tokens you want to keep (replace X with the desired value)
max_features_title = 100

# Create a list of English stopwords
stop_words = 'english'

# Apply the TF-IDF vectorizer to column 'title' with max_features parameter
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features_title)

# Apply the TF-IDF vectorizer to column 'title'
tfidf_matrix_title = tfidf_vectorizer.fit_transform(train_textcolumns['title'])

# Extract and create columns
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df_title = pd.DataFrame(tfidf_matrix_title.toarray(), columns=feature_names)

print(f"We've transformed the 'title' column to a dataframe of {len(tfidf_df_title.columns)} columns.")
tfidf_df_title.head()

We've transformed the 'title' column to a dataframe of 100 columns.


Unnamed: 0,alignment,analysis,annotation,answering,approach,arabic,automatic,based,case,chinese,...,task,text,texts,training,translation,understanding,unsupervised,using,word,workshop
0,0.0,0.0,0.0,0.495148,0.0,0.0,0.0,0.333044,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.533204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.650351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Specify the number of most frequent tokens you want to keep (replace X with the desired value)
max_features_abstract = 100

# Create a list of English stopwords
stop_words = 'english'

# Handle missing values and '0's in the 'abstract' column
train_textcolumns['abstract'].replace({None: '', '0': ''}, inplace=True)

# Apply the TF-IDF vectorizer to column 'title' with max_features parameter
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features_abstract)

# Apply the TF-IDF vectorizer to column 'title'
tfidf_matrix_abstract = tfidf_vectorizer.fit_transform(train_textcolumns['abstract'])

# Extract and create columns
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df_abstract = pd.DataFrame(tfidf_matrix_abstract.toarray(), columns=feature_names)

print(f"We've transformed the 'abstract' column to a dataframe of {len(tfidf_df_abstract.columns)} columns.")
tfidf_df_abstract.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_textcolumns['abstract'].replace({None: '', '0': ''}, inplace=True)


We've transformed the 'abstract' column to a dataframe of 100 columns.


Unnamed: 0,analysis,annotated,annotation,approach,approaches,art,attention,automatic,available,based,...,time,trained,training,translation,use,used,using,word,words,work
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.363133,0.21556,0.186109,0.233257,0.0,0.0,0.147688,...,0.0,0.0,0.0,0.0,0.18771,0.0,0.0,0.0,0.0,0.176033
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Merge the following dataframes: author_count_df, train_encoded_entrytype, train_encoded_publisher, tfidf_df_title, tfidf_df_abstract

In [None]:
year_df = train_sample['year']

features_df = pd.concat([author_count_df, train_encoded_entrytype, train_encoded_publisher, tfidf_df_title, tfidf_df_abstract, year_df], axis=1).reindex(year_df.index).fillna(0)

In [None]:
features_df

Unnamed: 0,"bhattacharyya, pushpak","gurevych, iryna",category_article,category_inproceedings,category_proceedings,publisher_Association for Computational Linguistics,publisher_European Language Resources Association (ELRA),alignment,analysis,annotation,...,trained,training,translation,use,used,using,word,words,work,year
12680,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,2003
17292,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,2013
33265,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,2008
52850,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,2022
2298,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.173093,0.158681,0.0,0.16456,0.0,0.0,0.0,0.0,0.154324,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,1992
39223,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,2022
14117,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,1998
29826,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,2021


### 4. Fit baseline model

In [None]:
# Split the train set into train (75%) and validation (25%) sets
logging.info("Splitting validation")
train, val = train_test_split(features_df, stratify=features_df['year'], random_state=123)

# Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
ridge = Ridge()
    
# Drop target variable column and fit both models
logging.info("Fitting models")
ridge.fit(train.drop('year', axis=1), train['year'].values)
    
# Calculate and report both MAE's
logging.info("Evaluating on validation data")
err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
logging.info(f"Ridge regress MAE: {err}")

INFO:root:Splitting validation


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

#### 2.2.5.2 Extract topics from 'title' (and 'abstract') column

In [None]:
from bertopic import BERTopic

ImportError: cannot import name 'TypeAliasType' from 'typing_extensions' (c:\Users\Gebruiker\AppData\Local\Programs\Python\Python311\Lib\site-packages\typing_extensions.py)

### 2.3 Incorporate into baseline code

In [None]:
# Set the logging level to INFO and set loading message
logging.getLogger().setLevel(logging.INFO)
    
# Load train and test sets and change all NA values to empty values
logging.info("Loading training/test data")
#train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
#test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    
# Split the train set into train (80%) and validation (20%) sets, 5-folds
logging.info("Splitting validation")
num_folds = 5
k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=123)
    
# Store a featurizer to transform the 'title' column into a bag-of-words format
featurizer_1 = ColumnTransformer(
    transformers=[("title", CountVectorizer(), "title")], remainder='drop')
featurizer_2 = ColumnTransformer(
    transformers=[("title", TfidfVectorizer(), "title")], remainder='drop')
featurizer_3 = ColumnTransformer(
    transformers=[("abstract", CountVectorizer(), "abstract")], remainder='drop')
featurizer_4 = ColumnTransformer(
    transformers=[("abstract", TfidfVectorizer(), "abstract")], remainder='drop')
featurizers = [featurizer_1, featurizer_2, featurizer_3, featurizer_4]

for i, featurizer in enumerate(featurizers):
    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge_cv = make_pipeline(featurizer, Ridge())
    
    # Drop target variable column and fit both models
    logging.info(f"Fitting model with featurizer {i+1}")
    ridge_cv.fit(train_4.drop('year', axis=1), train_4['year'].values)
    
    # Calculate and report both MAE's
    logging.info("Evaluating on validation data")
    ridge_cv_scores = cross_val_score(ridge_cv, train_4.drop('year', axis=1), train_4['year'].values, cv=k_fold, scoring='neg_mean_absolute_error')
    logging.info(f"Ridge regress MAE with featurizer {i+1} ({num_folds}-fold cross-validated): {-ridge_cv_scores.mean()}")

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting model with featurizer 1
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 1 (5-fold cross-validated): 5.773010450586702
INFO:root:Fitting model with featurizer 2
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 2 (5-fold cross-validated): 5.384430333156983
INFO:root:Fitting model with featurizer 3


AttributeError: 'int' object has no attribute 'lower'

This paragraph build upon the previous baseline code. It entails the following adjustments/additions chronologically:

- [x] Removal of dummy regressor, since ridge works better from the very start;
- [x] 5-fold cross validation to reduce variability (Ridge regress MAE (5.773));
- [x] Try sklearn's other feature vectorizers (tf-idf (5.384), ...);
- [ ] Perform custom preprocessing, tokenizations within sklearn;
- [ ] Tune hyperparameters of feature vectorizers (n-gram size);
- [ ] Try tasks other than regression, like lazy learning (kNN)(?);
- [ ] Try BERTopic modelling;