## 1. Baseline code provided by uni

### 1.1 Import modules

In [1]:
import pandas as pd
import logging
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

### 1.2 Baseline function to create [predictions](https://github.com/larshanen/MLChallenge/tree/main/notebooks/predicted.json)

In [3]:
def main():
    # Set the logging level to INFO and set loading message
    logging.getLogger().setLevel(logging.INFO)
    
    # Load train and test sets and change all NA values to empty values
    logging.info("Loading training/test data")
    train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
    test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    
    # Split the train set into train (75%) and validation (25%) sets
    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=123)
    
    # Store a featurizer to transform the 'title' column into a bag-of-words format
    featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")], remainder='drop')
    
    # Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
    dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge = make_pipeline(featurizer, Ridge())
    
    # Drop target variable column and fit both models
    logging.info("Fitting models")
    dummy.fit(train.drop('year', axis=1), train['year'].values)
    ridge.fit(train.drop('year', axis=1), train['year'].values)
    
    # Calculate and report both MAE's
    logging.info("Evaluating on validation data")
    err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
    logging.info(f"Ridge regress MAE: {err}")
    
    # Let the ridge model predict on test set
    logging.info(f"Predicting on test")
    pred = ridge.predict(test)
    test['year'] = pred
    
    # Write JSON prediction file
    logging.info("Writing prediction file")
    test.to_json("predicted.json", orient='records', indent=2)

In [4]:
main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838
INFO:root:Predicting on test
INFO:root:Writing prediction file


## 2. Team code

Please follow the instructions beneath when writing or adjusting code:

In [5]:
# Describe every piece of code with comments
# Include your name in every header so we can report our individual contributions (this is mandatory)

### 2.1 Explore baseline performance (Lars)

In [2]:
# Set the logging level to INFO and set loading message
logging.getLogger().setLevel(logging.INFO)
    
# Load train and test sets and change all NA values to empty values
logging.info("Loading training/test data")
train = pd.DataFrame.from_records(json.load(open('../../data/train.json')))
test = pd.DataFrame.from_records(json.load(open('../../data/test.json')))

INFO:root:Loading training/test data


In [40]:
"""
# Split the train set into train (75%) and validation (25%) sets
logging.info("Splitting validation")
train, val = train_test_split(train, stratify=train['year'], random_state=123)
    
# Store a featurizer to transform the 'title' column into a bag-of-words format
featurizer = ColumnTransformer(
transformers=[("title", CountVectorizer(), "title")], remainder='drop')
    
# Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

# Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
ridge = make_pipeline(featurizer, Ridge())
    
# Drop target variable column and fit both models
logging.info("Fitting models")
dummy.fit(train.drop('year', axis=1), train['year'].values)
ridge.fit(train.drop('year', axis=1), train['year'].values)
    
# Calculate and report both MAE's
logging.info("Evaluating on validation data")
err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
logging.info(f"Mean baseline MAE: {err}")
err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
logging.info(f"Ridge regress MAE: {err}")
"""

INFO:root:Splitting validation


INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838


### 2.2 Preprocessing (Lars)

In [6]:
# Import extra modules
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

For experimental purposes we start working with a 10% subset of the data:

In [73]:
#Randomly save 10% of the train set for velocity purposes
percentage_to_save = 10

# Calculate the number of rows to save
num_rows_to_save = int(len(train) * (percentage_to_save / 100))

# Use the sample method to randomly select rows
train_sample = train.sample(n=num_rows_to_save, random_state=42)  # Set a random_state for reproducibility

train_sample

Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
12680,inproceedings,Question-Answering Based on Virtually Integrat...,,2003,Association for Computational Linguistics,"[Choi, Key-Sun, Kim, Jae-Ho, Miyazaki, Masaru,...",
17292,inproceedings,AMI&ERIC: How to Learn with Naive Bayes and Pr...,,2013,Association for Computational Linguistics,"[Dermouche, Mohamed, Khouas, Leila, Velcin, Ju...",
33265,inproceedings,Inducing Gazetteers for Named Entity Recogniti...,,2008,Association for Computational Linguistics,"[Kazama, Jun'ichi, Torisawa, Kentaro]",
52850,inproceedings,Leveraging Explicit Lexico-logical Alignments ...,,2022,Association for Computational Linguistics,"[Sun, Runxin, He, Shizhu, Zhu, Chong, He, Yaoh...",Text-to-SQL aims to parse natural language que...
2298,inproceedings,CLAM: Quickly deploy NLP command-line tools on...,,2014,Dublin City University and Association for Com...,"[van Gompel, Maarten, Reynaert, Martin]",
...,...,...,...,...,...,...,...
505,inproceedings,XUXEN: A Spelling Checker/Corrector for Basque...,,1992,Association for Computational Linguistics,"[Agirre, E., Alegria, I, Arregi, X, Artola, X,...",
39223,inproceedings,Towards Building a Spoken Dialogue System for ...,,2022,European Language Resources Association,"[Aicher, Annalena, Gerstenlauer, Nadine, Feust...",Speech interfaces for argumentative dialogue s...
14117,inproceedings,"Combining Multiple, Large-Scale Resources in a...",,1998,,"[Jing, Hongyan, McKeown, Kathleen]",
29826,inproceedings,Lying Through One's Teeth: A Study on Verbal L...,,2021,Association for Computational Linguistics,"[Yeh, Min-Hsuan, Ku, Lun-Wei]",Although many studies use the LIWC lexicon to ...


#### 2.2.1 Drop all columns with over 75% of missing data

In [74]:
# Set threshold on 75%
threshold = 0.25

# Calculate the threshold for each column
missing_threshold = int(threshold * len(train))

# Drop columns with more than the specified percentage of missing data
train_filtered = train.dropna(axis=1, thresh=missing_threshold)

train_filtered

Unnamed: 0,ENTRYTYPE,title,year,publisher,author,abstract
0,inproceedings,Philippine Language Resources: Trends and Dire...,2009,Association for Computational Linguistics,"[Roxas, Rachel Edita, Cheng, Charibeth, Lim, N...",
1,inproceedings,A System for Translating Locative Prepositions...,1991,Association for Computational Linguistics,"[Japkowicz, Nathalie, Wiebe, Janyce M.]",
2,inproceedings,Introduction to the Shared Task on Comparing S...,2008,College Publications,"[Bos, Johan]",
3,inproceedings,Pynini: A Python library for weighted finite-s...,2016,Association for Computational Linguistics,"[Gorman, Kyle]",
4,inproceedings,Improving Readability of Swedish Electronic He...,2014,Association for Computational Linguistics,"[Grigonyte, Gintarė, Kvist, Maria, Velupillai,...",
...,...,...,...,...,...,...
65909,inproceedings,Optimizing the weighted sequence alignment alg...,2022,Association for Computational Linguistics,"[Janicki, Maciej]",We present an optimized implementation of the ...
65910,proceedings,Proceedings of the 25th Conference on Computat...,2021,Association for Computational Linguistics,,
65911,article,A Large-Scale Pseudoword-Based Evaluation Fram...,2014,MIT Press,"[Pilehvar, Mohammad Taher, Navigli, Roberto]",
65912,inproceedings,CIST System for CL-SciSumm 2016 Shared Task,2016,,"[Li, Lei, Mao, Liyuan, Zhang, Yazhao, Chi, Jun...",


#### 2.2.2 Featurize 'author' column (count-vectors, reduced to top X most frequent authors)

In [75]:
# Convert lists of strings, accounting for None values
train_filtered['author_str'] = train_filtered['author'].apply(lambda x: ';'.join(map(str, x)) if x is not None else 'unknown')

# Add a column to store the original row numbers
# train_filtered['original_index'] = train_filtered.index

# Count the number of papers for each author
author_paper_counts = train_filtered['author_str'].str.split(';').explode().value_counts()

# Set the number of most frequent authors you want to include
n_mostfreq_authors = 3  # Adjust this value to the desired number of most frequent authors

# Filter authors based on the X most frequent authors
top_authors = author_paper_counts.head(n_mostfreq_authors).index.tolist()

# Filter only the top authors in 'author_str'
train_filtered['author_str_filtered'] = train_filtered['author_str'].apply(lambda x: ';'.join([author for author in x.split(';') if author in top_authors]))

# Count-vectorize 'author_str_filtered'
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(';'))
count_matrix = count_vectorizer.fit_transform(train_filtered['author_str_filtered'])

# Extract and create columns
feature_names = count_vectorizer.get_feature_names_out()
author_count_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

# Set the original_index column as the index
# author_count_df.set_index(train_filtered['original_index'], inplace=True)

author_count_df = author_count_df.drop(['unknown', ''], axis=1) # See if this approach always works out
author_count_df



Unnamed: 0,"bhattacharyya, pushpak","gurevych, iryna"
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
65909,0,0
65910,0,0
65911,0,0
65912,0,0


Merge with train_filtered, meaning we drop the author column and then add author_count_df

In [76]:
# Drop all redundant columns
train_filtered_buffer = train_filtered.drop(['author', 'author_str', 'author_str_filtered'], axis=1)

# Concatenate the original with dropped redundants and the extracted features for author
train_2 = pd.concat([train_filtered_buffer, author_count_df], axis=1).reindex(train_filtered_buffer.index)
train_2

Unnamed: 0,ENTRYTYPE,title,year,publisher,abstract,"bhattacharyya, pushpak","gurevych, iryna"
0,inproceedings,Philippine Language Resources: Trends and Dire...,2009,Association for Computational Linguistics,,0,0
1,inproceedings,A System for Translating Locative Prepositions...,1991,Association for Computational Linguistics,,0,0
2,inproceedings,Introduction to the Shared Task on Comparing S...,2008,College Publications,,0,0
3,inproceedings,Pynini: A Python library for weighted finite-s...,2016,Association for Computational Linguistics,,0,0
4,inproceedings,Improving Readability of Swedish Electronic He...,2014,Association for Computational Linguistics,,0,0
...,...,...,...,...,...,...,...
65909,inproceedings,Optimizing the weighted sequence alignment alg...,2022,Association for Computational Linguistics,We present an optimized implementation of the ...,0,0
65910,proceedings,Proceedings of the 25th Conference on Computat...,2021,Association for Computational Linguistics,,0,0
65911,article,A Large-Scale Pseudoword-Based Evaluation Fram...,2014,MIT Press,,0,0
65912,inproceedings,CIST System for CL-SciSumm 2016 Shared Task,2016,,,0,0


#### 2.2.3 Vectorize 'ENTRYTYPE' column (3-categorical variable one-hot encoded)

In [77]:
# Perform one-hot encoding
train_encoded_entrytype = pd.get_dummies(train_2['ENTRYTYPE'], columns=['category'], prefix='category')

# Show count-values for each of the columns
train_encoded_entrytype.apply(lambda x: x.value_counts())

Unnamed: 0,category_article,category_inproceedings,category_proceedings
0,63840,3948,64040
1,2074,61966,1874


Merge with train_filtered, meaning we drop the ENTRYTYPE column and then add train_encoded_entrytype

In [78]:
# Drop all redundant columns
train_2 = train_2.drop(['ENTRYTYPE'], axis=1)

# Concatenate the original with dropped redundants and the extracted features for ENTRYTYPE
train_3 = pd.concat([train_2, train_encoded_entrytype], axis=1).reindex(train_2.index)
train_3

Unnamed: 0,title,year,publisher,abstract,"bhattacharyya, pushpak","gurevych, iryna",category_article,category_inproceedings,category_proceedings
0,Philippine Language Resources: Trends and Dire...,2009,Association for Computational Linguistics,,0,0,0,1,0
1,A System for Translating Locative Prepositions...,1991,Association for Computational Linguistics,,0,0,0,1,0
2,Introduction to the Shared Task on Comparing S...,2008,College Publications,,0,0,0,1,0
3,Pynini: A Python library for weighted finite-s...,2016,Association for Computational Linguistics,,0,0,0,1,0
4,Improving Readability of Swedish Electronic He...,2014,Association for Computational Linguistics,,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
65909,Optimizing the weighted sequence alignment alg...,2022,Association for Computational Linguistics,We present an optimized implementation of the ...,0,0,0,1,0
65910,Proceedings of the 25th Conference on Computat...,2021,Association for Computational Linguistics,,0,0,0,0,1
65911,A Large-Scale Pseudoword-Based Evaluation Fram...,2014,MIT Press,,0,0,1,0,0
65912,CIST System for CL-SciSumm 2016 Shared Task,2016,,,0,0,0,1,0


#### 2.2.4 Vectorize 'Publisher' column (116-categorical variable one-hot encoded, and reduced to X most frequent publishers)

In [79]:
# Set the number of most frequent publishers to include
n_mostfreq_publishers = 2  # Adjust this value as needed

# Get the X most frequent publishers
top_publishers = train_3['publisher'].value_counts().head(n_mostfreq_publishers).index.tolist()

# Create a new DataFrame with one-hot encoding for the X most frequent publishers
train_encoded_publisher = pd.get_dummies(train_3['publisher'][train_3['publisher'].isin(top_publishers)], prefix='publisher')

# Show count-values for each of the columns
train_encoded_publisher.apply(lambda x: x.value_counts())

Unnamed: 0,publisher_Association for Computational Linguistics,publisher_European Language Resources Association (ELRA)
0,4456,37526
1,37526,4456


In [80]:
# Drop all redundant columns
train_3 = train_3.drop(['publisher'], axis=1)

# Concatenate the original with dropped redundants and the extracted features for ENTRYTYPE
train_4 = pd.concat([train_3, train_encoded_publisher], axis=1).reindex(train_3.index).fillna(0)
train_4


Unnamed: 0,title,year,abstract,"bhattacharyya, pushpak","gurevych, iryna",category_article,category_inproceedings,category_proceedings,publisher_Association for Computational Linguistics,publisher_European Language Resources Association (ELRA)
0,Philippine Language Resources: Trends and Dire...,2009,0,0,0,0,1,0,1.0,0.0
1,A System for Translating Locative Prepositions...,1991,0,0,0,0,1,0,1.0,0.0
2,Introduction to the Shared Task on Comparing S...,2008,0,0,0,0,1,0,0.0,0.0
3,Pynini: A Python library for weighted finite-s...,2016,0,0,0,0,1,0,1.0,0.0
4,Improving Readability of Swedish Electronic He...,2014,0,0,0,0,1,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
65909,Optimizing the weighted sequence alignment alg...,2022,We present an optimized implementation of the ...,0,0,0,1,0,1.0,0.0
65910,Proceedings of the 25th Conference on Computat...,2021,0,0,0,0,0,1,1.0,0.0
65911,A Large-Scale Pseudoword-Based Evaluation Fram...,2014,0,0,0,1,0,0,0.0,0.0
65912,CIST System for CL-SciSumm 2016 Shared Task,2016,0,0,0,0,1,0,0.0,0.0


#### 2.2.3 Vectorize 'title' column (English-translated (?) with stop-words removal and/or synonym replacement)

In [81]:
import googletrans    
from googletrans import Translator
from translate import Translator
from langdetect import detect

from mtranslate import translate

In [82]:
# Assign DataFrame to title_tfidf_df
title_tfidf_df = train_sample.copy()

# Function to translate non-English titles to English
def translate_to_english(title):
    translation = translate(title, 'en')
    return translation if translation != title else title

# Apply the translation function to the 'title' column
title_tfidf_df['translated_title'] = title_tfidf_df['title'].apply(translate_to_english)

# Display the resulting DataFrame
title_tfidf_df

KeyboardInterrupt: 

In [None]:
# Assign DataFrame to title_tfidf_df
title_tfidf_df = train_sample

# Function to translate non-English titles to English
def translate_to_english(title):
    translator= Translator(to_lang="en")
    translation = translator.translate(title)
    return translation if translator.from_lang != 'en' else title

# Apply the translation function to the 'title' column
title_tfidf_df['translated_title'] = title_tfidf_df['title'].apply(lambda x: translate_to_english(x))

# Display the resulting DataFrame
title_tfidf_df

Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract,translated_title
12680,inproceedings,Question-Answering Based on Virtually Integrat...,,2003,Association for Computational Linguistics,"[Choi, Key-Sun, Kim, Jae-Ho, Miyazaki, Masaru,...",,Question-Answering Based on Virtually Integrat...
17292,inproceedings,AMI&ERIC: How to Learn with Naive Bayes and Pr...,,2013,Association for Computational Linguistics,"[Dermouche, Mohamed, Khouas, Leila, Velcin, Ju...",,AMI&ERIC: How to Learn with Naive Bayes and Pr...
33265,inproceedings,Inducing Gazetteers for Named Entity Recogniti...,,2008,Association for Computational Linguistics,"[Kazama, Jun'ichi, Torisawa, Kentaro]",,Inducing Gazetteers for Named Entity Recogniti...
52850,inproceedings,Leveraging Explicit Lexico-logical Alignments ...,,2022,Association for Computational Linguistics,"[Sun, Runxin, He, Shizhu, Zhu, Chong, He, Yaoh...",Text-to-SQL aims to parse natural language que...,Leveraging Explicit Lexico-logical Alignments ...
2298,inproceedings,CLAM: Quickly deploy NLP command-line tools on...,,2014,Dublin City University and Association for Com...,"[van Gompel, Maarten, Reynaert, Martin]",,CLAM: Quickly deploy NLP command-line tools on...
...,...,...,...,...,...,...,...,...
27576,inproceedings,Analyzing Culture-Specific Argument Structures...,,2022,International Conference on Computational Ling...,"[Chen, Wei-Fan, Chen, Mei-Hua, Mudgal, Garima,...",Language education has been shown to benefit f...,Analyzing Culture-Specific Argument Structures...
33657,inproceedings,Acquistion of the Morphological Structure of t...,,2008,Coling 2008 Organizing Committee,"[Hathout, Nabil]",,Acquistion of the Morphological Structure of t...
20315,inproceedings,A Deep Linguistic Analysis for Cross-language ...,,2006,European Language Resources Association (ELRA),"[Semmar, Nasredine, Laib, Meriama, Fluhr, Chri...",Cross-language information retrieval consists ...,A Deep Linguistic Analysis for Cross-language ...
23899,inproceedings,A Theory of Unsupervised Speech Recognition,,2023,Association for Computational Linguistics,"[Wang, Liming, Hasegawa-Johnson, Mark, Yoo, Ch...",Unsupervised speech recognition (pasted macro ...,A Theory of Unsupervised Speech Recognition


In [None]:
def det(x):
    try:
        lang = detect(x)
    except:
        lang = 'Other'
    return lang

title_tfidf_df['translated_language'] = title_tfidf_df['translated_title'].apply(det)

In [None]:
title_tfidf_df['translated_language'].value_counts()

en       12590
fr         352
it          51
de          34
da          23
nl          22
ca          21
ro          18
tl          16
af          11
es          10
id          10
pt           9
no           6
et           4
sv           2
cy           1
sw           1
Other        1
Name: translated_language, dtype: int64

In [None]:
# Apply the TF-IDF vectorizer to column 'title'
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_sample_filtered['title'])

# Extract and create columns
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print(f"We've transformed the 'author' column to a dataframe of {len(tfidf_df.columns)} columns.")
tfidf_df

We've transformed the 'author' column to a dataframe of 4820 columns.


Unnamed: 0,05,08,10,10th,11,12,14,14th,14ème,15,...,基于bilstm,基于语料库的形容词性别偏度历时研究,多模态表述视域下的小学数学课堂语言计量初探,多語語碼轉換之未知詞擷取,大規模詞彙語意關係自動標示之初步研究,字里行间的道德,為例,融合多层语义特征图的缅甸语图像文本识别方法,調變頻譜正規化法使用於強健語音辨識之研究,非監督式學習於中文電視新聞自動轉寫之初步應用
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 2.2.5 Vectorize 'Abstract' column

### 2.3 Incorporate into baseline code

In [None]:
# Set the logging level to INFO and set loading message
logging.getLogger().setLevel(logging.INFO)
    
# Load train and test sets and change all NA values to empty values
logging.info("Loading training/test data")
train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    
# Split the train set into train (80%) and validation (20%) sets, 5-folds
logging.info("Splitting validation")
num_folds = 5
k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=123)
    
# Store a featurizer to transform the 'title' column into a bag-of-words format
featurizer_1 = ColumnTransformer(
    transformers=[("title", CountVectorizer(), "title")], remainder='drop')
featurizer_2 = ColumnTransformer(
    transformers=[("title", TfidfVectorizer(), "title")], remainder='drop')
featurizer_3 = ColumnTransformer(
    transformers=[("abstract", CountVectorizer(), "abstract")], remainder='drop')
featurizer_4 = ColumnTransformer(
    transformers=[("abstract", TfidfVectorizer(), "abstract")], remainder='drop')
featurizers = [featurizer_1, featurizer_2, featurizer_3, featurizer_4]

for i, featurizer in enumerate(featurizers):
    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge_cv = make_pipeline(featurizer, Ridge())
    
    # Drop target variable column and fit both models
    logging.info(f"Fitting model with featurizer {i+1}")
    ridge_cv.fit(train.drop('year', axis=1), train['year'].values)
    
    # Calculate and report both MAE's
    logging.info("Evaluating on validation data")
    ridge_cv_scores = cross_val_score(ridge_cv, train.drop('year', axis=1), train['year'].values, cv=k_fold, scoring='neg_mean_absolute_error')
    logging.info(f"Ridge regress MAE with featurizer {i+1} ({num_folds}-fold cross-validated): {-ridge_cv_scores.mean()}")

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting model with featurizer 1
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 1 (5-fold cross-validated): 5.773010450586702
INFO:root:Fitting model with featurizer 2
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 2 (5-fold cross-validated): 5.384430333156983
INFO:root:Fitting model with featurizer 3
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 3 (5-fold cross-validated): 6.340921782179531
INFO:root:Fitting model with featurizer 4
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 4 (5-fold cross-validated): 5.480748043346883
INFO:root:Fitting model with featurizer 5


NotFittedError: This MultiLabelBinarizer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(val['title'])
feature_names1 = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names1)
tfidf_df.head()

Unnamed: 0,00,000,001,0099,01,02,03,04,07,08,...,雜訊環境下應用線性估測編碼於特徵時序列之強健性語音辨識,雜訊環境與說話內容因素分析之強健性語音辨認,電腦輔助句子重組試題編製,電話查詢口語對話系統中語音辨識不確定性之處理,電話轉接對話模式與表達轉接要求句型的分析,非負矩陣分解法於語音調變頻譜強化之研究,面向中文口语理解的基于依赖引导的字特征槽填充模型,面向对话文本的实体关系抽取,面向机器阅读理解的高质量藏语数据集构建,領域相關詞彙極性分析及文件情緒分類之研究
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(val['title'])
feature_names2 = count_vectorizer.get_feature_names_out()
count_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names2)
count_df.head()

Unnamed: 0,00,000,001,0099,01,02,03,04,07,08,...,雜訊環境下應用線性估測編碼於特徵時序列之強健性語音辨識,雜訊環境與說話內容因素分析之強健性語音辨認,電腦輔助句子重組試題編製,電話查詢口語對話系統中語音辨識不確定性之處理,電話轉接對話模式與表達轉接要求句型的分析,非負矩陣分解法於語音調變頻譜強化之研究,面向中文口语理解的基于依赖引导的字特征槽填充模型,面向对话文本的实体关系抽取,面向机器阅读理解的高质量藏语数据集构建,領域相關詞彙極性分析及文件情緒分類之研究
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


This paragraph build upon the previous baseline code. It entails the following adjustments/additions chronologically:

- [x] Removal of dummy regressor, since ridge works better from the very start;
- [x] 5-fold cross validation to reduce variability (Ridge regress MAE (5.773));
- [x] Try sklearn's other feature vectorizers (tf-idf (5.384), ...);
- [ ] Perform custom preprocessing, tokenizations within sklearn;
- [ ] Tune hyperparameters of feature vectorizers (n-gram size);
- [ ] Try tasks other than regression, like lazy learning (kNN)(?);
- [ ] Try BERTopic modelling;
- [ ] 