## 1. Baseline code provided by uni

### 1.1 Import modules

In [1]:
import pandas as pd
import logging
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

### 1.2 Baseline function to create [predictions](https://github.com/larshanen/MLChallenge/tree/main/notebooks/predicted.json)

In [2]:
def main():
    logging.getLogger().setLevel(logging.INFO)
    logging.info("Loading training/test data")
    train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
    test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=123)
    featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")],
        remainder='drop')
    dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))
    ridge = make_pipeline(featurizer, Ridge())
    logging.info("Fitting models")
    dummy.fit(train.drop('year', axis=1), train['year'].values)
    ridge.fit(train.drop('year', axis=1), train['year'].values)
    logging.info("Evaluating on validation data")
    err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
    logging.info(f"Ridge regress MAE: {err}")
    logging.info(f"Predicting on test")
    pred = ridge.predict(test)
    test['year'] = pred
    logging.info("Writing prediction file")
    test.to_json("predicted.json", orient='records', indent=2)

In [4]:
main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838
INFO:root:Predicting on test
INFO:root:Writing prediction file


## 2. Team code

Please follow the instructions beneath when writing or adjusting code:

In [12]:
# Describe every piece of code with comments
# Include your name so we can report our individual contributions (this is mandatory)

### 2.1 Calculate baseline performance

In [28]:
# Extract predictions made by baseline function (Lars)
myBaseline = pd.DataFrame.from_records(json.load(open('predicted.json'))).fillna("")
print(myBaseline.shape)
myBaseline.head()

(21972, 7)


Unnamed: 0,ENTRYTYPE,title,editor,publisher,author,abstract,year
0,inproceedings,Learning to lemmatise Polish noun phrases,,Association for Computational Linguistics,"[Radziszewski, Adam]",,2008.547417
1,inproceedings,The Treebanked Conspiracy. Actors and Actions ...,,,"[Passarotti, Marco, González Saavedra, Berta]",,2009.837691
2,inproceedings,Linguistic structure and machine translation,,,"[Lamb, Sydney M.]",If one understands the nature of linguistic st...,2002.71153
3,inproceedings,NSEmo at EmoInt-2017: An Ensemble to Predict E...,,Association for Computational Linguistics,"[Madisetty, Sreekanth, Desarkar, Maunendra San...","In this paper, we describe a method to predict...",2016.880134
4,inproceedings,Explaining data using causal Bayesian networks,,Association for Computational Linguistics,"[Sevilla, Jaime]",I introduce Causal Bayesian Networks as a form...,2015.042005


In [27]:
myLabels = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
print(myLabels.shape)
myLabels.head()

(65914, 7)


Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
0,inproceedings,Philippine Language Resources: Trends and Dire...,,2009,Association for Computational Linguistics,"[Roxas, Rachel Edita, Cheng, Charibeth, Lim, N...",
1,inproceedings,A System for Translating Locative Prepositions...,,1991,Association for Computational Linguistics,"[Japkowicz, Nathalie, Wiebe, Janyce M.]",
2,inproceedings,Introduction to the Shared Task on Comparing S...,,2008,College Publications,"[Bos, Johan]",
3,inproceedings,Pynini: A Python library for weighted finite-s...,,2016,Association for Computational Linguistics,"[Gorman, Kyle]",
4,inproceedings,Improving Readability of Swedish Electronic He...,,2014,Association for Computational Linguistics,"[Grigonyte, Gintarė, Kvist, Maria, Velupillai,...",


In [37]:
print(myBaseline["title"].nunique())
print(myLabels["title"].nunique())

21854
65314


In [30]:
myBaseline['year'] = pd.to_numeric(myBaseline['year'])
myLabels['year'] = pd.to_numeric(myLabels['year'])

# Merge DataFrames on 'title' to align predictions and actual values
merged_df = pd.merge(myBaseline, myLabels, on='title', how='inner', suffixes=('_pred', '_true'))

# Calculate absolute errors
merged_df['absolute_error'] = abs(merged_df['year_pred'] - merged_df['year_true'])

# Calculate mean absolute error
mae = merged_df['absolute_error'].mean()

print(f'Mean Absolute Error: {mae}')
merged_df

Mean Absolute Error: 6.57753536939042


Unnamed: 0,ENTRYTYPE_pred,title,editor_pred,publisher_pred,author_pred,abstract_pred,year_pred,ENTRYTYPE_true,editor_true,year_true,publisher_true,author_true,abstract_true,absolute_error
0,inproceedings,Text Segmentation with Multiple Surface Lingui...,,,"[Mochizuki, Hajime, Honda, Takeo, Okumura, Man...",,2007.143749,inproceedings,,1998,Association for Computational Linguistics,"[Mochizuki, Hajime, Honda, Takeo, Okumura, Man...",,9.143749
1,article,Calls for Papers,,,,,2001.937953,article,,1985,,,,16.937953
2,article,Calls for Papers,,,,,2001.937953,article,,1984,,,,17.937953
3,article,Calls for Papers,,,,,2001.937953,article,,1985,,,,16.937953
4,article,Calls for Papers,,,,,2001.937953,article,,1984,,,,17.937953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2583,inproceedings,Very Large Annotated Database of American English,,,"[Marcus, Mitch]",,1998.709876,inproceedings,,1990,,"[Marcus, Mitch]",,8.709876
2584,inproceedings,Dependency-based Reordering Model for Constitu...,,European Association for Machine Translation,"[Kazemiy, Arefeh, Toral, Antonio, Way, Andy, M...",,2015.479069,inproceedings,,2015,,"[Kazemi, Arefeh, Toral, Antonio, Way, Andy, Mo...",,0.479069
2585,inproceedings,Adaptive Natural Language Processing,,,"[Weischedel, Ralph]",,2006.168170,inproceedings,,1991,,"[Weischedel, Ralph]",,15.168170
2586,inproceedings,Deriving the Predicate-Argument Structure for ...,,,"[Bozsahin, Cem]",,2008.372357,inproceedings,,1998,Association for Computational Linguistics,"[Bozsahin, Cem]",,10.372357
