## 1. Baseline code provided by the university

### 1.1 Import modules

In [1]:
import pandas as pd
import logging
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

### 1.2 Baseline function to create [predictions](https://github.com/larshanen/MLChallenge/tree/main/notebooks/predicted.json)

In [2]:
def _load_json_frame(path):
    """Read a JSON file of records into a DataFrame with missing values replaced by ""."""
    # The context manager closes the file handle as soon as parsing is done;
    # JSON interchange files are UTF-8, so do not rely on the platform default encoding
    with open(path, encoding="utf-8") as handle:
        records = json.load(handle)
    return pd.DataFrame.from_records(records).fillna("")


def main():
    """Train the baseline models, log their validation MAE and write test predictions.

    Reads '../data/train.json' and '../data/test.json', holds out a validation
    split stratified on 'year', fits a mean-predicting dummy regressor and a
    ridge regressor on bag-of-words features of the 'title' column, logs the
    mean absolute error of both on the validation split, and writes the ridge
    predictions for the test set to 'predicted.json' in the working directory.
    """
    # Set the logging level to INFO and set loading message
    logging.getLogger().setLevel(logging.INFO)

    # Load train and test sets and change all NA values to empty values
    logging.info("Loading training/test data")
    train = _load_json_frame('../data/train.json')
    test = _load_json_frame('../data/test.json')

    # Split the train set into train (75%) and validation (25%) sets
    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=123)

    # Store a featurizer to transform the 'title' column into a bag-of-words format
    featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")], remainder='drop')

    # Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
    dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge = make_pipeline(featurizer, Ridge())

    # Drop target variable column and fit both models
    logging.info("Fitting models")
    dummy.fit(train.drop('year', axis=1), train['year'].values)
    ridge.fit(train.drop('year', axis=1), train['year'].values)

    # Calculate and report both MAEs
    logging.info("Evaluating on validation data")
    err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
    logging.info(f"Ridge regress MAE: {err}")

    # Let the ridge model predict on test set
    logging.info("Predicting on test")
    pred = ridge.predict(test)
    test['year'] = pred

    # Write prediction file
    logging.info("Writing prediction file")
    test.to_json("predicted.json", orient='records', indent=2)

In [4]:
# Run the baseline end to end: logs the validation MAEs and writes predicted.json
main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838
INFO:root:Predicting on test
INFO:root:Writing prediction file


## 2. Team code

Please follow the instructions below when writing or adjusting code:

In [12]:
# Describe every piece of code with comments
# Include your name so we can report our individual contributions (this is mandatory)

### 2.1 Calculate baseline performance

In [28]:
# Extract predictions made by baseline function (Lars)
# Open the file in a context manager so the handle is closed right after parsing
with open('predicted.json', encoding='utf-8') as predictions_file:
    myBaseline = pd.DataFrame.from_records(json.load(predictions_file)).fillna("")
print(myBaseline.shape)
myBaseline.head()

(21972, 7)


Unnamed: 0,ENTRYTYPE,title,editor,publisher,author,abstract,year
0,inproceedings,Learning to lemmatise Polish noun phrases,,Association for Computational Linguistics,"[Radziszewski, Adam]",,2008.547417
1,inproceedings,The Treebanked Conspiracy. Actors and Actions ...,,,"[Passarotti, Marco, González Saavedra, Berta]",,2009.837691
2,inproceedings,Linguistic structure and machine translation,,,"[Lamb, Sydney M.]",If one understands the nature of linguistic st...,2002.71153
3,inproceedings,NSEmo at EmoInt-2017: An Ensemble to Predict E...,,Association for Computational Linguistics,"[Madisetty, Sreekanth, Desarkar, Maunendra San...","In this paper, we describe a method to predict...",2016.880134
4,inproceedings,Explaining data using causal Bayesian networks,,Association for Computational Linguistics,"[Sevilla, Jaime]",I introduce Causal Bayesian Networks as a form...,2015.042005


In [27]:
# Load the labelled training records (including the 'year' target) for comparison
# Open the file in a context manager so the handle is closed right after parsing
with open('../data/train.json', encoding='utf-8') as train_file:
    myLabels = pd.DataFrame.from_records(json.load(train_file)).fillna("")
print(myLabels.shape)
myLabels.head()

(65914, 7)


Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
0,inproceedings,Philippine Language Resources: Trends and Dire...,,2009,Association for Computational Linguistics,"[Roxas, Rachel Edita, Cheng, Charibeth, Lim, N...",
1,inproceedings,A System for Translating Locative Prepositions...,,1991,Association for Computational Linguistics,"[Japkowicz, Nathalie, Wiebe, Janyce M.]",
2,inproceedings,Introduction to the Shared Task on Comparing S...,,2008,College Publications,"[Bos, Johan]",
3,inproceedings,Pynini: A Python library for weighted finite-s...,,2016,Association for Computational Linguistics,"[Gorman, Kyle]",
4,inproceedings,Improving Readability of Swedish Electronic He...,,2014,Association for Computational Linguistics,"[Grigonyte, Gintarė, Kvist, Maria, Velupillai,...",
