## 1. Baseline code provided by uni

### 1.1 Import modules

In [3]:
import pandas as pd
import logging
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

### 1.2 Baseline function to create [predictions](https://github.com/larshanen/MLChallenge/tree/main/notebooks/predicted.json)

In [3]:
def main():
    # Set the logging level to INFO and set loading message
    logging.getLogger().setLevel(logging.INFO)
    
    # Load train and test sets and change all NA values to empty values
    logging.info("Loading training/test data")
    train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
    test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    
    # Split the train set into train (75%) and validation (25%) sets
    logging.info("Splitting validation")
    train, val = train_test_split(train, stratify=train['year'], random_state=123)
    
    # Store a featurizer to transform the 'title' column into a bag-of-words format
    featurizer = ColumnTransformer(
        transformers=[("title", CountVectorizer(), "title")], remainder='drop')
    
    # Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
    dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge = make_pipeline(featurizer, Ridge())
    
    # Drop target variable column and fit both models
    logging.info("Fitting models")
    dummy.fit(train.drop('year', axis=1), train['year'].values)
    ridge.fit(train.drop('year', axis=1), train['year'].values)
    
    # Calculate and report both MAE's
    logging.info("Evaluating on validation data")
    err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
    logging.info(f"Mean baseline MAE: {err}")
    err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
    logging.info(f"Ridge regress MAE: {err}")
    
    # Let the ridge model predict on test set
    logging.info(f"Predicting on test")
    pred = ridge.predict(test)
    test['year'] = pred
    
    # Write JSON prediction file
    logging.info("Writing prediction file")
    test.to_json("predicted.json", orient='records', indent=2)

In [4]:
main()

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838
INFO:root:Predicting on test
INFO:root:Writing prediction file


## 2. Team code

Please follow the instructions beneath when writing or adjusting code:

In [5]:
# Describe every piece of code with comments
# Include your name in every header so we can report our individual contributions (this is mandatory)

### 2.1 Explore baseline performance (Lars)

In [82]:
# Set the logging level to INFO and set loading message
logging.getLogger().setLevel(logging.INFO)
    
# Load train and test sets and change all NA values to empty values
logging.info("Loading training/test data")
train = pd.DataFrame.from_records(json.load(open('../../data/train.json'))).fillna("")
test = pd.DataFrame.from_records(json.load(open('../../data/test.json'))).fillna("")

INFO:root:Loading training/test data


In [40]:
# Split the train set into train (75%) and validation (25%) sets
logging.info("Splitting validation")
train, val = train_test_split(train, stratify=train['year'], random_state=123)
    
# Store a featurizer to transform the 'title' column into a bag-of-words format
featurizer = ColumnTransformer(
transformers=[("title", CountVectorizer(), "title")], remainder='drop')
    
# Make a pipeline for the featurizer combined with a dummy regressor, that simply predicts the overall trained mean of the target variable
dummy = make_pipeline(featurizer, DummyRegressor(strategy='mean'))

# Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
ridge = make_pipeline(featurizer, Ridge())
    
# Drop target variable column and fit both models
logging.info("Fitting models")
dummy.fit(train.drop('year', axis=1), train['year'].values)
ridge.fit(train.drop('year', axis=1), train['year'].values)
    
# Calculate and report both MAE's
logging.info("Evaluating on validation data")
err = mean_absolute_error(val['year'].values, dummy.predict(val.drop('year', axis=1)))
logging.info(f"Mean baseline MAE: {err}")
err = mean_absolute_error(val['year'].values, ridge.predict(val.drop('year', axis=1)))
logging.info(f"Ridge regress MAE: {err}")

INFO:root:Splitting validation


INFO:root:Fitting models
INFO:root:Evaluating on validation data
INFO:root:Mean baseline MAE: 7.8054390754858805
INFO:root:Ridge regress MAE: 5.812345349001838


In [42]:
# Check what validation set looks like
val

Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
2603,inproceedings,Use of Heuristic Knowledge in Chinese Language...,,1984,Association for Computational Linguistics,"[Yang, Yiming, Nishida, Toyoaki, Doshita, Shuji]",
3258,inproceedings,Deciding the Twins Property for Weighted Tree ...,,2012,Association for Computational Linguistics,"[Büchse, Matthias, Fischer, Anja]",
48785,inproceedings,Large Margin Neural Language Model,,2018,Association for Computational Linguistics,"[Huang, Jiaji, Li, Yi, Ping, Wei, Huang, Liang]",We propose a large margin criterion for traini...
8822,inproceedings,Plot-guided Adversarial Example Construction f...,,2021,Association for Computational Linguistics,"[Ghazarian, Sarik, Liu, Zixi, S M, Akash, Weis...",With the recent advances of open-domain story ...
24495,inproceedings,Towards a terminological resource for biomedic...,,2006,European Language Resources Association (ELRA),"[Nenadic, Goran, Okazaki, Naoki, Ananiadou, So...",One of the main challenges in biomedical text ...
...,...,...,...,...,...,...,...
12457,inproceedings,Annotating Events in an Emotion Corpus,,2014,European Language Resources Association (ELRA),"[Lee, Sophia, Li, Shoushan, Huang, Chu-Ren]",This paper presents the development of a Chine...
29939,inproceedings,POSTECH Submission on Duolingo Shared Task,,2020,Association for Computational Linguistics,"[Park, Junsu, Kwon, Hongseok, Lee, Jong-Hyeok]","In this paper, we propose a transfer learning ..."
63102,inproceedings,"Annotation Trees: LDC's customizable, extensib...",,2012,European Language Resources Association (ELRA),"[Wright, Jonathan, Griffitt, Kira, Ellis, Joe,...","In recent months, LDC has developed a web-base..."
19430,inproceedings,"Long Nights, Rainy Days, and Misspent Youth: A...",,2015,Association for Computational Linguistics,"[Bracewell, David]",


In [8]:
# Check what array with predicted values looks like
pred_array = ridge.predict(val.drop('year', axis=1))
print(len(pred_array))
pred_array

16479


array([2005.16848241, 2016.72621127, 2013.17436633, ..., 2014.96570928,
       2014.87896051, 2020.60932351])

In [9]:
# Check what array with true values looks like
true_array = val['year'].values
print(len(true_array))
true_array

16479


array(['1984', '2012', '2018', ..., '2012', '2015', '2016'], dtype=object)

### 2.2 Preprocessing (Lars)

In [34]:
# Import extra modules
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [59]:
#Randomly save 5% of the train set for velocity purposes
percentage_to_save = 5

# Calculate the number of rows to save
num_rows_to_save = int(len(train) * (percentage_to_save / 100))

# Use the sample method to randomly select rows
train_sample = train.sample(n=num_rows_to_save, random_state=42)  # Set a random_state for reproducibility

train_sample.head()

Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
49187,inproceedings,Learning Bilingual Sentence Embeddings via Aut...,,2019,Association for Computational Linguistics,"[Kim, Yunsu, Rosendahl, Hendrik, Rossenbach, N...",We propose a novel model architecture and trai...
27721,inproceedings,How Could Rhetorical Relations Be Used in Mach...,,1993,,"[Mitkov, Ruslan]",
28449,inproceedings,The Rhetorical Parsing of Unrestricted Natural...,,1997,Association for Computational Linguistics,"[Marcu, Daniel]",
36059,inproceedings,CoToHiLi at LSCDiscovery: the Role of Linguist...,,2022,Association for Computational Linguistics,"[Sabina Uban, Ana, Maria Cristea, Alina, Danie...",This paper presents the contributions of the C...
35564,inproceedings,Presentation,,2006,Association for Machine Translation in the Ame...,"[Habash, Nizar]",


#### 2.2.1 Drop all columns with over 75% of missing data

In [162]:
# Set threshold on 75%
threshold = 0.25

# Calculate the threshold for each column
missing_threshold = int(threshold * len(train))

# Drop columns with more than the specified percentage of missing data
train_filtered = train.dropna(axis=1, thresh=missing_threshold)

train_filtered

Unnamed: 0,ENTRYTYPE,title,editor,year,publisher,author,abstract
0,inproceedings,Philippine Language Resources: Trends and Dire...,,2009,Association for Computational Linguistics,"[Roxas, Rachel Edita, Cheng, Charibeth, Lim, N...",
1,inproceedings,A System for Translating Locative Prepositions...,,1991,Association for Computational Linguistics,"[Japkowicz, Nathalie, Wiebe, Janyce M.]",
2,inproceedings,Introduction to the Shared Task on Comparing S...,,2008,College Publications,"[Bos, Johan]",
3,inproceedings,Pynini: A Python library for weighted finite-s...,,2016,Association for Computational Linguistics,"[Gorman, Kyle]",
4,inproceedings,Improving Readability of Swedish Electronic He...,,2014,Association for Computational Linguistics,"[Grigonyte, Gintarė, Kvist, Maria, Velupillai,...",
...,...,...,...,...,...,...,...
65909,inproceedings,Optimizing the weighted sequence alignment alg...,,2022,Association for Computational Linguistics,"[Janicki, Maciej]",We present an optimized implementation of the ...
65910,proceedings,Proceedings of the 25th Conference on Computat...,"[Bisazza, Arianna, Abend, Omri]",2021,Association for Computational Linguistics,,
65911,article,A Large-Scale Pseudoword-Based Evaluation Fram...,,2014,MIT Press,"[Pilehvar, Mohammad Taher, Navigli, Roberto]",
65912,inproceedings,CIST System for CL-SciSumm 2016 Shared Task,,2016,,"[Li, Lei, Mao, Liyuan, Zhang, Yazhao, Chi, Jun...",


#### 2.2.2 Vectorize 'author' column

In [204]:
# Convert lists of strings, accounting for None values
train_sample_filtered['author_str'] = train_sample_filtered['author'].apply(lambda x: ';'.join(map(str, x)) if x is not None else 'unknown_author')

# Count the number of papers for each author
author_paper_counts = train_sample_filtered['author_str'].str.split(';').explode().value_counts()

# Set the number of most frequent authors you want to include
X = 100  # Adjust this value to the desired number of most frequent authors

# Filter authors based on the X most frequent authors
top_authors = author_paper_counts.head(X).index.tolist()

# Filter only the top authors in 'author_str'
train_sample_filtered['author_str_filtered'] = train_sample_filtered['author_str'].apply(lambda x: ';'.join([author for author in x.split(';') if author in top_authors]))

# Count-vectorize 'author_str_filtered'
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(';'))
count_matrix = count_vectorizer.fit_transform(train_sample_filtered['author_str_filtered'])

# Extract and create columns
feature_names = count_vectorizer.get_feature_names_out()
count_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

# Display the resulting DataFrame
count_df




Unnamed: 0,Unnamed: 1,"abdul-mageed, muhammad","agirre, eneko","anastasopoulos, antonios","antoine, jean-yves","baldwin, timothy","besacier, laurent","bethard, steven","bhattacharyya, pushpak","callison-burch, chris",...,"wen, ji-rong","wiebe, janyce","xiong, deyi","yvon, françois","zhang, min","zhang, yue","zhao, hai","zhao, jun","zhou, ming","øvrelid, lilja"
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2467,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2468,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2469,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [170]:
# Convert lists of strings, accounting for None values
train_sample_filtered['author_str'] = train_sample_filtered['author'].apply(lambda x: ';'.join(map(str, x)) if x is not None else 'unknown_author')

# Count-vectorize author_str
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(';'))
count_matrix = count_vectorizer.fit_transform(train_sample_filtered['author_str'])

# Extract and create columns
feature_names = count_vectorizer.get_feature_names_out()
count_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

print(f"We've transformed the 'author' column to a dataframe of {len(count_df.columns)} columns.")
count_df

# To reduce dimensionalities here, we could see if we could remove all authors that only have 1 paper in the set
# Dimensionalities have to be reduced, doesn't fit inside dataframe


We've transformed the 'author' column to a dataframe of 6263 columns.




Unnamed: 0,"-, mausam","aakhus, mark","abad, alberto","abadi, david","abascal, julio g.","abate, solomon teferra","abbas, mourad","abdelali, ahmed","abdelghaffar, mohamed","abdelghaffar, mohamed a",...,"øvrelid, lilja","üksik, tiiu","čmejrek, martin","šarkutė, ligita","šimon, petr","šnajder, jan","šojat, krešimir","šuster, simon","žabokrtský, zdeněk","žganec gros, jerneja"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 2.2.3 Vectorize 'title' column

In [155]:
from langdetect import detect

In [157]:
train_sample_filtered['language'] = train_sample_filtered['title'].apply(lambda x: detect(x))

In [161]:
train_sample_filtered['language'].value_counts()

en    2358
fr      80
it       6
de       6
ca       4
ro       4
es       2
da       2
tl       2
pt       2
nl       2
sw       1
sv       1
af       1
Name: language, dtype: int64

In [153]:
# Apply the TF-IDF vectorizer to column 'title'
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_sample_filtered['title'])

# Extract and create columns
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print(f"We've transformed the 'author' column to a dataframe of {len(tfidf_df.columns)} columns.")
tfidf_df

We've transformed the 'author' column to a dataframe of 4820 columns.


Unnamed: 0,05,08,10,10th,11,12,14,14th,14ème,15,...,基于bilstm,基于语料库的形容词性别偏度历时研究,多模态表述视域下的小学数学课堂语言计量初探,多語語碼轉換之未知詞擷取,大規模詞彙語意關係自動標示之初步研究,字里行间的道德,為例,融合多层语义特征图的缅甸语图像文本识别方法,調變頻譜正規化法使用於強健語音辨識之研究,非監督式學習於中文電視新聞自動轉寫之初步應用
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2.2 Feature selection/extraction (Lars)

In [67]:
train_sample_filtered_author.head()

Unnamed: 0,ENTRYTYPE,title,year,publisher,abstract,"-, Mausam","Aakhus, Mark","Abad, Alberto","Abadi, David","Abascal, Julio G.",...,"Øvrelid, Lilja","Üksik, Tiiu","Čmejrek, Martin","Šarkutė, Ligita","Šimon, Petr","Šnajder, Jan","Šojat, Krešimir","Šuster, Simon","Žabokrtský, Zdeněk","Žganec Gros, Jerneja"
49187,inproceedings,Learning Bilingual Sentence Embeddings via Aut...,2019,Association for Computational Linguistics,We propose a novel model architecture and trai...,,,,,,...,,,,,,,,,,
27721,inproceedings,How Could Rhetorical Relations Be Used in Mach...,1993,,,,,,,,...,,,,,,,,,,
28449,inproceedings,The Rhetorical Parsing of Unrestricted Natural...,1997,Association for Computational Linguistics,,,,,,,...,,,,,,,,,,
36059,inproceedings,CoToHiLi at LSCDiscovery: the Role of Linguist...,2022,Association for Computational Linguistics,This paper presents the contributions of the C...,,,,,,...,,,,,,,,,,
35564,inproceedings,Presentation,2006,Association for Machine Translation in the Ame...,,,,,,,...,,,,,,,,,,


In [24]:
# Set the logging level to INFO and set loading message
logging.getLogger().setLevel(logging.INFO)
    
# Load train and test sets and change all NA values to empty values
logging.info("Loading training/test data")
train = pd.DataFrame.from_records(json.load(open('../data/train.json'))).fillna("")
test = pd.DataFrame.from_records(json.load(open('../data/test.json'))).fillna("")
    
# Split the train set into train (80%) and validation (20%) sets, 5-folds
logging.info("Splitting validation")
num_folds = 5
k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=123)
    
# Store a featurizer to transform the 'title' column into a bag-of-words format
featurizer_1 = ColumnTransformer(
    transformers=[("title", CountVectorizer(), "title")], remainder='drop')
featurizer_2 = ColumnTransformer(
    transformers=[("title", TfidfVectorizer(), "title")], remainder='drop')
featurizer_3 = ColumnTransformer(
    transformers=[("abstract", CountVectorizer(), "abstract")], remainder='drop')
featurizer_4 = ColumnTransformer(
    transformers=[("abstract", TfidfVectorizer(), "abstract")], remainder='drop')
featurizer_5 = ColumnTransformer(
    transformers=[("author", MultiLabelBinarizerTransformer(), "author")], remainder='drop')
featurizer_6 = ColumnTransformer(
    transformers=[("author", MultiLabelBinarizerTransformer(), "author")], remainder='drop')
featurizers = [featurizer_1, featurizer_2, featurizer_3, featurizer_4, featurizer_5, featurizer_6]

for i, featurizer in enumerate(featurizers):
    # Make a pipeline for the featurizer and a ridge model, that aims to minimize the sum of squares
    ridge_cv = make_pipeline(featurizer, Ridge())
    
    # Drop target variable column and fit both models
    logging.info(f"Fitting model with featurizer {i+1}")
    ridge_cv.fit(train.drop('year', axis=1), train['year'].values)
    
    # Calculate and report both MAE's
    logging.info("Evaluating on validation data")
    ridge_cv_scores = cross_val_score(ridge_cv, train.drop('year', axis=1), train['year'].values, cv=k_fold, scoring='neg_mean_absolute_error')
    logging.info(f"Ridge regress MAE with featurizer {i+1} ({num_folds}-fold cross-validated): {-ridge_cv_scores.mean()}")

INFO:root:Loading training/test data
INFO:root:Splitting validation
INFO:root:Fitting model with featurizer 1
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 1 (5-fold cross-validated): 5.773010450586702
INFO:root:Fitting model with featurizer 2
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 2 (5-fold cross-validated): 5.384430333156983
INFO:root:Fitting model with featurizer 3
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 3 (5-fold cross-validated): 6.340921782179531
INFO:root:Fitting model with featurizer 4
INFO:root:Evaluating on validation data
INFO:root:Ridge regress MAE with featurizer 4 (5-fold cross-validated): 5.480748043346883
INFO:root:Fitting model with featurizer 5


NotFittedError: This MultiLabelBinarizer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [13]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(val['title'])
feature_names1 = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names1)
tfidf_df.head()

Unnamed: 0,00,000,001,0099,01,02,03,04,07,08,...,雜訊環境下應用線性估測編碼於特徵時序列之強健性語音辨識,雜訊環境與說話內容因素分析之強健性語音辨認,電腦輔助句子重組試題編製,電話查詢口語對話系統中語音辨識不確定性之處理,電話轉接對話模式與表達轉接要求句型的分析,非負矩陣分解法於語音調變頻譜強化之研究,面向中文口语理解的基于依赖引导的字特征槽填充模型,面向对话文本的实体关系抽取,面向机器阅读理解的高质量藏语数据集构建,領域相關詞彙極性分析及文件情緒分類之研究
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(val['title'])
feature_names2 = count_vectorizer.get_feature_names_out()
count_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names2)
count_df.head()

Unnamed: 0,00,000,001,0099,01,02,03,04,07,08,...,雜訊環境下應用線性估測編碼於特徵時序列之強健性語音辨識,雜訊環境與說話內容因素分析之強健性語音辨認,電腦輔助句子重組試題編製,電話查詢口語對話系統中語音辨識不確定性之處理,電話轉接對話模式與表達轉接要求句型的分析,非負矩陣分解法於語音調變頻譜強化之研究,面向中文口语理解的基于依赖引导的字特征槽填充模型,面向对话文本的实体关系抽取,面向机器阅读理解的高质量藏语数据集构建,領域相關詞彙極性分析及文件情緒分類之研究
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
tfidf_df['author']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
16474    0.0
16475    0.0
16476    0.0
16477    0.0
16478    0.0
Name: author, Length: 16479, dtype: float64

This paragraph build upon the previous baseline code. It entails the following adjustments/additions chronologically:

- [x] Removal of dummy regressor, since ridge works better from the very start;
- [x] 5-fold cross validation to reduce variability (Ridge regress MAE (5.773));
- [x] Try sklearn's other feature vectorizers (tf-idf (5.384), ...);
- [ ] Perform custom preprocessing, tokenizations within sklearn;
- [ ] Smooth sparse matrices (?);
- [ ] Tune hyperparameters of feature vectorizers (n-gram size);
- [ ] Test for or include other columns (abstract, authors (?));
- [ ] Try tasks other than regression, like lazy learning (kNN)(?);
- [ ] Try BERTopic modelling;
- [ ] 