In [1]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [34]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
import logging
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [5]:
from sentence_transformers import SentenceTransformer
# Name of the pretrained sentence-embedding checkpoint used throughout the notebook
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [25]:
# Load the training records. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps non-ASCII text intact on
# platforms whose default encoding is not UTF-8.
with open('../../data/train.json', encoding='utf-8') as train_file:
    # Missing fields become empty strings so later string concatenation never sees NaN
    train = pd.DataFrame.from_records(json.load(train_file)).fillna("")

In [6]:
# Step 1: Build one text per row by joining 'title' and 'abstract' with a single space
combined_text = train['title'] + ' ' + train['abstract']
train['combined_text'] = combined_text

# Step 2: Encode every combined text; convert_to_tensor=True returns the embeddings as one tensor
embeddings = model.encode(combined_text.tolist(), convert_to_tensor=True)

In [28]:
# Step 3: Create a new DataFrame with the sentence embeddings and 'year' as the target variable
embedding_columns = [f'dim_{i+1}' for i in range(embeddings.shape[1])]
# Copy the tensor to host memory first: .numpy() raises for a tensor that lives on a GPU
embedding_df = pd.DataFrame(embeddings.cpu().numpy(), columns=embedding_columns)
# concat(axis=1) aligns rows on index labels, so give the target a fresh 0..n-1 index
# that matches embedding_df row for row
year = pd.to_numeric(train['year']).reset_index(drop=True)
result_df = pd.concat([embedding_df, year], axis=1)

# Display the resulting DataFrame
print(result_df.head())

      dim_1     dim_2     dim_3     dim_4     dim_5     dim_6     dim_7  \
0  0.003490 -0.002893 -0.029641  0.013807  0.045661  0.004433 -0.015949   
1  0.049272  0.025005 -0.034317  0.025440 -0.073576 -0.046910 -0.012463   
2 -0.003629 -0.010785 -0.028453  0.048054 -0.069712  0.032118  0.012705   
3  0.005922  0.056695 -0.019930 -0.030621  0.010721  0.030278  0.004040   
4  0.021016  0.015016 -0.034241 -0.005293 -0.017108  0.015270 -0.017285   

      dim_8     dim_9    dim_10  ...   dim_760   dim_761   dim_762   dim_763  \
0  0.037939 -0.051580  0.009515  ...  0.007225  0.007118  0.020501 -0.007893   
1  0.024625  0.005171 -0.025421  ...  0.004504 -0.008347  0.052653 -0.017780   
2 -0.016930 -0.063091 -0.042111  ...  0.049551 -0.026112 -0.009532 -0.016229   
3 -0.011463 -0.042495 -0.043800  ...  0.064271  0.031771  0.043024  0.003078   
4  0.026525  0.002404  0.010120  ... -0.013973  0.002729  0.015522 -0.043427   

    dim_764   dim_765   dim_766   dim_767   dim_768  year  
0 -0.018

In [17]:
def train_model(df, m, name, target='year', random_state=123):
    """Fit an estimator on a stratified split of ``df`` and report its validation MAE.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        m: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
        name: Label printed in front of the score.
        target: Name of the column to predict. It is also the column used to
            stratify the train/validation split.
        random_state: Seed passed to ``train_test_split`` so repeated calls are
            evaluated on the same rows.

    Returns:
        The mean absolute error on the validation part, as computed by
        ``mean_absolute_error``. The same value is printed.
    """
    fit_part, val_part = train_test_split(
        df, stratify=df[target], random_state=random_state
    )
    # The target column must not leak into the feature matrix
    m.fit(fit_part.drop(target, axis=1), fit_part[target].values)
    predictions = m.predict(val_part.drop(target, axis=1))
    mae = mean_absolute_error(val_part[target].values, predictions)
    print(f"{name} mae {mae}")
    # Hand the score back so callers can compare models programmatically
    return mae

# Baseline regressors to compare, each paired with the label printed next to its score.
# The decision tree is seeded: scikit-learn permutes the features at every split,
# so an unseeded tree can report a different MAE on each run.
baselines = [
    (DecisionTreeRegressor(random_state=123), "Decision Tree Regressor"),
    (Ridge(), "Ridge Regressor"),
    (LinearRegression(), "Linear Regressor"),
]
for estimator, label in baselines:
    train_model(result_df, estimator, label)

Decision Tree Regressor mae 6.679513449341964
Ridge Regressor mae 4.629345174220314
Linear Regressor mae 4.636169807678996


In [45]:
def train_kNN(df, neighbors, metrics, target='year', random_state=123):
    """Score distance-weighted k-NN regressors for every (k, metric) combination.

    The train/validation split depends only on ``df`` and ``random_state``, so it
    is computed once up front and shared by all models instead of being
    recomputed inside the loops.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        neighbors: Iterable of neighbor counts to try.
        metrics: Iterable of metric names passed to ``KNeighborsRegressor``.
        target: Name of the column to predict. It is also the column used to
            stratify the split.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Nested dict where ``scores[k][metric]`` is the validation MAE of that model.
        Each score is also printed as it is computed.
    """
    fit_part, val_part = train_test_split(
        df, stratify=df[target], random_state=random_state
    )
    # Separate features and target once; every model sees the same matrices
    X_fit, y_fit = fit_part.drop(target, axis=1), fit_part[target].values
    X_val, y_val = val_part.drop(target, axis=1), val_part[target].values

    scores = {}
    for k in neighbors:
        scores[k] = {}
        for metric in metrics:
            knn = KNeighborsRegressor(n_neighbors=k, metric=metric, weights='distance')
            knn.fit(X_fit, y_fit)
            mae = mean_absolute_error(y_val, knn.predict(X_val))
            print(f"{k}-Nearest Neighbors with similarity function '{metric}': MAE = {mae}")
            scores[k][metric] = mae

    return scores

In [46]:
# Sweep k = 1..12 under both metrics; the returned score table is the cell's output
metrics = ['cosine', 'euclidean']
neighbors = np.arange(1, 13)

train_kNN(result_df, neighbors, metrics)

1-Nearest Neighbors with similarity function 'cosine': MAE = 4.440439347047758
1-Nearest Neighbors with similarity function 'euclidean': MAE = 4.438861581406648
2-Nearest Neighbors with similarity function 'cosine': MAE = 4.084740521070501
2-Nearest Neighbors with similarity function 'euclidean': MAE = 4.105121968543005
3-Nearest Neighbors with similarity function 'cosine': MAE = 3.9630431662974814
3-Nearest Neighbors with similarity function 'euclidean': MAE = 3.9930708711016334
4-Nearest Neighbors with similarity function 'cosine': MAE = 3.90969339213855
4-Nearest Neighbors with similarity function 'euclidean': MAE = 3.943579259090733
5-Nearest Neighbors with similarity function 'cosine': MAE = 3.8879529107377016
5-Nearest Neighbors with similarity function 'euclidean': MAE = 3.926777902846579
6-Nearest Neighbors with similarity function 'cosine': MAE = 3.874491800990524
6-Nearest Neighbors with similarity function 'euclidean': MAE = 3.9163323159954015
7-Nearest Neighbors with simila

{1: {'cosine': 4.440439347047758, 'euclidean': 4.438861581406648},
 2: {'cosine': 4.084740521070501, 'euclidean': 4.105121968543005},
 3: {'cosine': 3.9630431662974814, 'euclidean': 3.9930708711016334},
 4: {'cosine': 3.90969339213855, 'euclidean': 3.943579259090733},
 5: {'cosine': 3.8879529107377016, 'euclidean': 3.926777902846579},
 6: {'cosine': 3.874491800990524, 'euclidean': 3.9163323159954015},
 7: {'cosine': 3.8670177019226966, 'euclidean': 3.909925692394376},
 8: {'cosine': 3.8618125699952848, 'euclidean': 3.907554487021411},
 9: {'cosine': 3.8717850027267007, 'euclidean': 3.9201752832313868},
 10: {'cosine': 3.8703007509108955, 'euclidean': 3.9190687453515256},
 11: {'cosine': 3.874147441523173, 'euclidean': 3.9233427342722256},
 12: {'cosine': 3.873523809636109, 'euclidean': 3.9227350785040342}}