<a href="https://colab.research.google.com/github/ipavlopoulos/paremia/blob/main/spatial_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *

In [5]:
%%capture
!gdown 1X2nRkVB54gHv9YwNLEP_tLZQ2oPYgyZS
import pandas as pd
sayings = pd.read_csv("sayings_wcoords.csv.gz")

In [4]:
%%capture
!gdown 1hWC-H6ZObb71O6xcF_BvmQ4zbji_bhYn # the BERT embedded texts
sayings_embedded = pd.read_pickle("sayings_embeddings.csv.gz")

In [8]:
# adding the BERT embedding per text into our original dataframe
sayings["bert"] = sayings_embedded.bert

In [9]:
# drop the duplicates
sayings = sayings[~sayings.text.duplicated(keep=False)]
# keeping ones whose area is not located
sayings_unk = sayings[sayings.place=="Άδηλου τόπου"]
sayings = sayings[sayings.place!="Άδηλου τόπου"]
# distinguish ones not **geo**located
sayings_no_ll = sayings[(sayings.lat.isna())|(sayings.lon.isna())]
sayings = sayings[(sayings.lat.notna())&(sayings.lon.notna())]

In [11]:
df = pd.DataFrame({"text": sayings.text, "category": sayings.area, "lat": sayings.lat, "lon": sayings.lon, "representation":sayings.bert})
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.05, random_state=2023)
df_train, df_dev = train_test_split(df_train, test_size=df_test.shape[0], random_state=2023)
print(df_train.shape[0], df_dev.shape[0], df_test.shape[0])

88241 4903 4903


In [17]:
import geopy.distance
def degeocode(geocoded_areas, predictions, target_col="category"):
    classes = []
    # find the nearest geocoded area to the prediction in 2d 
    for p in predictions:
        dists = geocoded_areas.apply(lambda r: geopy.distance.geodesic( (r.lat, r.lon), (p[0], p[1])).km, axis=1)
        classes.append(geocoded_areas.iloc[dists.argmin()][target_col])
    return classes

def km_distance(geocodes, gold_locs, pred_ll, target_col="category"):
    dists = []
    for g, p in zip(gold_locs, pred_ll):
        gold_lat, gold_lon = geocodes[geocodes[target_col]==g][["lat", "lon"]].values[0]
        dists.append(geopy.distance.geodesic( (gold_lat, gold_lon), (p[0], p[1])).km)
    return pd.Series(dists)

# Benchmarking TFIDF vs BERT representations

In [19]:
# fitting with BERT representations
import numpy as np
model = ExtraTreesRegressor()
x = df_train.sample(1000)
model.fit(np.concatenate(x.representation.to_numpy()).reshape(1000,768), x[["lat", "lon"]].values)

In [20]:
# fitting with TFIDF
model2 = Pipeline([('vect', TfidfVectorizer()), ('clf', ExtraTreesRegressor())])
model2.fit(x.text.values, x[["lat", "lon"]].values)

In [25]:
print(f'BERT: {mean_absolute_error(df_dev[["lat", "lon"]].values, model.predict(np.concatenate(df_dev.representation.to_numpy()).reshape(df_dev.shape[0],768))):.2f}')
print(f'TFIDF: {mean_absolute_error(df_dev[["lat", "lon"]].values, model2.predict(df_dev.text.values)):.2f}')

BERT: 1.48
TFIDF: 1.67


# Tuning TFIDF

In [None]:
from sklearn.model_selection import GridSearchCV
# regression w/extratrees (lat/lon)
xtra = Pipeline([('vect', TfidfVectorizer), ('clf', ExtraTreesRegressor())])
grid_param = [
                {
                    "vect": [TfidfVectorizer()],
                    "vect__analyzer": ["word", "char"],
                    "vect__lowercase": [True, False],
                    "vect__max_df": [0.5],
                    "vect__min_df": [2],
                    "vect__ngram_range": [(1,1), (1,2), (1,3), (2,5)],
                    "clf": [ExtraTreesRegressor()],
                    "clf__n_estimators": range(10,100,10),
                    "clf__max_features": range(5, 20, 5)
                 },
              ]

gsc = GridSearchCV(estimator=xtra, param_grid=grid_param, scoring='r2', cv=5)

_train = df_train.sample(1000) # small-scale sandbox, remove this on the server
tuning_result = gsc.fit(_train.text.values, _train[["lat", "lon"]].values)

print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: 0.051876 using {'clf': ExtraTreesRegressor(max_features=15, n_estimators=70), 'clf__max_features': 15, 'clf__n_estimators': 70, 'vect': TfidfVectorizer(analyzer='char', lowercase=False, max_df=0.5, min_df=2,
                ngram_range=(1, 2)), 'vect__analyzer': 'char', 'vect__lowercase': False, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}


In [None]:
for test_mean, std, param in zip(
        tuning_result.cv_results_['mean_test_score'],
        tuning_result.cv_results_['std_test_score'],
        tuning_result.cv_results_['params']):
    print(f"Score: {test_mean:.2f} ({std:.2f}) with: {param}")

Score: -0.15 (0.07) with: {'clf': ExtraTreesRegressor(max_features=15, n_estimators=70), 'clf__max_features': 5, 'clf__n_estimators': 10, 'vect': TfidfVectorizer(analyzer='char', lowercase=False, max_df=0.5, min_df=2,
                ngram_range=(1, 2)), 'vect__analyzer': 'word', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (1, 1)}
Score: -0.15 (0.06) with: {'clf': ExtraTreesRegressor(max_features=15, n_estimators=70), 'clf__max_features': 5, 'clf__n_estimators': 10, 'vect': TfidfVectorizer(analyzer='char', lowercase=False, max_df=0.5, min_df=2,
                ngram_range=(1, 2)), 'vect__analyzer': 'word', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}
Score: -0.15 (0.07) with: {'clf': ExtraTreesRegressor(max_features=15, n_estimators=70), 'clf__max_features': 5, 'clf__n_estimators': 10, 'vect': TfidfVectorizer(analyzer='char', lowercase=False, max_df=0.5, min_df=2,
                ngram_range=(1, 

In [None]:
tuning_result.best_params_

{'clf': ExtraTreesRegressor(max_features=15, n_estimators=70),
 'clf__max_features': 15,
 'clf__n_estimators': 70,
 'vect': TfidfVectorizer(analyzer='char', lowercase=False, max_df=0.5, min_df=2,
                 ngram_range=(1, 2)),
 'vect__analyzer': 'char',
 'vect__lowercase': False,
 'vect__max_df': 0.5,
 'vect__min_df': 2,
 'vect__ngram_range': (1, 2)}

In [None]:
model = Pipeline([('vect', tuning_result.best_params_["vect"]), ('clf', tuning_result.best_params_["clf"])])
model.fit(_train.text.values, _train[["lat", "lon"]].values)

In [None]:
xtra_preds = model.predict(df_test.text.values)
print(f"Predicting lat: {mean_absolute_error(df_test.lat.values, xtra_preds[:, 0]):.2f}")
print(f"Predicting lon: {mean_absolute_error(df_test.lon.values, xtra_preds[:, 1]):.2f}")

Predicting lat: 1.33
Predicting lon: 1.66


In [None]:
# interpreting the results in terms of predicted distance
dist = km_distance(geocodes=_train.drop_duplicates("category"), gold_locs=_train.category.values, pred_ll=xtra_preds)
print(f"W/XtraTrees, {25}% of the predicted places fall within {dist.quantile(0.25):.2f}kms")
print(f"(NOTE: {len(set(df_test.category.unique())-set(df_test.category.unique()).intersection((_train.category.unique())))} areas out of the {len(set(df_test.category.unique()))} were not in the train)")

W/XtraTrees, 25% of the predicted places fall within 139.25kms
(NOTE: 30 areas out of the 108 were not in the train)
