# Geographical regression of Greek proverbs

* Using a collection of proverbs from the [Hellenic Folklore Research Centre](http://www.kentrolaografias.gr/) of the Academy of Athens
* Employing text regression to estimate the latitude/longitude (lat/lon) of proverbs whose location is not known.
* Studying feature importance, i.e., the terms that discriminate spatially along the South-to-North and West-to-East axes.
* Experimenting with multiple train/test splits for statistical significance.
---


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor
import ast

In [5]:
# Regressors compared in this experiment, keyed by a short tag:
#   el = ElasticNet, nn = k-nearest neighbours, rf = random forest,
#   lr = linear regression, xt = extra trees.
# The empty lists are placeholders; each one is replaced by a fitted pipeline
# inside the loop below (the key order fixes the row order of the result tables).
reg_models = {'el':[], 'nn':[], 'rf':[], 'lr':[], 'xt':[]}
# scores[model][metric][coordinate] collects one test-set value per corpus file.
scores = {m:{'mse':{'lat':[], 'lon':[]}, 'mae':{'lat':[], 'lon':[]}} for m in reg_models}

root = 'https://github.com/greek-proverb-atlas/proverbs.gr/raw/refs/heads/main'

# The coordinate table does not depend on the corpus file, so it is downloaded
# once instead of once per loop iteration.
geolocs = pd.read_csv(f'{root}/data/geolocs.csv')

# Fixed hyperparameters, identical for every corpus file.
# NOTE(review): presumably the result of an earlier tuning run -- not shown here.
el_reg_best_params = {'alpha': 0.0006167913176584872, 'l1_ratio': 0.7277034948352414}
nn_reg_best_params = {'n_neighbors': 22, 'weights': 'uniform', 'p': 2}
rf_reg_best_params = {'n_estimators': 401, 'max_depth': 32, 'min_samples_split': 3, 'min_samples_leaf': 6}
xt_reg_best_params = {'n_estimators': 300, 'max_depth': 32, 'min_samples_split': 7, 'min_samples_leaf': 8}

for i in range(2,6):
    balanced_corpus = pd.read_csv(f"{root}/data/balanced_corpus_{i}.csv", index_col=0).reset_index()

    # quality assurance code: overwrite the coordinates with the values from
    # geolocs (one column per area; row 0 is used as lat, row 1 as lon).
    # Bracket assignment is used because attribute assignment only updates a
    # column that already exists and otherwise sets a plain attribute.
    balanced_corpus['lat'] = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[0])
    balanced_corpus['lon'] = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[1])

    # 5% of the rows form the test set; a dev set of the same size is then
    # carved out of the remaining training rows.
    train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=2023)
    train, dev = train_test_split(train, test_size=test.shape[0], random_state=2023)

    # One vectorizer instance is shared by all pipelines. No separate
    # vect.fit() call is needed: every Pipeline.fit below (re)fits it on the
    # same training texts.
    vect = TfidfVectorizer(ngram_range=(1,1), analyzer="word", max_df=0.5, min_df=10, lowercase=True)

    train_texts = train.text.values
    train_coords = train[["lat", "lon"]].values  # two regression targets: (lat, lon)

    # elastic
    el_reg = Pipeline([('vect', vect), ('reg', ElasticNet(**el_reg_best_params))])
    # knn
    nn_reg = Pipeline([('vect', vect), ('reg', KNeighborsRegressor(**nn_reg_best_params))])
    # forest
    # NOTE: no random_state is passed to the two tree ensembles, so their
    # scores vary from run to run.
    rf_reg = Pipeline([('vect', vect), ('reg', RandomForestRegressor(**rf_reg_best_params))])
    # extra trees
    xt_reg = Pipeline([('vect', vect), ('reg', ExtraTreesRegressor(**xt_reg_best_params))])
    # linear regression
    lr_reg = Pipeline([('vect', vect), ('reg', LinearRegression())])

    # Fit in the same order as before: el, nn, rf, xt, lr.
    for model_name, regressor in (('el', el_reg), ('nn', nn_reg), ('rf', rf_reg),
                                  ('xt', xt_reg), ('lr', lr_reg)):
        regressor.fit(train_texts, train_coords)
        reg_models[model_name] = regressor

    # Score every fitted pipeline on the held-out test rows, separately for
    # latitude (prediction column 0) and longitude (prediction column 1).
    for model_name, regressor in reg_models.items():
        preds = regressor.predict(test.text.values)
        mae_lat, mae_lon = (mean_absolute_error(test.lat.values, preds[:, 0]),
                            mean_absolute_error(test.lon.values, preds[:, 1]))
        mse_lat, mse_lon = (mean_squared_error(test.lat.values, preds[:, 0]),
                            mean_squared_error(test.lon.values, preds[:, 1]))
        scores[model_name]['mae']['lat'].append(mae_lat)
        scores[model_name]['mae']['lon'].append(mae_lon)
        scores[model_name]['mse']['lat'].append(mse_lat)
        scores[model_name]['mse']['lon'].append(mse_lon)


In [6]:
# Mean of every collected score list: one row per model, one column per
# metric/coordinate pair (mae-lon, mae-lat, mse-lon, mse-lat).
pd.DataFrame({
    f'{metric}-{coord}': pd.DataFrame({m: scores[m][metric][coord] for m in reg_models}).mean()
    for metric in ('mae', 'mse')
    for coord in ('lon', 'lat')
})

Unnamed: 0,mae-lon,mae-lat,mse-lon,mse-lat
el,2.852402,1.387452,15.00908,2.967122
nn,3.240114,1.482188,17.870896,3.375482
rf,2.885921,1.448215,15.665696,3.240905
lr,2.911631,1.380598,15.516081,2.987923
xt,2.881278,1.44552,15.571935,3.237811


In [7]:
# Standard error of the mean of every collected score list: one row per model,
# one column per metric/coordinate pair (mae-lon, mae-lat, mse-lon, mse-lat).
pd.DataFrame({
    f'{metric}-{coord}': pd.DataFrame({m: scores[m][metric][coord] for m in reg_models}).sem()
    for metric in ('mae', 'mse')
    for coord in ('lon', 'lat')
})

Unnamed: 0,mae-lon,mae-lat,mse-lon,mse-lat
el,0.027371,0.007265,0.281894,0.028659
nn,0.089338,0.004781,0.810607,0.020581
rf,0.020596,0.004682,0.265272,0.016261
lr,0.026549,0.01324,0.385361,0.047276
xt,0.023227,0.009773,0.278193,0.033256
