<a href="https://colab.research.google.com/github/ipavlopoulos/paremia/blob/main/tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [4]:
# Read the balanced corpus from the local checkout when the file is present;
# otherwise read the same relative path from the raw GitHub copy of the repository.
root = 'https://raw.githubusercontent.com/ipavlopoulos/paremia/main/'
corpus_path = "data/balanced_corpus.csv"
corpus_path = corpus_path if os.path.exists(corpus_path) else root + corpus_path
balanced_corpus = pd.read_csv(corpus_path, index_col=0)

In [5]:
# Hold out 5% of the corpus as the test set, then split a dev set with the same
# number of rows as the test set off the remaining training rows (both splits seeded).
train, test = train_test_split(balanced_corpus, random_state=2023, test_size=0.05)
train, dev = train_test_split(train, random_state=2023, test_size=len(test))

In [6]:
 # Sandbox subset: the grid searches below are fitted on this seeded random sample of 1000 training rows.
tuning_set = train.sample(n=1000, random_state=2023)

# Text regression

#### Tuning the ExtraTrees

In [None]:
# Grid-search the TF-IDF settings and the number of trees of an ExtraTrees text
# regressor on the sandbox sample, scoring by R^2 with 3-fold cross-validation.
# The regressor is seeded (same seed as the data splits) so that repeated runs of
# this cell select the same configuration instead of drifting with tree randomness.
xtra = Pipeline([('vect', TfidfVectorizer()), ('clf', ExtraTreesRegressor(random_state=2023))])
grid_param = [
    {
        # The single-element "vect"/"clf" entries swap the listed estimator into
        # that pipeline step and expose it as best_params_["vect"] / ["clf"].
        "vect": [TfidfVectorizer()],
        "vect__analyzer": ["word", "char"],
        "vect__lowercase": [True],
        "vect__max_df": [0.5],
        "vect__min_df": [2, 10],
        "vect__ngram_range": [(1, 1), (1, 2), (2, 5)],
        "clf": [ExtraTreesRegressor(random_state=2023)],
        "clf__n_estimators": [10, 100],
    },
]

gsc = GridSearchCV(estimator=xtra, param_grid=grid_param, scoring='r2', cv=3)
# Targets are the two coordinate columns, so the regressor is fitted on (lat, lon) pairs.
tuning_result = gsc.fit(tuning_set.text.values, tuning_set[["lat", "lon"]].values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: 0.061708 using {'clf': ExtraTreesRegressor(), 'clf__n_estimators': 100, 'vect': TfidfVectorizer(analyzer='char', max_df=0.5, min_df=10, ngram_range=(1, 2)), 'vect__analyzer': 'char', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 10, 'vect__ngram_range': (1, 2)}


In [None]:
# Refit the best configuration found by the grid search on the full training set
# and report the per-coordinate errors on the test set.
# clone() returns an unfitted copy of the winning pipeline with every tuned
# hyper-parameter (vect__* and clf__*) set. Rebuilding the pipeline from
# best_params_["vect"] / best_params_["clf"] only carries those values when the
# search mutated the estimator objects listed in the grid, which newer
# scikit-learn releases no longer do.
xtra_tuned = clone(tuning_result.best_estimator_)
xtra_tuned.fit(train.text.values, train[["lat", "lon"]].values)
# Column 0 of the prediction is latitude and column 1 longitude, matching the target order above.
predicted_coords = xtra_tuned.predict(test.text.values)
print(f"MAE of lat and lon: {mean_absolute_error(test.lat.values, predicted_coords[:, 0]):.2f} & {mean_absolute_error(test.lon.values, predicted_coords[:, 1]):.2f}")
print(f"MSE of lat and lon: {mean_squared_error(test.lat.values, predicted_coords[:, 0]):.2f} & {mean_squared_error(test.lon.values, predicted_coords[:, 1]):.2f}")

MAE of lat and lon: 1.22 & 1.74
MSE of lat and lon: 2.48 & 5.11


#### Tuning the Forest

In [None]:
# Grid-search the TF-IDF settings and the number of trees of a RandomForest text
# regressor on the sandbox sample, scoring by R^2 with 3-fold cross-validation.
# The regressor is seeded (same seed as the data splits) so that repeated runs of
# this cell select the same configuration instead of drifting with tree randomness.
forest = Pipeline([('vect', TfidfVectorizer()), ('clf', RandomForestRegressor(random_state=2023))])
grid_param = [
    {
        # The single-element "vect"/"clf" entries swap the listed estimator into
        # that pipeline step and expose it as best_params_["vect"] / ["clf"].
        "vect": [TfidfVectorizer()],
        "vect__analyzer": ["word", "char"],
        "vect__lowercase": [True],
        "vect__max_df": [0.5],
        "vect__min_df": [2, 10],
        "vect__ngram_range": [(1, 1), (1, 2), (2, 5)],
        "clf": [RandomForestRegressor(random_state=2023)],
        "clf__n_estimators": [10, 100],
    },
]

gsc = GridSearchCV(estimator=forest, param_grid=grid_param, scoring='r2', cv=3)
# Targets are the two coordinate columns, so the regressor is fitted on (lat, lon) pairs.
tuning_result = gsc.fit(tuning_set.text.values, tuning_set[["lat", "lon"]].values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: 0.109010 using {'clf': RandomForestRegressor(), 'clf__n_estimators': 100, 'vect': TfidfVectorizer(analyzer='char', max_df=0.5, min_df=2, ngram_range=(2, 5)), 'vect__analyzer': 'char', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (2, 5)}


In [None]:
# Refit the best configuration found by the grid search on the full training set
# and report the per-coordinate errors on the test set.
# clone() returns an unfitted copy of the winning pipeline with every tuned
# hyper-parameter (vect__* and clf__*) set. Rebuilding the pipeline from
# best_params_["vect"] / best_params_["clf"] only carries those values when the
# search mutated the estimator objects listed in the grid, which newer
# scikit-learn releases no longer do.
forest_tuned = clone(tuning_result.best_estimator_)
forest_tuned.fit(train.text.values, train[["lat", "lon"]].values)
# Column 0 of the prediction is latitude and column 1 longitude, matching the target order above.
forest_coords = forest_tuned.predict(test.text.values)
print(f"MAE of lat and lon: {mean_absolute_error(test.lat.values, forest_coords[:, 0]):.2f} & {mean_absolute_error(test.lon.values, forest_coords[:, 1]):.2f}")
print(f"MSE of lat and lon: {mean_squared_error(test.lat.values, forest_coords[:, 0]):.2f} & {mean_squared_error(test.lon.values, forest_coords[:, 1]):.2f}")

MAE of lat and lon: 1.22 & 1.69
MSE of lat and lon: 2.45 & 5.00


#### Classification assessment

In [None]:
import geopy.distance
def degeocode(geocoded_areas, predictions, target_col="category"):
    """Map each predicted coordinate pair to the label of the closest known area.

    Parameters
    ----------
    geocoded_areas : DataFrame with `lat` and `lon` columns plus `target_col`.
    predictions : iterable of pairs; element 0 is used as latitude, element 1 as longitude.
    target_col : column of `geocoded_areas` whose value is returned for the closest row.

    Returns
    -------
    list with one `target_col` value per prediction, in input order.
    """
    def _closest_label(point):
        # geodesic distance (km) from this prediction to every geocoded row
        km_to_rows = geocoded_areas.apply(
            lambda row: geopy.distance.geodesic((row.lat, row.lon), (point[0], point[1])).km,
            axis=1,
        )
        # argmin is positional, hence iloc
        return geocoded_areas.iloc[km_to_rows.argmin()][target_col]

    return [_closest_label(point) for point in predictions]

def km_distance(geocodes, gold_locs, pred_ll, target_col="category"):
    """Geodesic error in km between each gold location and its predicted coordinates.

    Parameters
    ----------
    geocodes : DataFrame with `lat` and `lon` columns plus `target_col`; the first
        row whose `target_col` equals a gold label supplies that label's coordinates.
    gold_locs : iterable of gold labels, aligned with `pred_ll`.
    pred_ll : iterable of pairs; element 0 is used as latitude, element 1 as longitude.
    target_col : column of `geocodes` that holds the labels.

    Returns
    -------
    pandas Series of distances in km, one per (gold, prediction) pair.
    """
    def _error_km(gold, pred):
        # first matching row wins; indexing [0] fails if the label is absent from `geocodes`
        gold_lat, gold_lon = geocodes.loc[geocodes[target_col] == gold, ["lat", "lon"]].values[0]
        return geopy.distance.geodesic((gold_lat, gold_lon), (pred[0], pred[1])).km

    return pd.Series([_error_km(gold, pred) for gold, pred in zip(gold_locs, pred_ll)])

In [None]:
# interpreting the results in terms of predicted distance
# NOTE(review): this cell originally read `_train`, `xtra_preds` and `df_test`, none of
# which is defined in any cell above, and looked labels up in a `category` column while
# the rest of the notebook uses `area`. It is rewired to the train/test frames and the
# ExtraTrees test predictions (`predicted_coords`) built above — confirm this is the intent.
# Assumes every area maps to a single (lat, lon) pair, so the first training row of an
# area can serve as its geocode — TODO confirm.
area_geocodes = train.drop_duplicates("area")
test_areas = set(test.area.unique())
unseen_areas = test_areas - set(train.area.unique())
# km_distance cannot look up an area that never occurs in train, so score only the known ones.
known = test.area.isin(train.area).values
dist = km_distance(geocodes=area_geocodes, gold_locs=test.area.values[known], pred_ll=predicted_coords[known], target_col="area")
print(f"W/XtraTrees, {25}% of the predicted places fall within {dist.quantile(0.25):.2f}kms")
print(f"(NOTE: {len(unseen_areas)} areas out of the {len(test_areas)} were not in the train)")

# Classification

#### SVC

In [14]:
# Grid-search the TF-IDF settings of a LinearSVC text classifier on the sandbox
# sample, scoring by accuracy with 3-fold cross-validation.
# The classifier is seeded (same seed as the data splits) so that repeated runs of
# this cell do not depend on the solver's random number generator.
svc = Pipeline([('vect', TfidfVectorizer()), ('clf', LinearSVC(random_state=2023))])
grid_param = [
    {
        # The single-element "vect"/"clf" entries swap the listed estimator into
        # that pipeline step and expose it as best_params_["vect"] / ["clf"].
        "vect": [TfidfVectorizer()],
        "vect__analyzer": ["word", "char"],
        "vect__lowercase": [True],
        "vect__max_df": [0.5],
        "vect__min_df": [2, 10],
        "vect__ngram_range": [(1, 1), (1, 2), (2, 5)],
        "clf": [LinearSVC(random_state=2023)],
    },
]

gsc = GridSearchCV(estimator=svc, param_grid=grid_param, scoring='accuracy', cv=3)
# The classification target is the `area` label of each text.
tuning_result = gsc.fit(tuning_set.text.values, tuning_set.area.values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: 0.163028 using {'clf': LinearSVC(), 'vect': TfidfVectorizer(analyzer='char', max_df=0.5, min_df=10, ngram_range=(2, 5)), 'vect__analyzer': 'char', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 10, 'vect__ngram_range': (2, 5)}


In [19]:
# benchmarking text classification algorithms
def benchmark_clf(train, dev):
    """Fit several text classifiers on `train` and print a classification report on `dev`.

    Each classifier is built with its default constructor arguments and placed behind
    the same character n-gram TF-IDF vectorizer.

    Parameters
    ----------
    train : DataFrame with `text` and `area` columns, used for fitting.
    dev : DataFrame with `text` and `area` columns, used for evaluation.
        Previously this argument was ignored and the global `test` frame was
        scored instead; the function now evaluates on whatever is passed in.

    Returns
    -------
    None; one report per algorithm is printed.
    """
    for algo_name, algorithm in (('SVM', LinearSVC),
                                 ('KNN', KNeighborsClassifier),
                                 ('Forest', RandomForestClassifier),
                                 ('LR', LogisticRegression)):
        # A fresh vectorizer per algorithm so no fitted state is shared between pipelines.
        vect = TfidfVectorizer(ngram_range=(2,5), analyzer="char", max_df=0.5, min_df=10, lowercase=True)
        locator = Pipeline([('vect', vect), ('clf', algorithm())])
        locator.fit(train.text.values, train.area.values)
        preds = locator.predict(dev.text.values)
        print(f'{algo_name}\n{classification_report(dev.area.values, preds)}')

# The test split is passed as the evaluation frame here.
benchmark_clf(train, test)

SVM
                 precision    recall  f1-score   support

        Ήπειρος       0.10      0.09      0.09        23
        Αιτωλία       0.35      0.29      0.32        24
        Αμοργός       0.27      0.32      0.29        22
Ανατολική Θράκη       0.18      0.25      0.21        24
        Αρκαδία       0.14      0.13      0.14        31
          Αχαΐα       0.41      0.34      0.37        32
      Επτάνησος       0.39      0.61      0.47        23
         Εύβοια       0.05      0.05      0.05        20
      Θεσπρωτία       0.21      0.18      0.20        22
          Θράκη       0.23      0.20      0.21        25
       Ιωάννινα       0.24      0.14      0.17        29
       Κάρπαθος       0.43      0.36      0.39        28
     Κεφαλληνία       0.11      0.11      0.11        27
          Κρήτη       0.47      0.30      0.37        30
         Κύπρος       0.68      0.88      0.76        24
         Λέσβος       0.43      0.50      0.46        24
        Λακωνία       0.10