In [None]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
# python -m spacy download pt_core_news_sm

### Load data

In [None]:
# Read the tab-separated training export. The file is parsed without a header
# row, so pandas labels the columns with integers (0, 1, 2, ...).
df = pd.read_csv(
    'Data/_20230622-130921_training.tsv',
    sep='\t',
    header=None,
)

### Preprocess

In [None]:
# Name the positional columns, discard the evidence type, and remove rows
# that are exact duplicates of an earlier row.
df = (
    df.rename(columns={0: 'Value', 1: 'Evidence_type', 2: 'CCV'})
      .drop(columns='Evidence_type')
      .drop_duplicates()
)

# Discard every record whose CCV label contains the substring 'ENG'.
ENG_ccvs = [label for label in df['CCV'].unique() if 'ENG' in label]
df = df.loc[~df['CCV'].isin(ENG_ccvs)]

# CCVs that can be handled with hard rules, so they are excluded here.
drop_CCVs = [
    'CCV:00004', 'CCV:00012', 'CCV:00013', 'CCV:00014', 'CCV:00071', 'CCV:00072',
    'CCV:00073', 'CCV:00084', 'CCV:00094', 'CCV:00034', 'CCV:00035', 'CCV:00036',
    'CCV:00100', 'CCV:00042', 'CCV:00043', 'CCV:00047', 'CCV:00048', 'CCV:00066',
    'CCV:00067', 'CCV:00068', 'CCV:00069', 'CCV:00083', 'CCV:00028', 'CCV:00029',
    'CCV:00079', 'CCV:00080', 'CCV:00081', 'CCV:00082', 'CCV:00085', 'CCV:00087',
    'CCV:00088', 'CCV:00089', 'CCV:00090', 'CCV:00092', 'CCV:00093', 'CCV:00038',
    'CCV:00022', 'CCV:00025', 'CCV:00026', 'CCV:00041', 'CCV:00053', 'CCV:00054',
    'CCV:00049', 'CCV:00057', 'CCV:00058', 'CCV:00059', 'CCV:00061', 'CCV:00062',
    'CCV:00063', 'CCV:00096', 'CCV:00097', 'CCV:00098', 'CCV:00099',
]

# Filter out the rule-handled CCVs and renumber the rows 0..n-1.
df = df.loc[~df['CCV'].isin(drop_CCVs)].reset_index(drop=True)
print('Number of records: ', df.shape[0])


# Sketch of the hard rules for some of the dropped CCVs (not executed here):
# if 'Γυμνάσιο' in text: return 'CCV:00057'
# if 'Λύκειο' in text: return 'CCV:00061'
# if 'Νοσοκομείο' in text: return 'CCV:00073' or 'CCV:00089'
# if 'ΚΕΠ' in text: return 'CCV:00094' or 'CCV:00029'
# if 'Ληξιαρχείο' in text: return 'CCV:00028'
# if 'ΑΤ,' in text: return 'CCV:00080'
# if len(text) > 30: return 'CCV:00085'
# if 'GR' in text: return 'CCV:00098'
# if 'bank' in text or 'Bank' in text: return 'CCV:00097'

# Cell output: the CCVs still present in the data, followed by the CCVs named
# in the rule sketch above.
list(df.CCV.unique()) + [
    'CCV:00057', 'CCV:00061', 'CCV:00073', 'CCV:00089', 'CCV:00094', 'CCV:00029',
    'CCV:00028', 'CCV:00080', 'CCV:00085', 'CCV:00098', 'CCV:00097',
]

In [None]:
# If an instance (a row identical in every column except CCV) is labelled with
# two or more CCVs, keep exactly one of those rows, chosen at random.
#
# Shuffling first turns "keep the first occurrence" into a random pick; the
# fixed seed makes that pick reproducible across kernel restarts.
# drop_duplicates(subset=...) selects rows directly, so the result does not
# depend on index labels happening to equal row positions.
df = (
    df.sample(frac=1, random_state=42)
      .drop_duplicates(subset=df.columns.difference(['CCV']).tolist())
      .reset_index(drop=True)  # keep row labels aligned with positions
)

### Create input/output ndarrays

In [None]:
# Embed every value with spaCy and persist the arrays for the ML cells.
# NOTE(review): "pt_core_news_sm" is a Portuguese pipeline, while the hard-rule
# sketch in the preprocessing cell uses Greek keywords — confirm this is the
# intended language model. Presumably `.vector` here comes from the pipeline's
# tok2vec output, since the small spaCy pipelines ship without static word
# vectors — TODO confirm.
nlp = spacy.load("pt_core_news_sm")

# nlp.pipe processes the texts in batches, which is spaCy's recommended way to
# run a pipeline over many documents (the per-text nlp(x) loop is much slower).
texts = df['Value'].values
X = np.array([doc.vector for doc in tqdm(nlp.pipe(texts), total=len(texts))])

# 'Evidence_type' is dropped during preprocessing, so indexing it
# unconditionally raises KeyError; store it only when the column is present.
# Object-dtype arrays (e.g. the CCV strings) are pickled by np.savez, so the
# file has to be read back with np.load(..., allow_pickle=True).
arrays = {'X': X, 'y': df['CCV'].to_numpy()}
if 'Evidence_type' in df.columns:
    arrays['entity_type'] = df['Evidence_type'].to_numpy()

np.savez('Data/data.npz', **arrays)

  0%|          | 865/376500 [00:20<47:45, 131.10it/s]

# ML

In [None]:
# LabelEncoder is not imported anywhere else in the notebook, so import it
# here to keep this cell runnable on a fresh kernel.
from sklearn.preprocessing import LabelEncoder

# X = ...  (placeholder left by the author; presumably the feature matrix
# built/saved in the previous section — TODO confirm)
y = df['CCV'].to_numpy()

# Map the CCV strings to integer class ids 0..n_classes-1.
# `le` keeps the mapping, so le.inverse_transform(...) recovers the CCV strings.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
import xgboost

# Fixed hyper-parameters; the trailing comments record the search space that
# was used (or considered) for tuning each one.
params = {
    'n_estimators'      : 50,
    'learning_rate'     : 1e-1,
    'max_depth'         : 6,
    'reg_alpha'         : 20,   # trial.suggest_categorical('reg_alpha', [10, 20, 30])
    'reg_lambda'        : 1.0,  # trial.suggest_loguniform('reg_lambda', 0, 1)
    'gamma'             : 1,    # trial.suggest_loguniform('gamma', 1, 9)
    'min_child_weight'  : 2,    # trial.suggest_int('min_child_weight', 2, 4)
    'max_leaves'        : 2,    # trial.suggest_int('max_leaves', 2, 5)
    'eval_metric'       : 'auc',
}

# 'multi:softprob' instead of 'multi:softmax': the multiclass 'auc' metric
# needs class probabilities, which softmax does not output.
# XGBClassifier.predict still returns class labels with this objective.
model = xgboost.XGBClassifier(
    objective='multi:softprob',
    n_jobs=-1,
    validate_parameters=True,
    verbosity=1,
    tree_method='hist',
    **params,
)

# Inverse-frequency class weights: weights[c] = n_samples / count(c).
# `y` must already hold integer class ids (as produced by the label-encoding
# cell); it is deliberately NOT re-encoded here, because refitting `le` on the
# encoded ids would replace the CCV-string mapping with an int-to-int one.
weights = y.shape[0] / np.bincount(y)

# NOTE(review): eval_set is the training data itself, so the reported metric
# is a training score, not a validation score.
model.fit(X, y,
          eval_set=[(X, y)],
          sample_weight=weights[y],  # per-sample weight looked up by class id
          verbose=10);

In [None]:
import umap

# Dimensionality-reduction model: project X down to 15 components using a
# 5-neighbour graph and euclidean distances; the seed is fixed at 42.
umap_model = umap.UMAP(
    random_state=42,
    metric='euclidean',
    n_components=15,
    n_neighbors=5,
)

# Fit on X and keep the resulting low-dimensional embedding.
X_d = umap_model.fit_transform(X)