In [None]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
# Requires the Greek spaCy model; install it once with: python -m spacy download el_core_news_sm

### Load data

In [None]:
# Read the tab-separated training file. header=None means the first row is
# treated as data, so the columns are labelled 0, 1, 2, ... by position.
training_path = 'Data/_20230622-052634_training.tsv'
df = pd.read_csv(training_path, sep='\t', header=None)

### Preprocess

In [None]:
# Preprocess the raw frame: name the columns, drop the unused one, remove exact
# duplicate rows, and filter out labels that this model should not see.
#
# The chain builds a new frame instead of mutating `df` in place, and the column
# drop tolerates the column already being gone, so re-running this cell on an
# already-preprocessed `df` does not raise.
df = (
    df
    # Positional column labels 0/1/2 -> descriptive names.
    .rename(columns={0: 'Value', 1: 'Evidence_type', 2: 'CCV'})
    # Evidence_type is not used afterwards.
    .drop(columns='Evidence_type', errors='ignore')
    # Remove rows that are identical in every remaining column.
    .drop_duplicates()
)

# Remove ENG instances: every label whose text contains 'ENG'.
# Non-string labels (e.g. NaN produced by an empty field) are skipped by the
# isinstance guard instead of raising TypeError on the `in` test; rows carrying
# such labels are therefore kept by this filter.
ENG_ccvs = [ccv for ccv in df['CCV'].unique()
            if isinstance(ccv, str) and 'ENG' in ccv]
df = df[~df['CCV'].isin(ENG_ccvs)]

# Drop CCVs, which can be handled with hard-rules
drop_CCVs = ['CCV:00004', 'CCV:00012', 'CCV:00013', 'CCV:00014',
             'CCV:00071', 'CCV:00072', 'CCV:00073', 'CCV:00084',
             'CCV:00094', 'CCV:00034', 'CCV:00035', 'CCV:00036', 'CCV:00100',
             'CCV:00042', 'CCV:00043', 'CCV:00047', 'CCV:00048',
             'CCV:00066', 'CCV:00067', 'CCV:00068', 'CCV:00069',
             'CCV:00083', 'CCV:00028', 'CCV:00029', 'CCV:00079',
             'CCV:00080', 'CCV:00081', 'CCV:00082', 'CCV:00085',
             'CCV:00087', 'CCV:00088', 'CCV:00089', 'CCV:00090',
             'CCV:00092', 'CCV:00093', 'CCV:00038', 'CCV:00022',
             'CCV:00025', 'CCV:00026', 'CCV:00041', 'CCV:00053',
             'CCV:00054', 'CCV:00049', 'CCV:00057', 'CCV:00058',
             'CCV:00059', 'CCV:00061', 'CCV:00062', 'CCV:00063',
             'CCV:00096', 'CCV:00097', 'CCV:00098', 'CCV:00099']

df = df[~df['CCV'].isin(drop_CCVs)]

# Renumber rows 0..n-1 after filtering; drop=True discards the old index
# instead of materialising it as a column that then has to be removed.
df = df.reset_index(drop=True)
print('Number of records: ', df.shape[0])

In [None]:
# If an instance is contained in two or more CCVs, then we randomly choose one of them.
#
# The rows are shuffled and then, among rows that agree on every column except
# 'CCV', only the first one in shuffled order is kept. The shuffle is seeded so
# that the surviving CCV -- and therefore the saved dataset -- is the same on
# every run; change SHUFFLE_SEED to draw a different selection.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# drop_duplicates() keeps the first occurrence of each repeated row, so the
# resulting index holds the labels of the rows to keep.
select_indices = df.drop(columns='CCV').drop_duplicates().index

# select_indices are index labels, so select with .loc (label-based). After the
# reset above labels equal positions, so this picks the same rows as before.
df = df.loc[select_indices]
print('Number of records: ', df.shape[0])

### Create input/output ndarrays

In [None]:
# Turn every 'Value' string into a spaCy document vector and save the feature
# matrix together with the CCV labels. (tqdm, spacy and np come from the import
# cell at the top of the notebook.)
nlp = spacy.load("el_core_news_sm")

# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of nlp(text) on every row. A generator has no length, so
# the total is passed to tqdm explicitly to keep a meaningful progress bar.
# NOTE(review): batching may change vectors at floating-point rounding level
# compared with one nlp(text) call per row -- confirm this is acceptable.
texts = df['Value'].tolist()
docs = tqdm(nlp.pipe(texts), total=len(texts))
X = np.array([doc.vector for doc in docs])

# NOTE(review): the labels are presumably Python strings, i.e. an object-dtype
# array, which np.savez stores via pickle; loading 'y' back would then need
# np.load(..., allow_pickle=True) -- confirm against the consumer of this file.
np.savez('Data/data.npz', X=X, y=df['CCV'].to_numpy())