In [86]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm

# python -m spacy download el_core_news_sm

### Load data

In [87]:
# Load RAW data
df = pd.read_csv('Data/_20220711-021643-GRE.txt', sep='\t', encoding='utf-16')

print('Data loaded')

  interactivity=interactivity, compiler=compiler, result=result)


Data loaded


In [88]:
values, types = [], []
for column in tqdm(df.columns):
    if ('ENG' in column): continue
    data = df[column].tolist()

    values += data
    types += len(data)*[column]

# Create DataFrame (value|type)
df = pd.DataFrame({})
df['value'] = values
df['type'] = types

100%|██████████| 90/90 [00:00<00:00, 140.17it/s]


#### Preprocess

In [89]:
CCVs = [
'CCV:00002',
'CCV:00003',
# 'CCV:00004',
'CCV:00005',
'CCV:00008',
# 'CCV:00011', Citizenship
# 'CCV:00012', Nationality
# 'CCV:00013', Sex
# 'CCV:00014',
'CCV:00022',
'CCV:00023',
# 'CCV:00025',
'CCV:00026',
'CCV:00028',
'CCV:00029',
'CCV:00030',
'CCV:00034',
# 'CCV:00035',
# 'CCV:00036',
# 'CCV:00038', Passport issuing country
# 'CCV:00039', Passport issuing authority
# 'CCV:00041',
# 'CCV:00042',
# 'CCV:00043',
# 'CCV:00047',
# 'CCV:00048',
# 'CCV:00049',
'CCV:00052',
# 'CCV:00053',
# 'CCV:00054',
'CCV:00057',
# 'CCV:00058',
# 'CCV:00059',
'CCV:00061',
# 'CCV:00062',
# 'CCV:00063',
'CCV:00065',
# 'CCV:00066',
# 'CCV:00067',
# 'CCV:00068', Tertiary school type: 'University', 'TechnicalSchool', 'PolytechnicSchool', 'MedicalSchool', 'TradeSchool'
# 'CCV:00069',
# 'CCV:00071',
# 'CCV:00072',
'CCV:00073',
'CCV:00074',
# 'CCV:00079',
# 'CCV:00080', It is the same as CCV:00026
# 'CCV:00081',
# 'CCV:00082',
# 'CCV:00083', Insurance status: 'Insured', 'Protected', 'Uninsured'
# 'CCV:00084'
]

In [90]:
# Drop duplicates
df = df.drop_duplicates()
# Remove empty cells
df = df[df['value'] != '-']
# Select types
df = df[df['type'].isin(CCVs)]
# Convert CCV:00074-x to CCV:00074
df['type'] = df['type'].apply(lambda x:'CCV:00074' if '00074' in x else x)
# Reset index
df = df.reset_index().drop(['index'], axis=1)
print('Number of records: ', df.shape[0])

Number of records:  296155


In [91]:
# Manually remove records, which belong to two categories 
# for example:
# Άγγελος ['CCV:00002' 'CCV:00003']
# Ακριβή ['CCV:00002' 'CCV:00003']
# Ανδρόνικος ['CCV:00002' 'CCV:00003']
# Αντώνης ['CCV:00002' 'CCV:00003']
# Αρχοντή ['CCV:00002' 'CCV:00003']
# Ευκρατίδης ['CCV:00002' 'CCV:00003']
# Θεσσαλονίκη ['CCV:00002' 'CCV:00023']
# Κέρκυρα ['CCV:00002' 'CCV:00023']
# Πλάτων ['CCV:00002' 'CCV:00003']
# Πλειώνη ['CCV:00002' 'CCV:00003']
# Σπάρτη ['CCV:00002' 'CCV:00023']

df = df[ ~((df['value'] == 'Άγγελος') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Ακριβή') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Ανδρόνικος') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Αντώνης') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Αρχοντή') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Ευκρατίδης') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Θεσσαλονίκη') & (df['type'] == 'CCV:00002'))]
df = df[ ~((df['value'] == 'Κέρκυρα') & (df['type'] == 'CCV:00002'))]
df = df[ ~((df['value'] == 'Πλάτων') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Πλειώνη') & (df['type'] == 'CCV:00002'))]
df = df[ ~((df['value'] == 'Σπάρτη') & (df['type'] == 'CCV:00002'))]

### Create input/output ndarrays

In [92]:
from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas(desc="Progress")


nlp = spacy.load("el_core_news_sm")

X = np.array( df['value'].progress_apply(lambda x: list(nlp(x).vector)).to_list() )

np.savez('Data/data.npz', X=X, y=df['type'].to_numpy())

0it [00:00, ?it/s]

Progress:   0%|          | 0/296144 [00:00<?, ?it/s]