In [8]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm

# python -m spacy download pt_core_news_sm

### Load data

In [9]:
# Load RAW data
df = pd.read_csv('Data/_20220705-160526-PRT.txt', sep='\t', encoding='utf-16')

print('Data loaded')

  interactivity=interactivity, compiler=compiler, result=result)


Data loaded


In [10]:
values, types = [], []
for column in tqdm(df.columns):
    if ('ENG' in column): continue
    data = df[column].tolist()

    values += data
    types += len(data)*[column]

# Create DataFrame (value|type)
df = pd.DataFrame({})
df['value'] = values
df['type'] = types

100%|██████████| 90/90 [00:01<00:00, 62.70it/s]


#### Preprocess

In [11]:
CCVs = [
'CCV:00002',
'CCV:00003',
# 'CCV:00004',
'CCV:00005',
'CCV:00008',
# 'CCV:00011', Citizenship
# 'CCV:00012', Nationality
# 'CCV:00013', Sex
# 'CCV:00014',
'CCV:00022',
'CCV:00023',
# 'CCV:00025',
'CCV:00026',
'CCV:00028',
'CCV:00029',
'CCV:00030',
'CCV:00034',
# 'CCV:00035',
# 'CCV:00036',
# 'CCV:00038', Passport issuing country
# 'CCV:00039', Passport issuing authority
# 'CCV:00041',
# 'CCV:00042',
# 'CCV:00043',
# 'CCV:00047',
# 'CCV:00048',
# 'CCV:00049',
'CCV:00052',
# 'CCV:00053',
# 'CCV:00054',
'CCV:00057',
# 'CCV:00058',
# 'CCV:00059',
'CCV:00061',
# 'CCV:00062',
# 'CCV:00063',
'CCV:00065',
# 'CCV:00066',
# 'CCV:00067',
# 'CCV:00068', Tertiary school type: 'University', 'TechnicalSchool', 'PolytechnicSchool', 'MedicalSchool', 'TradeSchool'
# 'CCV:00069',
# 'CCV:00071',
# 'CCV:00072',
'CCV:00073',
'CCV:00074',
# 'CCV:00079',
# 'CCV:00080', It is the same as CCV:00026
# 'CCV:00081',
# 'CCV:00082',
# 'CCV:00083', Insurance status: 'Insured', 'Protected', 'Uninsured'
# 'CCV:00084'
]

In [12]:
# Drop duplicates
df = df.drop_duplicates()
# Remove empty cells
df = df[df['value'] != '-']
# Select types
df = df[df['type'].isin(CCVs)]
# Convert CCV:00074-x to CCV:00074
df['type'] = df['type'].apply(lambda x:'CCV:00074' if '00074' in x else x)
# Reset index
df = df.reset_index().drop(['index'], axis=1)
print('Number of records: ', df.shape[0])

Number of records:  278821


In [13]:
# Manually remove records, which belong to two categories 
# for example:
# Abraão ['CCV:00002' 'CCV:00003']
# Afonso ['CCV:00002' 'CCV:00003']
# Alencar ['CCV:00002' 'CCV:00003']
# ...
df = df[ ~((df['value'] == 'Abraão') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Afonso') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Alencar') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Barcelos') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Barreiro') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Belarmino') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Braga') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Cantanhede') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Castelo Branco') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Chaves') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Coimbra') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Crispim') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Franco') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Gilberto') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Guimarães') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Hata') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Januário') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Jesus') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Kan') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Klinger') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Katsura') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Kuni') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Lineu') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Maia') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Micael') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Miki') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Minami') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Miranda') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Moura') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Onofre') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Paulino') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Romualdo') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Ronaldo') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Setúbal') & (df['type'] == 'CCV:00023'))]
df = df[ ~((df['value'] == 'Silvestre') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Silvério') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Suga') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Takaki') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Takamine') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Tamaki') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Teruya') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Tokuhisa') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Uno') & (df['type'] == 'CCV:00003'))]
df = df[ ~((df['value'] == 'Yui') & (df['type'] == 'CCV:00003'))]


# Identify the value which may belong to two different categories ['CCV:00005', 'CCV:00008']
tt = df.groupby('value').count().reset_index()
for x in tqdm(tt[tt['type'] > 1]['value']):
    df = df[ ~((df['value'] == x) & (df['type'] == 'CCV:00008'))]

100%|██████████| 346/346 [00:23<00:00, 14.42it/s]


### Create input/output ndarrays

In [7]:
from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas(desc="Progress")


nlp = spacy.load("pt_core_news_sm")

X = np.array( df['value'].progress_apply(lambda x: list(nlp(x).vector)).to_list() )

np.savez('Data/data.npz', X=X, y=df['type'].to_numpy())

0it [00:00, ?it/s]

Progress:   0%|          | 0/278431 [00:00<?, ?it/s]