In [34]:
import pandas as pd
import numpy as np
from tqdm import tqdm

### Load data

In [None]:
# Import data
df = pd.read_csv('Data/_20230622-130921_training.tsv', sep='\t', header=None)

# Rename columns
df.rename(columns = {0: 'Value',
                     1: 'Evidence_type',
                     2: 'CCV'}, 
          inplace = True)

# Remove ENG instances
ENG_ccvs = [ccv for ccv in df['CCV'].unique() if 'ENG' in ccv]
df = df[ ~df['CCV'].isin(ENG_ccvs) ]

# Remove duplicates
df.drop_duplicates(inplace=True)

df = df.reset_index().drop('index', axis=1)

In [None]:
# # Import data
# df = pd.read_csv('Data/_20230622-130921_training.tsv', sep='\t', header=None)

# # Rename columns
# df.rename(columns = {0: 'Value',
#                      1: 'Evidence_type',
#                      2: 'CCV'}, 
#           inplace = True)


# # Remove duplicates
# df.drop_duplicates(inplace=True)

# # Selected CCVs, which can be handled with hard-rules
# selected_CCVs = ['CCV:00004', 'CCV:00012', 'CCV:00013', 'CCV:00014', 
#                  'CCV:00071', 'CCV:00072', 'CCV:00073', 'CCV:00084', 
#                  'CCV:00094', 'CCV:00034', 'CCV:00035', 'CCV:00036', 'CCV:00100',
#                  'CCV:00042', 'CCV:00043', 'CCV:00047', 'CCV:00048', 
#                  'CCV:00066', 'CCV:00067', 'CCV:00068', 'CCV:00069', 
#                  'CCV:00083', 'CCV:00028', 'CCV:00029', 'CCV:00079', 
#                  'CCV:00080', 'CCV:00081', 'CCV:00082', 'CCV:00085', 
#                  'CCV:00087', 'CCV:00088', 'CCV:00089', 'CCV:00090', 
#                  'CCV:00092', 'CCV:00093', 'CCV:00038', 'CCV:00022', 
#                  'CCV:00025', 'CCV:00026', 'CCV:00041', 'CCV:00053', 
#                  'CCV:00054', 'CCV:00049', 'CCV:00057', 'CCV:00058', 
#                  'CCV:00059', 'CCV:00061', 'CCV:00062', 'CCV:00063',
#                  'CCV:00096', 'CCV:00097', 'CCV:00098', 'CCV:00099']

# df = df[ df['CCV'].isin(selected_CCVs) ]

# df = df.reset_index().drop('index', axis=1)


### Preprocess

In [40]:
def is_float(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

def contains_only_letters(string):
    for char in string:
        if not char.isalpha():
            return False
    return True

def hard_rules_GR(text, Evidence_type):
    # Hard-rule for 'CCV:00012'
    if 'Portuguese' == text: return 'CCV:00012'
    # Hard-rule for 'CCV:00013'
    if text in ['Male', 'Female']: return 'CCV:00013'
    # Hard-rule for 'CCV:00014'
    if is_float(text) and Evidence_type == 'Passport': return 'CCV:00014'
    # Hard-rule for 'CCV:00022'
    if len(text) == 14 and text[:8].isdigit() and text[8] == ' ' and contains_only_letters(text[-3:-1]) and text[-1].isdigit(): return 'CCV:00022'
    # Hard-rule for 'CCV:00026'
    if 'polícia,' in text and Evidence_type == 'ID': return 'CCV:00026'
    # # Hard-rule for 'CCV:00028'
    # if 'Cartório de Registro Civil' in text: return 'CCV:00028'
    # # Hard-rule for 'CCV:00029'
    # if 'ΚΕΠ' in text and Evidence_type == 'BirthCertificate': return 'CCV:00029'
    # Hard-rule for 'CCV:00034'
    if len(text) == 9 and contains_only_letters(text[:2]) and text[2:].isdigit(): return 'CCV:00034'
    # Hard-rule for 'CCV:00038'
    if 'PRT' == text: return 'CCV:00038'
    # Hard-rule for 'CCV:00041'
    if len(text) == 11 and text.isdigit(): return 'CCV:00041'  
    # Hard-rule for 'CCV:00053'
    if Evidence_type == 'PrimarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00053'
    # Hard-rule for 'CCV:00054'
    if Evidence_type == 'PrimarySchool' and text.isdigit() and len(text) <= 2: return 'CCV:00054'
    # Hard-rule for 'CCV:00057'
    if 'Secundária' in text: return 'CCV:00057'
    # Hard-rule for 'CCV:00058'
    if Evidence_type == 'LowerSecondarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00058'
    # Hard-rule for 'CCV:00059'
    if Evidence_type == 'LowerSecondarySchool' and is_float(text) and len(text) < 8: return 'CCV:00059'
    # Hard-rule for 'CCV:00061'
    if 'Liceu' in text: return 'CCV:00061'
    # Hard-rule for 'CCV:00062'
    if Evidence_type == 'HigherSecondarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00062'
    # Hard-rule for 'CCV:00063'
    if Evidence_type == 'HigherSecondarySchool' and is_float(text) and len(text) < 8: return 'CCV:00063'
    # Hard-rule for 'CCV:00066'
    if Evidence_type == 'TertiarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00066'
    # Hard-rule for 'CCV:00067'
    if Evidence_type == 'TertiarySchool' and text.isdigit() and len(text) <= 2: return 'CCV:00067'
    # Hard-rule for 'CCV:00068'
    if Evidence_type == 'TertiarySchool' and text in ['University', 'MedicalSchool', 'TechnicalSchool', 'PolytechnicSchool', 'TradeSchool']: return 'CCV:00068'
    # Hard-rule for 'CCV:00069'
    if Evidence_type == 'TertiarySchool' and text.isdigit() and len(text) == 3: return 'CCV:00069'
    # Hard-rule for 'CCV:00071'
    if '%' in text and is_float(text[:-1]): return 'CCV:00071'    
    # Hard-rule for 'CCV:00073'
    if 'Hospital' in text and Evidence_type == 'DisabilityRecord' : return 'CCV:00073' 
    # Hard-rule for 'CCV:00080'
    if 'polícia,' in text and Evidence_type == 'CriminalRecord': return 'CCV:00080'
    # Hard-rule for 'CCV:00081'
    if text in ['True', 'False']: return 'CCV:00081' 
    # Hard-rule for 'CCV:00083'
    if text in ['Protected', 'Insured', 'Uninsured']: return 'CCV:00083'    
    # Hard-rule for 'CCV:00085'
    if '[' in text and Evidence_type == 'DisabilityRecord': return 'CCV:00085'
    # Hard-rule for 'CCV:00089'
    if 'Hospital' in text and Evidence_type == 'MedicalRecord' : return 'CCV:00089'
    # Hard-rule for 'CCV:00090'
    if '[' in text and Evidence_type == 'MedicalRecord': return 'CCV:00090'
    # Hard-rule for 'CCV:00094'
    if 'Conservatória do Registo Civil' in text and Evidence_type == 'ResidenceCertificate': return 'CCV:00094'
    # Hard-rule for 'CCV:00097'
    if text in ['Banco Santander Totta', 'Novo Banco', 'Millennium bcp', 'Banco BPI', 'Caixa Geral de Depósitos']: return 'CCV:00097'
    # Hard-rule for 'CCV:00098'
    if 'PT' in text and len(text) == 24: return 'CCV:00098'
    # Hard-rule for 'CCV:00099'
    if Evidence_type == 'ResidenceCertificate' and text.isdigit() and len(text) == 17: return 'CCV:00099'
    # Hard-rule for 'CCV:00100'
    if Evidence_type == 'ResidenceCertificate' and text.isdigit() and len(text) == 3: return 'CCV:00100'



In [41]:
# Cannot be classified: 'CCV:00004', 'CCV:00025', 'CCV:00028', 'CCV:00029', 'CCV:00035', 'CCV:00036', 'CCV:00042', 'CCV:00043'
# 'CCV:00047', 'CCV:00048', 'CCV:00049', CCV:00072, 'CCV:00084', 'CCV:00087', 'CCV:00088',
# 'CCV:00079', 'CCV:00082', 'CCV:00092', 'CCV:00093', 'CCV:00096'

In [42]:
from tqdm import tqdm

for i in tqdm( range(df.shape[0]) ):
    text = df['Value'].iloc[i]
    Evidence_type = df['Evidence_type'].iloc[i]
    CCV = df['CCV'].iloc[i]


    if CCV in ['CCV:00004', 'CCV:00025', 'CCV:00028', 'CCV:00029', 'CCV:00035', 'CCV:00036', 'CCV:00042', 'CCV:00043',
    'CCV:00047', 'CCV:00048', 'CCV:00049', 'CCV:00072', 'CCV:00084', 'CCV:00087', 'CCV:00088',
    'CCV:00079', 'CCV:00082', 'CCV:00092', 'CCV:00093', 'CCV:00096']: continue

        
    pred = hard_rules_GR(text, Evidence_type)

    if CCV in ['CCV:00008', 'CCV:00005', 'CCV:00003', 'CCV:00052', 'CCV:00065', 'CCV:00002', 'CCV:00011', 'CCV:00030']:
        if pred == None:
            continue
        else:
            print('(1)', i, text, Evidence_type, pred, CCV)
            break

    if pred != CCV:
        print('(2)', i, text, Evidence_type, pred, CCV)
        break
    

100%|██████████| 13479679/13479679 [09:18<00:00, 24117.64it/s]
