In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

### Load data

In [23]:
# Import data: the TSV has no header row, so pandas numbers the columns 0..2.
df = pd.read_csv('Data/_20230622-052634_training.tsv', sep='\t', header=None)

# Rename columns (returns a new frame instead of mutating in place).
df = df.rename(columns={0: 'Value',
                        1: 'Evidence_type',
                        2: 'CCV'})

# Remove ENG instances: every CCV label whose text contains 'ENG'.
ENG_ccvs = [ccv for ccv in df['CCV'].unique() if 'ENG' in ccv]

# Filter, de-duplicate and renumber in one non-mutating chain. Calling
# drop_duplicates(inplace=True) on the filtered slice can emit
# SettingWithCopyWarning, and reset_index(drop=True) discards the old index
# directly instead of materialising it as a column and dropping it again.
df = (
    df[~df['CCV'].isin(ENG_ccvs)]
    .drop_duplicates()
    .reset_index(drop=True)
)

### Preprocess

In [26]:
# def is_float(string):
#     try:
#         float(string)
#         return True
#     except ValueError:
#         return False

# def contains_only_letters(string):
#     for char in string:
#         if not char.isalpha():
#             return False
#     return True

# def hard_rules_GR(text, Evidence_type):
#     if 'ΑΤ,' in text and Evidence_type in ['CriminalReIDcord', 'ID']: return 'CCV:00026'
#     if 'Ληξιαρχείο' in text: return 'CCV:00028'
#     if 'ΚΕΠ' in text and Evidence_type == 'BirthCertificate': return 'CCV:00029'
#     if 'Λύκειο' in text: return 'CCV:00061'
#     if 'Νοσοκομείο' in text and Evidence_type == 'DisabilityRecord' : return 'CCV:00073' 
#     if 'ΑΤ,' in text and Evidence_type == 'CriminalRecord': return 'CCV:00080'
#     if '[' in text and Evidence_type == 'DisabilityRecord': return 'CCV:00085'
#     if 'Νοσοκομείο' in text and Evidence_type == 'MedicalRecord' : return 'CCV:00089'
#     if '[' in text and Evidence_type == 'MedicalRecord': return 'CCV:00090'
#     if 'ΚΕΠ' in text and Evidence_type == 'ResidenceCertificate': return 'CCV:00094'
#     if 'bank' in text or 'Bank' in text: return 'CCV:00097'
#     if 'GR' in text and len(text) == 24: return 'CCV:00098'
#     if 'Greek' == text: return 'CCV:00012'
#     if text in ['Male', 'Female']: return 'CCV:00013'
#     if is_float(text) and Evidence_type == 'Passport': return 'CCV:00014'
#     if len(text) == 9 and contains_only_letters(text[:2]) and text[2:].isdigit(): return 'CCV:00034'
#     if 'GRE' == text: return 'CCV:00038'
#     if len(text) == 11 and text.isdigit(): return 'CCV:00041'
#     if Evidence_type == 'PrimarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00053'
#     if Evidence_type == 'PrimarySchool' and text.isdigit() and len(text) <= 2: return 'CCV:00054'
#     if 'Γυμνάσιο' in text: return 'CCV:00057'
#     if Evidence_type == 'LowerSecondarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00058'
#     if Evidence_type == 'LowerSecondarySchool' and is_float(text) and len(text) < 8: return 'CCV:00059'
#     if Evidence_type == 'HigherSecondarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00062'
#     if Evidence_type == 'HigherSecondarySchool' and is_float(text) and len(text) < 8: return 'CCV:00063'
#     if Evidence_type == 'TertiarySchool' and text.isdigit() and len(text) == 8: return 'CCV:00066'
#     if Evidence_type == 'TertiarySchool' and text.isdigit() and len(text) <= 2: return 'CCV:00067'
#     if Evidence_type == 'TertiarySchool' and text in ['University', 'MedicalSchool', 'TechnicalSchool', 'PolytechnicSchool', 'TradeSchool']: return 'CCV:00068'
#     if Evidence_type == 'TertiarySchool' and text.isdigit() and len(text) == 3: return 'CCV:00069'
#     if '%' in text: return 'CCV:00071'
#     if text in ['True', 'False']: return 'CCV:00081' 
#     if text in ['Protected', 'Insured', 'Uninsured']: return 'CCV:00083'
#     if Evidence_type == 'ResidenceCertificate' and text.isdigit() and len(text) == 17: return 'CCV:00099'
#     if Evidence_type == 'ResidenceCertificate' and text.isdigit() and len(text) == 3: return 'CCV:00100'

#     if len(text) == 9:
#         if contains_only_letters(text[:2]) and text[2] == ' ' and text[3:].isdigit(): return 'CCV:00022'


In [3]:
def is_float(string):
    """Return True when ``float(string)`` succeeds, False otherwise.

    Args:
        string: The value to probe. Anything ``float()`` accepts counts as a
            float, so besides '1.5' this includes '12', '1e3', 'nan' and 'inf'.

    Returns:
        bool: True if the value can be converted by ``float()``; False if the
        conversion raises ValueError (unparseable text) or TypeError (a
        non-numeric object such as None or a list).
    """
    try:
        float(string)
        return True
    except (TypeError, ValueError):
        # TypeError covers None / containers, which previously escaped as an
        # exception instead of answering "not a float".
        return False

def contains_only_letters(string):
    """Tell whether every character of ``string`` is alphabetic.

    Args:
        string: Iterable of characters to inspect.

    Returns:
        bool: False as soon as one character fails ``str.isalpha``; True
        otherwise. An empty string therefore yields True, unlike
        ``''.isalpha()``.
    """
    return all(symbol.isalpha() for symbol in string)

def hard_rules_GR(text, Evidence_type):
    """Assign a CCV code to a Greek evidence value via fixed pattern rules.

    The rules are tried strictly top to bottom and the first one that matches
    decides the result, so their relative order is part of the behaviour.

    Args:
        text: The evidence value; the rules apply string operations to it
            (substring tests, ``len``, ``isdigit``, slicing).
        Evidence_type: Name of the evidence the value belongs to, compared
            against literals such as 'Passport' or 'TertiarySchool'.

    Returns:
        The matching code as a string of the form 'CCV:000NN', or None when
        no rule applies.
    """
    # CCV:00012 -- exact value.
    if text == 'Greek':
        return 'CCV:00012'

    # CCV:00013 -- exact value.
    if text in ('Male', 'Female'):
        return 'CCV:00013'

    # CCV:00014 -- any float-parsable value on a passport.
    if is_float(text) and Evidence_type == 'Passport':
        return 'CCV:00014'

    # Every length-based rule below reuses this.
    size = len(text)

    # CCV:00022 -- two letters, a space, then six digit characters.
    if (size == 9
            and contains_only_letters(text[:2])
            and text[2] == ' '
            and text[3:].isdigit()):
        return 'CCV:00022'

    # CCV:00026
    # NOTE(review): 'CriminalReIDcord' looks like a mangled 'CriminalRecord'.
    # It is kept byte-for-byte: a 'CriminalRecord' entry here would pre-empt
    # the CCV:00080 rule further down. Confirm whether it should be removed.
    if 'ΑΤ,' in text and Evidence_type in ('CriminalReIDcord', 'ID'):
        return 'CCV:00026'

    # CCV:00028
    if 'Ληξιαρχείο' in text:
        return 'CCV:00028'

    # CCV:00029
    if 'ΚΕΠ' in text and Evidence_type == 'BirthCertificate':
        return 'CCV:00029'

    # CCV:00034 -- two letters followed directly by seven digit characters.
    if (size == 9
            and contains_only_letters(text[:2])
            and text[2:].isdigit()):
        return 'CCV:00034'

    # CCV:00038 -- exact value.
    if text == 'GRE':
        return 'CCV:00038'

    # CCV:00041 -- eleven digit characters.
    if size == 11 and text.isdigit():
        return 'CCV:00041'

    # CCV:00053 / CCV:00054 -- all-digit values on a primary-school record.
    if Evidence_type == 'PrimarySchool' and text.isdigit():
        if size == 8:
            return 'CCV:00053'
        if size <= 2:
            return 'CCV:00054'

    # CCV:00057
    if 'Γυμνάσιο' in text:
        return 'CCV:00057'

    # CCV:00058 / CCV:00059 -- lower-secondary-school values.
    if Evidence_type == 'LowerSecondarySchool':
        if text.isdigit() and size == 8:
            return 'CCV:00058'
        if is_float(text) and size < 8:
            return 'CCV:00059'

    # CCV:00061
    if 'Λύκειο' in text:
        return 'CCV:00061'

    # CCV:00062 / CCV:00063 -- higher-secondary-school values.
    if Evidence_type == 'HigherSecondarySchool':
        if text.isdigit() and size == 8:
            return 'CCV:00062'
        if is_float(text) and size < 8:
            return 'CCV:00063'

    # CCV:00066 .. CCV:00069 -- tertiary-school values.
    if Evidence_type == 'TertiarySchool':
        if text.isdigit():
            if size == 8:
                return 'CCV:00066'
            if size <= 2:
                return 'CCV:00067'
        if text in ('University', 'MedicalSchool', 'TechnicalSchool',
                    'PolytechnicSchool', 'TradeSchool'):
            return 'CCV:00068'
        if text.isdigit() and size == 3:
            return 'CCV:00069'

    # CCV:00071 -- contains '%' and everything before the last character
    # parses as a float.
    if '%' in text and is_float(text[:-1]):
        return 'CCV:00071'

    # CCV:00073
    if 'Νοσοκομείο' in text and Evidence_type == 'DisabilityRecord':
        return 'CCV:00073'

    # CCV:00080
    if 'ΑΤ,' in text and Evidence_type == 'CriminalRecord':
        return 'CCV:00080'

    # CCV:00081 -- exact value.
    if text in ('True', 'False'):
        return 'CCV:00081'

    # CCV:00083 -- exact value.
    if text in ('Protected', 'Insured', 'Uninsured'):
        return 'CCV:00083'

    # CCV:00085
    if '[' in text and Evidence_type == 'DisabilityRecord':
        return 'CCV:00085'

    # CCV:00089
    if 'Νοσοκομείο' in text and Evidence_type == 'MedicalRecord':
        return 'CCV:00089'

    # CCV:00090
    if '[' in text and Evidence_type == 'MedicalRecord':
        return 'CCV:00090'

    # CCV:00094
    if 'ΚΕΠ' in text and Evidence_type == 'ResidenceCertificate':
        return 'CCV:00094'

    # CCV:00097
    if 'bank' in text or 'Bank' in text:
        return 'CCV:00097'

    # CCV:00098 -- 24 characters containing 'GR'.
    if 'GR' in text and size == 24:
        return 'CCV:00098'

    # CCV:00099 / CCV:00100 -- all-digit values on a residence certificate.
    if Evidence_type == 'ResidenceCertificate' and text.isdigit():
        if size == 17:
            return 'CCV:00099'
        if size == 3:
            return 'CCV:00100'

    return None

In [28]:
# Cannot be classified: 'CCV:00004', 'CCV:00025', 'CCV:00035', 'CCV:00036', 'CCV:00042', 'CCV:00043',
# 'CCV:00047', 'CCV:00048', 'CCV:00049', 'CCV:00072', 'CCV:00084', 'CCV:00087', 'CCV:00088',
# 'CCV:00079', 'CCV:00082', 'CCV:00092', 'CCV:00093', 'CCV:00096'

In [29]:
from tqdm import tqdm

# Labels that are skipped outright: rows carrying them never reach the rules.
SKIPPED_CCVS = frozenset({
    'CCV:00004', 'CCV:00025', 'CCV:00035', 'CCV:00036', 'CCV:00042', 'CCV:00043',
    'CCV:00047', 'CCV:00048', 'CCV:00049', 'CCV:00072', 'CCV:00084', 'CCV:00087', 'CCV:00088',
    'CCV:00079', 'CCV:00082', 'CCV:00092', 'CCV:00093', 'CCV:00096',
})

# Labels for which the rules must stay silent: any non-None prediction on
# such a row is reported as a type-(1) failure.
NO_PREDICTION_CCVS = frozenset({
    'CCV:00008', 'CCV:00005', 'CCV:00003', 'CCV:00052',
    'CCV:00065', 'CCV:00002', 'CCV:00011', 'CCV:00030',
})

# Walk the three columns in lockstep instead of doing three .iloc[i] scalar
# lookups per row; `i` is still the positional row number used in the report.
rows = zip(df['Value'], df['Evidence_type'], df['CCV'])

# Stop at the first disagreement between the rules and the labelled CCV.
for i, (text, Evidence_type, CCV) in enumerate(tqdm(rows, total=len(df))):
    if CCV in SKIPPED_CCVS:
        continue

    pred = hard_rules_GR(text, Evidence_type)

    if CCV in NO_PREDICTION_CCVS:
        if pred is not None:
            # (1): a rule fired on a label that should have produced no prediction.
            print('(1)', i, text, Evidence_type, pred, CCV)
            break
        continue

    if pred != CCV:
        # (2): the rule output disagrees with the labelled CCV.
        print('(2)', i, text, Evidence_type, pred, CCV)
        break
    

100%|██████████| 19287254/19287254 [12:45<00:00, 25184.03it/s]


In [None]:
# df[(df['Evidence_type'] == 'Passport') & (df.CCV == 'CCV:00038')]