In [1]:
# https://github.com/spacyturk/spacyturk
import spacyturk
import pandas as pd
import numpy as np
from tqdm import tqdm

# downloads the spaCyTurk model
spacyturk.download("tr_floret_web_md")

# info about spaCyTurk installation and models
spacyturk.info()

  return torch._C._cuda_getDeviceCount() > 0


[1m

spaCyTurk version   0.1.0                         
Location            c:\Users\ioann\Repositories\Glass_models\Turkish\spacyturk
Platform            Windows-10-10.0.22621-SP0     
spaCy version       3.2.4                         
Python version      3.7.10                        
spaCyTurk models    tr_floret_web_md (3.2.0)      



### Load data

In [2]:
# Load RAW data
df = pd.read_csv('Data/_20220705-155847-TUR.txt', sep='\t', encoding='utf-16')

print('Data loaded')

Data loaded


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
values, types = [], []
for column in tqdm(df.columns):
    if ('ENG' in column): continue
    data = df[column].tolist()

    values += data
    types += len(data)*[column]

# Create DataFrame (value|type)
df = pd.DataFrame({})
df['value'] = values
df['type'] = types

100%|██████████| 90/90 [00:00<00:00, 267.87it/s]


#### Preprocess

In [4]:
CCVs = [
'CCV:00002',
'CCV:00003',
# 'CCV:00004',
'CCV:00005',
'CCV:00008',
# 'CCV:00011', Citizenship
# 'CCV:00012', Nationality
# 'CCV:00013', Sex
# 'CCV:00014',
'CCV:00022',
'CCV:00023',
# 'CCV:00025',
'CCV:00026',
'CCV:00028',
'CCV:00029',
'CCV:00030',
'CCV:00034',
# 'CCV:00035',
# 'CCV:00036',
# 'CCV:00038', Passport issuing country
# 'CCV:00039', Passport issuing authority
# 'CCV:00041',
# 'CCV:00042',
# 'CCV:00043',
# 'CCV:00047',
# 'CCV:00048',
# 'CCV:00049',
'CCV:00052',
# 'CCV:00053',
# 'CCV:00054',
'CCV:00057',
# 'CCV:00058',
# 'CCV:00059',
'CCV:00061',
# 'CCV:00062',
# 'CCV:00063',
'CCV:00065',
# 'CCV:00066',
# 'CCV:00067',
# 'CCV:00068', Tertiary school type: 'University', 'TechnicalSchool', 'PolytechnicSchool', 'MedicalSchool', 'TradeSchool'
# 'CCV:00069',
# 'CCV:00071',
# 'CCV:00072',
'CCV:00073',
'CCV:00074',
# 'CCV:00079',
# 'CCV:00080', It is the same as CCV:00026
# 'CCV:00081',
# 'CCV:00082',
# 'CCV:00083', Insurance status: 'Insured', 'Protected', 'Uninsured'
# 'CCV:00084'
]

In [5]:
# Drop duplicates
df = df.drop_duplicates()
# Remove empty cells
df = df[df['value'] != '-']
# Select types
df = df[df['type'].isin(CCVs)]
# Convert CCV:00074-x to CCV:00074
df['type'] = df['type'].apply(lambda x:'CCV:00074' if '00074' in x else x)
# Reset index
df = df.reset_index().drop(['index'], axis=1)
print('Number of records: ', df.shape[0])

Number of records:  291033


In [7]:
# Identify the value which may belong to two different categories
tt = df.groupby('value').count().reset_index()


for x in tqdm(tt[tt['type'] > 1]['value']):

    Categories = list(df[df['value'] == x]['type'].unique())
    if   Categories == ['CCV:00005', 'CCV:00008']:
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00005'))]
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00008'))]
    elif Categories == ['CCV:00002', 'CCV:00023']:
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00023'))]   
    elif Categories == ['CCV:00003', 'CCV:00023']:
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00023'))]   
    elif Categories == ['CCV:00002', 'CCV:00003']:
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00003'))]
    elif Categories == ['CCV:00002', 'CCV:00003', 'CCV:00023']: # Only 2 cases
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00003'))] 
        df = df[~((df['value'] == x) & (df['type'] == 'CCV:00023'))]        

100%|██████████| 2042/2042 [02:38<00:00, 12.85it/s]


### Create input/output ndarrays

In [55]:
from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas(desc="Progress")

# Load the model using spaCy
import spacy
nlp = spacy.load("tr_floret_web_md")

def convert(x):
    try:
        return nlp(x).vector
    except:
        return 300*[np.NaN]
    
X = np.array( df['value'].progress_apply( convert ).to_list() )
y = df['type'].to_numpy()


# Remove NaN records
df_temp = pd.DataFrame(X)
df_temp['y'] = y
df_temp = df_temp.dropna()

# Save data
np.savez('Data/data.npz', X=df_temp.iloc[:,:-1].to_numpy(), y=df_temp.iloc[:,-1].to_numpy())

0it [00:00, ?it/s]

Progress:   0%|          | 0/287224 [00:00<?, ?it/s]