In [4]:
# Install required packages only in kernel (temporary).
%pip install polyfuzz==0.3.3 rapidfuzz==1.5.1 --user
import os
import sys
sys.path.insert(0, os.path.abspath('..\\'))
import numpy as np
import pandas as pd 
from polyfuzz import PolyFuzz
from polyfuzz.models import RapidFuzz
from rapidfuzz import fuzz
import time
from datetime import date

Collecting progressbar33==2.4
  Using cached progressbar33-2.4-py3-none-any.whl
Installing collected packages: progressbar33
Successfully installed progressbar33-2.4
Note: you may need to restart the kernel to use updated packages.


In [5]:
"""
This script extracts only the COVID-related records from a 1% random sample of the clean data.
"""
def get_clean_data():
    """Load the cleaned dataset from ``../data/interim/clean_data.csv``.

    The location is resolved relative to the current working directory. The
    path is assembled with ``os.path.join`` so that it is valid on POSIX
    systems as well as on Windows; the previous hard-coded backslashes only
    formed a real path on Windows.

    Returns:
        pandas.DataFrame: the contents of the CSV file, parsed with the
        default ``pd.read_csv`` settings.
    """
    path = os.path.abspath(os.path.join('..', 'data', 'interim', 'clean_data.csv'))
    return pd.read_csv(path)

def sample_data():
    """Return a reproducible 1% random sample of the clean data.

    The sample is drawn with a fixed ``random_state`` of 1, so repeated calls
    on the same input file select the same rows in the same order.

    Returns:
        pandas.DataFrame: the sampled rows of the frame produced by
        ``get_clean_data()``.
    """
    clean = get_clean_data()
    return clean.sample(frac=0.01, random_state=1)

def select_series():
    """Return the last six columns of the sampled data with every cell as a string.

    Every value is passed through ``str``, so missing values become the
    literal string ``'nan'``.
    NOTE(review): the last six columns are presumably the
    ``debido_a_(causa_a..f)`` cause-of-death fields - confirm against the
    input file.

    Returns:
        pandas.DataFrame: same index as ``sample_data()``, six columns, all
        values converted with ``str``.
    """
    sampled = sample_data()
    causes_df = sampled[sampled.columns[-6:]]
    # Element-wise str() applied column by column. This replaces
    # DataFrame.applymap, which is deprecated in recent pandas releases,
    # and works unchanged on older versions too.
    return causes_df.apply(lambda column: column.astype(object).map(str))

def extract_matches(series, queries=None, threshold=0.6):
    """Find the entries of ``series`` that fuzzily match any of the queries.

    Each query is matched against the whole series with PolyFuzz, using a
    RapidFuzz matcher scored by ``fuzz.token_set_ratio``. An entry counts as a
    hit when its ``Similarity`` in the resulting match table is at least
    ``threshold``.

    Args:
        series: pandas Series of strings to search.
        queries: list of one-element lists, each holding a term to match
            against. Defaults to ``[['covid-19'], ['sars-cov-2'],
            ['coronavirus']]``.
        threshold: minimum similarity for a hit. It is also passed to the
            matcher as ``score_cutoff``. Defaults to 0.6.

    Returns:
        numpy.ndarray: sorted, de-duplicated int64 array of the match-table
        index labels of all hits. Presumably these are positions within
        ``series``, since callers use them with ``.iloc`` - verify against
        PolyFuzz.
    """
    if queries is None:
        queries = [['covid-19'], ['sars-cov-2'], ['coronavirus']]

    candidates = series.to_list()
    # Seed with an empty int64 array so the result keeps an integer dtype
    # even when there are no queries or no hits.
    hits = [np.array([], dtype='int64')]

    for query in queries:
        # A fresh matcher and model per query keeps each run independent.
        matcher = RapidFuzz(n_jobs=1, score_cutoff=threshold, scorer=fuzz.token_set_ratio)
        model = PolyFuzz(matcher)
        model.match(candidates, query)
        matches = model.get_matches()
        hits.append(matches.loc[matches['Similarity'] >= threshold].index.to_numpy())

    return np.unique(np.concatenate(hits))

def get_indexes():
    """Collect the indexes of all sampled rows that mention a query term.

    Runs ``extract_matches`` on every column returned by ``select_series()``
    and merges the per-column results. The elapsed wall-clock time is printed
    before returning.

    Returns:
        numpy.ndarray: sorted, de-duplicated int64 array of matching indexes.
    """
    started_at = time.time()

    causes_df = select_series()
    # Start from an empty int64 array so a frame without columns still
    # yields an (empty) integer result.
    per_column = [np.array([], dtype='int64')]
    per_column.extend(extract_matches(causes_df[column]) for column in causes_df)
    indexes = np.unique(np.concatenate(per_column))

    print ('Transformation took {0} seconds.'.format(time.time() - started_at))

    return indexes
    
def transform_data(df, indexes):
    """Keep the selected rows of ``df`` and save them as the processed dataset.

    The output path ``../data/processed/ts_data.csv`` is resolved relative to
    the current working directory and assembled with ``os.path.join``. The
    previous hard-coded backslashes only formed a real path on Windows; on
    other systems they produced a single file whose name contained the
    backslashes.

    Args:
        df: pandas DataFrame to subset.
        indexes: integer positions of the rows to keep (used with ``.iloc``).

    Returns:
        None. The subset is written to disk as CSV without the index column.
        The target directory must already exist.
    """
    out_path = os.path.abspath(os.path.join('..', 'data', 'processed', 'ts_data.csv'))
    df.iloc[indexes].to_csv(out_path, index=False)
    
def main():
    """Run the pipeline: sample the data, find matching rows, save them.

    ``get_indexes()`` draws its own sample internally. Because
    ``sample_data()`` uses a fixed random state, that sample is identical to
    the one taken here, so the returned positions line up with ``df``.
    """
    # Print the first half of the status line, do the sampling, then finish
    # the line. The text written to stdout is unchanged, but "done." now
    # appears only after the sampling has actually happened.
    print("\nSampling data...", end="", flush=True)
    df = sample_data()
    print("done.")
    print("Transforming data...")
    indexes = get_indexes()
    transform_data(df, indexes)
    print("Done.\n")

if __name__ == "__main__":
    # Entry point: run the whole pipeline when this code is executed directly
    # rather than imported.
    main()


Sampling data...done.
Transforming data...
Transformation took 8.525687217712402 seconds.
Done.



In [7]:
# Load the processed output written by transform_data(): the rows of the 1%
# sample whose cause fields matched one of the query terms.
# The path is built with os.path.join so it works on POSIX as well as Windows.
df = pd.read_csv(os.path.join('..', 'data', 'processed', 'ts_data.csv'))
df.head()

Unnamed: 0,id,tipo_seguro,sexo,edad,tiempo_edad,cod#_ubigeo_domicilio,pais_domicilio,departamento_domicilio,provincia_domicilio,distrito_domicilio,fecha,tipo_lugar,debido_a_(causa_a),debido_a_(causa_b),debido_a_(causa_c),debido_a_(causa_d),debido_a_(causa_e),debido_a_(causa_f)
0,286265,sanidad pnp,masculino,67.0,anos,92-33-14-01-33-000,peru,lima,lima,jesus maria,2021-05-08,eess,insuficiencia respiratoria,virus sars cov2 covid 19,,,,
1,226735,sis,masculino,73.0,anos,92-33-02-13-01-000,peru,ancash,santa,chimbote,2020-07-07,eess,insuficiencia respiratoria aguda,neumonia atipica,covid19,,,
2,137308,usuario,masculino,33.0,anos,92-33-12-01-07-000,peru,la libertad,trujillo,victor larco herrera,2020-06-27,eess,insuficiencia respiratoria,neumonia,covid sospechoso,,,
3,239016,sanidad ep,masculino,54.0,anos,92-33-15-01-01-000,peru,loreto,maynas,iquitos,2021-04-02,eess,insuficiencia respiratoria aguda en ventilacio...,neumonia viral atipica por sars cov2,coronavirus,,obesidad,
4,373213,sis,masculino,65.0,anos,92-33-13-01-01-000,peru,lambayeque,chiclayo,chiclayo,2020-05-10,domicilio,covid 19 virus no identificado,,,,,
