# Carregar pacotes

In [1]:
pip install PySRAG==0.1.6

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import numpy as np
import pandas as pd
from pysrag.data import SRAG
from pysrag.model import GBMTrainer
from joblib import dump, load

# Treinando com mais de 1 arquivo (2 anos ou mais)

In [3]:
# Local directory that is listed below to discover downloaded files.
path = './../bases'

# Feature columns handed to SRAG.generate_training_data.
cols_X = [
    'REGIAO_LATITUDE', 'REGIAO_LONGITUDE',
    'UF_LATITUDE', 'UF_LONGITUDE',
    'LATITUDE', 'LONGITUDE',
    'POPULACAO', 'IDADE_ANO',
    'ANO_SEM_SIN_PRI',
]

# Target columns handed to SRAG.generate_training_data (one per virus flag).
col_y = [
    'POS_SARS2',
    'POS_FLUA', 'POS_FLUB',
    'POS_VSR',
    'POS_PARA1', 'POS_PARA2', 'POS_PARA3', 'POS_PARA4',
    'POS_ADENO', 'POS_METAP', 'POS_BOCA', 'POS_RINO',
    'POS_OUTROS',
]

# Last argument of generate_training_data; left empty here.
# NOTE(review): exact meaning is defined by PySRAG - confirm in its docs.
demais_virus = ''

# Every entry found in the local data directory.
all_files = os.listdir(path)

from datetime import datetime

def get_latest_data(all_files,string_year):
  """Return the name of the most recent dated CSV export for one prefix.

  File names are expected to look like ``<string_year>-DD-MM-YYYY.csv``
  (for example ``INFLUD22-03-04-2023.csv``). Entries that do not start with
  ``<string_year>-``, do not end in ``.csv``, or whose middle part is not a
  valid ``DD-MM-YYYY`` date are ignored instead of aborting the search.

  Parameters
  ----------
  all_files : iterable of str
      Candidate file names (e.g. the result of ``os.listdir``).
  string_year : str
      File-name prefix to look for, such as ``'INFLUD22'``.

  Returns
  -------
  str
      The matching file name carrying the latest date.

  Raises
  ------
  ValueError
      If no file name matches the expected pattern.
  """
  format_string = '%d-%m-%Y'
  prefix = f'{string_year}-'
  suffix = '.csv'

  dated_files = []
  for file_name in all_files:
    # startswith/endswith work for prefixes of any length, unlike a fixed slice.
    if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
      continue
    date_part = file_name[len(prefix):-len(suffix)]
    try:
      file_date = datetime.strptime(date_part, format_string)
    except ValueError:
      # Prefixed file that is not a dated export (notes, backups, ...): skip it.
      continue
    dated_files.append((file_date, file_name))

  if not dated_files:
    raise ValueError(
      f"No file named '{prefix}DD-MM-YYYY{suffix}' was found among the given files"
    )

  # Compare by parsed date only, so the first file wins on a tie.
  return max(dated_files, key=lambda item: item[0])[1]

# Resolve, for each yearly prefix, the newest file present in the local listing.
influd22, influd23, influd24 = (
    get_latest_data(all_files, prefix)
    for prefix in ('INFLUD22', 'INFLUD23', 'INFLUD24')
)

print(influd22, influd23, influd24)

# Carregando os dados de treino com `old_filter=True`

In [4]:
%%time

#list_filepath = [os.path.join(path,i) for i in [influd22,influd23,influd24]]
list_filepath = ['https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2022/INFLUD22-03-04-2023.csv' 
,'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2023/INFLUD23-15-07-2024.csv'
,'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2024/INFLUD24-15-07-2024.csv']

list_X = []
list_y = []
list_weeks = []
list_train = []
list_training_weeks = []

for filepath in list_filepath:
  print(filepath)
  srag = SRAG(filepath,old_filter=True)
  X_aux, y_aux = srag.generate_training_data(None, 'multiclass', cols_X, col_y, demais_virus)
  list_X.append(X_aux)
  list_y.append(y_aux)
  list_training_weeks.append(srag.generate_training_weeks())

  weeks = np.unique(X_aux['ANO_SEM_SIN_PRI'])
  train = srag.get_start_day_of_week(0)
  list_train.append(train)
  list_weeks.append(weeks)

X = pd.concat(list_X).reset_index(drop=True)
y = pd.concat(list_y).reset_index(drop=True)
df_training_weeks = pd.concat(list_training_weeks).reset_index(drop=True)

https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2022/INFLUD22-03-04-2023.csv
https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2023/INFLUD23-15-07-2024.csv
https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2024/INFLUD24-15-07-2024.csv
CPU times: total: 13 s
Wall time: 46.7 s


In [5]:
%time
trainer = GBMTrainer(objective='multiclass', eval_metric='multi_logloss')
trainer.fit(X, y)

CPU times: total: 0 ns
Wall time: 0 ns


In [6]:
# Bundle the fitted trainer with the metadata collected while loading the data.
model = dict(
    filename=list_filepath,
    weeks=list_weeks,
    train=list_train,
    virus=trainer.model.classes_,
    df_training_weeks=df_training_weeks,
    model=trainer,
    train_size=len(y),
    best_boost_iteration=trainer.model.best_iteration_,
)

In [7]:
# Show the assembled bundle (file list, weeks, classes, trainer, ...) for inspection.
model

{'filename': ['https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2022/INFLUD22-03-04-2023.csv',
  'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2023/INFLUD23-15-07-2024.csv',
  'https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2024/INFLUD24-15-07-2024.csv'],
 'weeks': [array([202201, 202202, 202203, 202204, 202205, 202206, 202207, 202208,
         202209, 202210, 202211, 202212, 202213, 202214, 202215, 202216,
         202217, 202218, 202219, 202220, 202221, 202222, 202223, 202224,
         202225, 202226, 202227, 202228, 202229, 202230, 202231, 202232,
         202233, 202234, 202235, 202236, 202237, 202238, 202239, 202240,
         202241, 202242, 202243, 202244, 202245, 202246, 202247, 202248,
         202249, 202250, 202251, 202252]),
  array([202301, 202302, 202303, 202304, 202305, 202306, 202307, 202308,
         202309, 202310, 202311, 202312, 202313, 202314, 202315, 202316,
         202317, 202318, 202319, 202320, 202321, 202322, 202323, 202324,
  

In [8]:
# Persist the whole bundle, including the fitted trainer, to ./dict_model using joblib.
dump(model,'./dict_model')

['./dict_model']

In [9]:
# Drop the most recent weeks (lags 0 and 1) from the training-weeks table.
# `srag` is the object left over from the last file loaded in the data loop.
excl_weeks = [
    week_info['year']*100 + week_info['week']
    for week_info in map(srag.get_start_day_of_week, [0, 1])
]

# Boolean mask of the rows whose year-week code is in the exclusion list.
ind_excl_weeks = df_training_weeks['ANO_SEM_SIN_PRI'].isin(excl_weeks)

# Keep the remaining rows, and only those with a positive SEM_SIN_PRI.
df_training_weeks_app = (
    df_training_weeks
    .loc[~ind_excl_weeks]
    .query('SEM_SIN_PRI > 0')
)

df_training_weeks_app.to_csv('./df_semanas.csv', index=False)

In [10]:
# Export the table returned by SRAG.load_common_data() as a CSV without the index.
# NOTE(review): the output name suggests municipality-level data - confirm in PySRAG.
common_data = SRAG.load_common_data()
common_data.to_csv('./df_municipios.csv', index=False)

In [11]:
# Fitted estimator wrapped by the trainer stored in the bundle.
fitted_model = model['model'].model

# Feature names seen by the fitted estimator, one per row.
feature_name = fitted_model.feature_name_
feature_series = pd.Series(feature_name, name='feature_name')
feature_series.to_csv('./feature_name.csv', index=False)

# Class labels of the fitted estimator, one per row.
classes = model['virus']
class_series = pd.Series(classes, name='virus')
class_series.to_csv('./classes.csv', index=False)

# Save the underlying booster in its text format.
fitted_model.booster_.save_model('./booster.txt')

<lightgbm.basic.Booster at 0x21a00d17d30>