In [None]:
import os
import numpy as np
import json
import re
import pandas as pd

In [None]:
# Folder name of the institution whose dataset is processed; it is shared by the
# raw (input) and proc (output) trees, so it is spelled out only once.
INSTITUTION_SLUG = 'universidade_federal_do_rio_grande_do_norte_-_ufrn'

# Directory the raw CSV exports are read from.
raw_resources_directory = '../data/raw/' + INSTITUTION_SLUG
# Directory the generated JSON files are written to.
proc_resources_directory = '../data/proc/' + INSTITUTION_SLUG

In [None]:
# Create the output root up front; exist_ok makes re-running this cell a no-op.
os.makedirs(
    proc_resources_directory,
    exist_ok=True,
)

In [None]:
# List the raw directory a single time so that names and paths are guaranteed to
# line up index by index, and sort because os.listdir returns entries in
# arbitrary order (keeps every downstream step reproducible between runs).
resources_filenames = sorted(os.listdir(raw_resources_directory))
resources_filepaths = [
    os.path.join(raw_resources_directory, resource_filename)
    for resource_filename in resources_filenames
]

In [None]:
def _csv_entries_matching(pattern, entries):
    """Return the entries that end in '.csv' and whose file name matches ``pattern``.

    The regular expression is searched in the last path component only, so the
    same rule applies whether ``entries`` holds bare names or full paths.
    """
    return [
        entry for entry in entries
        if entry.endswith('.csv') and re.search(pattern, os.path.basename(entry)) is not None
    ]


def resources_csv_paths_containing(pattern, filepaths=None):
    """List the CSV file paths whose file name matches the regex ``pattern``.

    Args:
        pattern: regular expression searched (``re.search``) in each file name.
        filepaths: paths to filter; defaults to the module-level
            ``resources_filepaths``.

    Returns:
        list of matching paths, in the order they appear in ``filepaths``.
    """
    if filepaths is None:
        filepaths = resources_filepaths
    return _csv_entries_matching(pattern, filepaths)


def resources_csv_names_containing(pattern, filenames=None):
    """List the CSV file names that match the regex ``pattern``.

    Args:
        pattern: regular expression searched (``re.search``) in each file name.
        filenames: names to filter; defaults to the module-level
            ``resources_filenames``.

    Returns:
        list of matching names, in the order they appear in ``filenames``.
    """
    if filenames is None:
        filenames = resources_filenames
    return _csv_entries_matching(pattern, filenames)

In [None]:
# Read every "componentes-curriculares" CSV export (';'-separated) and stack the
# resulting tables into a single frame.
componentes_frames = []
for componentes_csv_path in resources_csv_paths_containing('componentes-curriculares'):
    componentes_frames.append(pd.read_csv(componentes_csv_path, sep=';'))
componentes = pd.concat(componentes_frames)

In [None]:
# Write one <proc>/componentes/<codigo>/info.json per row of `componentes`,
# skipping components whose info.json already exists.
for _, c in componentes.iterrows():
    proc_component_directory = os.path.join(proc_resources_directory, 'componentes', c['codigo'])
    info_filepath = os.path.join(proc_component_directory, 'info.json')
    if os.path.exists(info_filepath):
        continue

    # Serialize before touching the disk: if this fails, no truncated info.json
    # is left behind for the exists-check above to skip on the next run.
    info_json = json.dumps(c.to_dict(), indent=4)

    os.makedirs(proc_component_directory, exist_ok=True)
    with open(info_filepath, 'w') as f:
        f.write(info_json)

In [None]:
def extract_resource_year_semester(r):
    """Return the digits of the year/semester token found in a resource file name.

    Only the last path component of ``r`` is inspected. The first run that starts
    with a digit and continues with digits, '-' or '.' is taken, and everything
    that is not a digit is removed from it, e.g. 'turmas-2017.1.csv' -> '20171'.

    Args:
        r: path (or bare name) of the resource file.

    Returns:
        str made only of digits.

    Raises:
        IndexError: if the file name contains no such run.
    """
    name = os.path.basename(r)
    tokens = re.findall(r'[0-9][0-9\-.]+', name)
    if not tokens:
        raise IndexError(f'no year/semester digits found in {name!r}')
    return ''.join(re.findall('[0-9]', tokens[0]))


def extract_resource_year(r):
    """Return the first four digits of the year/semester token as an int."""
    return int(extract_resource_year_semester(r)[:4])


def extract_resource_semester(r):
    """Return the last digit of the year/semester token as an int."""
    return int(extract_resource_year_semester(r)[-1])

In [None]:
# Catalogue of the "matricula" and "turma" CSV exports: one row per file with the
# year ('ano') and semester ('semestre') parsed from its name, ordered by year,
# semester and file name. Each pattern is filtered once and the file name is
# derived from the path itself, so the two columns can never get out of step.
df_files = pd.DataFrame(
    [
        {
            'filepath' : resource_filepath,
            'filename' : os.path.basename(resource_filepath),
            'ano'      : extract_resource_year(resource_filepath),
            'semestre' : extract_resource_semester(resource_filepath),
        }
        for pattern in ['matricula','turma']
        for resource_filepath in resources_csv_paths_containing(pattern)
    ],
    # Explicit columns keep the frame well-formed even when no file matches.
    columns=['filepath','filename','ano','semestre']
).sort_values(['ano','semestre','filename'])

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values into plain Python equivalents.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert ``obj`` when the standard encoder cannot serialize it.

        NumPy booleans, integers, floats and arrays become ``bool``, ``int``,
        ``float`` and (nested) ``list``. Anything else is delegated to
        ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        # np.bool_ is not an np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [None]:
def _build_turma_record(df_turma, df_matricula_turma):
    """Collapse the rows of one class and its enrolments into a JSON-ready dict.

    Args:
        df_turma: rows of the "turmas" table for a single id_turma (at least one
            row); it must contain the columns 'siape',
            'matricula_docente_externo' and 'ch_dedicada_periodo'.
        df_matricula_turma: rows of the "matriculas" table for the same id_turma.

    Returns:
        dict holding the first row's values for every other "turmas" column,
        then the three columns above as lists with one item per row of
        ``df_turma``, then 'matriculas': a dict of parallel lists taken from
        ``df_matricula_turma``.
    """
    per_row_columns = ['siape', 'matricula_docente_externo', 'ch_dedicada_periodo']

    # astype(object) keeps each value in its own type (ints stay ints) when the
    # first row is pulled out as a Series.
    record = df_turma.drop(columns=per_row_columns).astype(object).iloc[0].to_dict()

    for column in per_row_columns:
        record[column] = df_turma[column].tolist()

    record['matriculas'] = {
        column: df_matricula_turma[column].tolist()
        for column in ['discente', 'descricao', 'media_final', 'numero_total_faltas']
    }
    return record


# For every (year, semester) write one JSON file per class at
# <proc>/componentes/<codigo>/<ano>_<semestre>_<id_turma>.json.
for (ano, semestre), df_file in df_files.groupby(['ano','semestre']):
    # Semesters with fewer than two catalogued files are skipped.
    if df_file.shape[0] < 2:
        continue

    # NOTE(review): the first file of the group is read as the enrolments table
    # and the second as the classes table. This relies on df_files being sorted
    # by file name and on the "matricula" file sorting before the "turma" one --
    # TODO confirm against the actual file names.
    df_ano_matriculas = pd.read_csv(
        df_file.iloc[0]['filepath'],
        sep=';',
        decimal=',',
        usecols=[
            'id_turma',
            'discente',
            'media_final',
            'numero_total_faltas',
            'descricao'
        ]
    ).drop_duplicates()

    df_ano_turmas = pd.read_csv(
        df_file.iloc[1]['filepath'],
        sep=';',
        decimal=','
    )

    # Grouping by the column name (not a one-item list) yields a scalar key on
    # every pandas version.
    for id_turma, df_matricula_turma in df_ano_matriculas.groupby('id_turma'):

        # Enrolments whose class is absent from the classes table are skipped.
        df_turma = df_ano_turmas[df_ano_turmas['id_turma'] == id_turma]
        if df_turma.shape[0] == 0:
            continue

        # Classes whose component is absent from `componentes` are skipped.
        id_componente = df_turma.iloc[0]['id_componente_curricular']
        df_componente = componentes[componentes['id_componente'] == id_componente]
        if df_componente.shape[0] == 0:
            continue

        codigo_componente = df_componente.iloc[0]['codigo']
        component_directory = os.path.join(proc_resources_directory,'componentes',codigo_componente)

        filename = f'{ano}_{semestre}_{id_turma}.json'
        filepath = os.path.join(component_directory,filename)

        # Already exported on a previous run.
        if os.path.exists(filepath):
            continue
        print(filepath)

        # Serialize before opening the file: a serialization error then cannot
        # leave a truncated file that the exists-check would skip next time.
        turma_json = json.dumps(
            _build_turma_record(df_turma, df_matricula_turma),
            indent=4,
            cls=NpEncoder
        )

        os.makedirs(component_directory, exist_ok=True)
        with open(filepath,'w') as f:
            f.write(turma_json)
    
