# Faculdades

> See `faculdades_clean.ipynb` for the current state of data cleaning for Faculdades.

Para criar uma versão `html` sem o código-fonte, executar, na directoria "notebooks":

    jupyter nbconvert faculdades.ipynb --to html --no-input

## Setup

In [1]:
# timelink helpers used in this notebook to list the available databases and
# to build the connection string / engine for the chosen one.
from timelink.mhk.utilities import get_engine, get_dbnames, get_connection_string
# NOTE(review): `engine` imported here is the sqlalchemy.engine module; the
# same name is rebound by `engine = get_engine(db)` in the
# "Choose the database" cell, so the module is not reachable after that.
from sqlalchemy import engine, text, MetaData
# Print the database names returned by get_dbnames() so one can be chosen below.
print('Available databases:')
print(get_dbnames())
# Load the `sql` IPython extension; the %sql magic is used in the connect cell.
%load_ext sql
# Switch off SqlMagic's `displaycon` option
# (presumably hides the connection string in %sql output — confirm).
%config SqlMagic.displaycon=False

Available databases:
['alunos', 'angelica', 'api_tests', 'china', 'china_coimbra', 'dyncoopnet', 'eiras', 'familias', 'glopes', 'ilhavo', 'ilhavo2', 'ilhavo_contrib', 'ilhavo_editor', 'ilhavo_server', 'ilhavo_testes', 'lousa', 'lousa3', 'lousa4', 'mhk', 'notaveis', 'obidos', 'oguest', 'rhv', 'santaclara', 'santajusta', 'sister', 'soure', 'soure_edit', 'soure_editor', 'soure_mfg', 'soure_public', 'soure_server', 'testes', 'toliveira', 'toliveira_reviewed', 'ucprosop']


## Connect to database

### Choose the database

In [2]:
# Database to analyse; 'ucprosop' is one of the names printed by get_dbnames().
db = 'ucprosop'
# Connection string for that database; reused by %sql here and by
# create_engine() in the "Stats for faculdade" cell.
connection_string = get_connection_string(db)
# Pass the connection string to the %sql magic ($name expands the Python variable).
%sql $connection_string
# Engine for the same database, used for table reflection and the ORM session.
# This rebinds `engine`, which the setup cell imported from sqlalchemy.
engine = get_engine(db)

## Map MHK database

In [3]:
from sqlalchemy import MetaData,Table, Column, String, ForeignKey
from sqlalchemy.orm import declarative_base, relationship, sessionmaker
# Metadata collection that receives the Table objects reflected below.
mhk = MetaData()
# Declarative base for ORM classes (none are declared in this cell).
Base = declarative_base()
# Session factory bound to the notebook's engine, and one session made from it.
Session = sessionmaker(bind=engine)
session = Session()
# Reflect the four tables from the live database, in this order, and bind each
# one to its own module-level name.
entities_table, attr_table, nattr_table, person_table = [
    Table(table_name, mhk, autoload_with=engine)
    for table_name in ('entities', 'attributes', 'nattributes', 'persons')
]

## Stats for faculdade

In [4]:
import pandas
from sqlalchemy import create_engine

# Author's note: pandas 1.3.2 read_sql does not work with a 2.0-style
# SQLAlchemy engine, so a separate engine is created with future=False
# just for this query.
legacy_engine = create_engine(connection_string, future=False)

# Build the per-student faculdade frame in one chain:
#   1. read the distinct (entity, faculdade, year) rows of dated 'faculdade'
#      attributes, where year is the first four characters of the_date;
#   2. keep only the listed faculdade labels (including '?');
#   3. drop repeated (entity, year, faculdade) rows, keeping the first.
# NOTE(review): step 3 uses the same three columns the query already selects
# with DISTINCT, so it looks like a no-op. The message that used to be
# commented out here spoke of removing multiple values per student while
# keeping the first — confirm which subset was intended.
fac_df = (
    pandas.read_sql(
        "select distinct entity,\
             the_value as faculdade,\
             SUBSTRING(the_date,1,4) as year \
             from attributes \
             where the_type = 'faculdade' \
                   and the_date <> '0000-00-00' \
        order by year",
        con=legacy_engine,
    )
    .loc[lambda rows: rows['faculdade'].isin(
        ['Artes', 'Teologia', 'Cânones', 'Leis', 'Medicina', 'Direito', 'Matemática', 'Filosofia', '?']
    )]
    .drop_duplicates(subset=['entity', 'year', 'faculdade'], keep='first')
)
print("Exemplo do resultado:")
# Show the first ten rows as a sample.
fac_df.head(10)

Exemplo do resultado:


Unnamed: 0,entity,faculdade,year
0,242228,Artes,1256
1,196431,Artes,1503
2,141858,Cânones,1536
3,172287,Artes,1536
4,128342,Cânones,1537
5,129439,Leis,1537
6,130313,Cânones,1537
7,131972,Cânones,1537
8,132974,Leis,1537
9,135969,Leis,1537


In [5]:
import numpy as np
# Remove the row limit so the whole year-by-faculdade table is displayed.
pandas.set_option('display.max_rows', None)

# Number of rows per year (index) and faculdade (columns).
# 'count' tallies the non-null `entity` values in each group. The previous
# np.count_nonzero aggregator tested truthiness instead, so a falsy id
# (0 or an empty string) would have been silently left out of the totals.
# Year/faculdade combinations with no rows are filled with 0.
table = pandas.pivot_table(
    fac_df,
    index=['year'],
    columns=['faculdade'],
    aggfunc='count',
    fill_value=0,
)
table

Unnamed: 0_level_0,entity,entity,entity,entity,entity,entity,entity,entity,entity
faculdade,?,Artes,Cânones,Direito,Filosofia,Leis,Matemática,Medicina,Teologia
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1256,0,1,0,0,0,0,0,0,0
1503,0,1,0,0,0,0,0,0,0
1536,0,1,1,0,0,0,0,0,0
1537,1,2,69,1,0,78,0,7,2
1538,0,0,66,0,0,87,0,2,0
1539,0,1,17,0,0,17,0,0,0
1540,0,8,106,0,12,148,0,14,17
1541,0,13,7,1,0,20,0,4,3
1542,0,10,3,0,1,5,0,3,3
1543,0,9,5,0,0,9,0,1,2
