In [34]:
import os
import logging
import time
import psycopg2

import pandas as pd
import numpy as np

# Configure root logging at INFO level so log records show up in the cell output.
logging.basicConfig(level=logging.INFO)
# Module-level logger. NOTE(review): connect_postgres() below logs through the
# root `logging` functions instead of this logger -- confirm which is intended.
logger = logging.getLogger(__name__)


def connect_postgres(retries=5, delay=3, port=5432):
    """Open a psycopg2 connection to the Postgres database named in the environment.

    Connection settings are read from the environment variables
    ``POSTGRES_DB``, ``POSTGRES_USER``, ``POSTGRES_PASSWORD`` and
    ``POSTGRES_SERVER``.

    Args:
        retries: Maximum number of connection attempts (must be >= 1).
        delay: Seconds to wait between two consecutive attempts.
        port: TCP port of the Postgres server.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        KeyError: If one of the required environment variables is missing.
        ValueError: If ``retries`` is smaller than 1.
        Exception: The error raised by the last failed attempt, once every
            attempt has been used up.
    """
    if retries < 1:
        # Without at least one attempt there would be no error to re-raise.
        raise ValueError("retries must be at least 1")

    database = os.environ['POSTGRES_DB']
    user = os.environ['POSTGRES_USER']
    password = os.environ['POSTGRES_PASSWORD']
    host = os.environ['POSTGRES_SERVER']

    last_error = None

    for attempt in range(1, retries + 1):
        try:
            conn = psycopg2.connect(
                database=database, user=user, password=password, host=host, port=port)
        except Exception as e:
            # Keep a reference: `e` is unbound when the except clause ends.
            last_error = e
            if attempt < retries:
                logging.warning(
                    "Error connecting to postgres, will retry in %s sec: %s", delay, e)
                time.sleep(delay)
            else:
                # Final attempt: no retry follows, so do not sleep before failing.
                logging.warning("Error connecting to postgres: %s", e)
        else:
            logging.info("Connected...")
            logging.info("Everything goes well from Postgres, you're a fu*** pro...")
            return conn

    logging.error("Unable to connect to  %s DB", database)
    raise last_error


In [13]:
# Open the database connection. Despite the name, `engine` holds the raw
# psycopg2 connection returned by connect_postgres(), not a SQLAlchemy engine.
engine = connect_postgres()

INFO:root:Connected...
INFO:root:Everything goes well from Postgres, you're a fu*** pro...


In [46]:
# Load the per-student attributes together with the number of matching
# transactions. The query keeps only rows with trans_location_code_id = 5 and
# trans_type_code = 'ISS', and groups by every student attribute so `count`
# is the number of such transactions per group.
# `age` is the year taken from the first four digits of `code` minus the
# birth year (presumably the enrollment year -- TODO confirm).
df = pd.read_sql(
    sql="""
    SELECT code, estrato, birth_date,
    DATE_PART('year', to_date(SUBSTRING(code::text, 1, 4)::text, 'YYYY')) - DATE_PART('year', student.birth_date::date) as age,
    sexo, m_grado, m_activo, m_tesis, per_cancelados, per_matriculados, bajos_rendimientos, trans_type_code, count(trans_type_code)
    FROM public.copytransaction, public.student, public.transactiontype, public.date
    WHERE trans_borrower_code = student.id
        AND trans_type_id = transactiontype.id
        AND trans_location_code_id = 5
        AND trans_type_code = 'ISS'
        AND trans_date_id = date.id
    GROUP BY code, estrato, birth_date, sexo, m_grado, m_activo, m_tesis, per_cancelados, age, per_matriculados, bajos_rendimientos, trans_type_code    
""",
    con=engine,
)

In [47]:
# Preview the query result; pandas abbreviates the display to the first and last rows.
df

Unnamed: 0,code,estrato,birth_date,age,sexo,m_grado,m_activo,m_tesis,per_cancelados,per_matriculados,bajos_rendimientos,trans_type_code,count
0,198224610,2,1957-01-01,25.0,M,False,True,False,0,10,False,ISS,30
1,199000484,2,1970-01-01,20.0,F,True,False,False,0,12,False,ISS,31
2,199000896,2,1947-01-01,43.0,M,False,False,False,3,28,False,ISS,227
3,199001027,2,1971-06-25,19.0,F,True,False,False,0,17,False,ISS,34
4,199001038,2,1969-02-03,21.0,F,True,False,False,1,20,False,ISS,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56407,201880005,2,1999-10-20,19.0,M,False,True,False,0,1,False,ISS,4
56408,201880012,1,1999-07-23,19.0,M,False,True,False,0,1,False,ISS,1
56409,201880022,1,1999-08-27,19.0,F,False,True,False,0,1,False,ISS,5
56410,201880023,1,2000-01-02,18.0,M,False,True,False,0,1,False,ISS,1


In [48]:
# Summarize column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56412 entries, 0 to 56411
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   code                56412 non-null  int64  
 1   estrato             56412 non-null  int64  
 2   birth_date          56412 non-null  object 
 3   age                 56412 non-null  float64
 4   sexo                56412 non-null  object 
 5   m_grado             56412 non-null  bool   
 6   m_activo            56412 non-null  bool   
 7   m_tesis             56412 non-null  bool   
 8   per_cancelados      56412 non-null  int64  
 9   per_matriculados    56412 non-null  int64  
 10  bajos_rendimientos  56412 non-null  bool   
 11  trans_type_code     56412 non-null  object 
 12  count               56412 non-null  int64  
dtypes: bool(4), float64(1), int64(5), object(3)
memory usage: 4.1+ MB


In [50]:
# Convert the four boolean flags and the float `age` column to integers,
# one column at a time, assigning each result back onto the same frame.
for int_column in ("bajos_rendimientos", "m_grado", "m_activo", "m_tesis", "age"):
    df[int_column] = df[int_column].astype(int)

# df = df.drop(columns=['trans_type_code'])

In [51]:
# Data exploration: count the missing values in every column.
df.isna().sum()

# To look at the affected rows of a single column:
# df[df['bajos_rendimientos'].isnull()]

code                  0
estrato               0
birth_date            0
age                   0
sexo                  0
m_grado               0
m_activo              0
m_tesis               0
per_cancelados        0
per_matriculados      0
bajos_rendimientos    0
trans_type_code       0
count                 0
dtype: int64

In [59]:
# Remove, in place, every row whose age is below 14 or above 80
# (rows with age exactly 14 or 80 are kept).
age_out_of_range = (df['age'] < 14) | (df['age'] > 80)
df.drop(index=df[age_out_of_range].index, inplace=True)

In [60]:
# Report how many distinct values each column holds; the values themselves
# are listed only for columns with fewer than 100 distinct entries.
for feature in df.columns:
    distinct = np.unique(df[feature])
    n_distinct = len(distinct)
    if n_distinct >= 100:
        print('The number of values for feature {} :{}'.format(feature, n_distinct))
        continue
    print('The number of values for feature {} :{} -- {}'.format(feature, n_distinct, distinct))

The number of values for feature code :56290
The number of values for feature estrato :8 -- [ 0  1  2  3  4  5  6 11]
The number of values for feature birth_date :11647
The number of values for feature age :54 -- [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
 62 63 64 67 69 73]
The number of values for feature sexo :2 -- ['F' 'M']
The number of values for feature m_grado :2 -- [0 1]
The number of values for feature m_activo :2 -- [0 1]
The number of values for feature m_tesis :2 -- [0 1]
The number of values for feature per_cancelados :9 -- [0 1 2 3 4 5 6 7 8]
The number of values for feature per_matriculados :37 -- [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 45]
The number of values for feature bajos_rendimientos :2 -- [0 1]
The number of values for feature trans_type_code :1 -- ['ISS']
The number of values for feature co