# Exercicio 1

O objetivo deste exercício é utilizar outra base de dados para testar as amostragens e comparar os resultados

Faça o download e carregue a base de dados **credit_data.csv**, que possui informações sobre empréstimos (se o cliente pagará ou não pagará o empréstimo)

Teste cada uma das técnicas de amostragem, selecionando 1000 registros

Para a amostragem estratificada, utilize o atributo **c#default** para separar as categorias

No final, faça o comparativo da média utilizando os atributos **age, income e loan**

Na próxima aula você pode assistir a solução para esse exercício

Bom trabalho!

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
df = pd.read_csv('credit_data.csv')

In [3]:
df

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [4]:
def simple_random_sample(dataset,number_sample):
    return dataset.sample(n= number_sample,random_state = 1)

In [5]:
df_aleatorio_Simple = simple_random_sample(df,1000)
df_aleatorio_Simple

Unnamed: 0,i#clientid,income,age,loan,c#default
674,675,34158.633968,29.421142,2911.408067,0
1699,1700,25789.742025,45.316211,4442.331780,0
1282,1283,59589.064289,20.609764,4191.715856,0
1315,1316,49908.291867,29.550940,2903.036128,0
1210,1211,69132.462579,33.471182,7621.410219,0
...,...,...,...,...,...
103,104,57296.160823,25.708482,10601.082783,1
9,10,25075.872771,39.776378,1409.230371,0
1929,1930,27514.088473,36.278684,192.144611,0
543,544,55476.656980,52.089203,4733.505830,0


In [6]:
def systematic_sample(dataset,number_sample):
    interval = len(dataset) // number_sample
    random.seed(1)
    begin = random.randint(0,interval)
    index = np.arange(begin, len(dataset), step = interval)
    systematic_sample = dataset.iloc[index]
    return systematic_sample

In [7]:
df_amstra_estratificada = systematic_sample(df,1000)
df_amstra_estratificada

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
2,3,57317.170063,63.108049,8020.953296,0
4,5,66952.688845,18.584336,8770.099235,1
6,7,48430.359613,26.809132,5722.581981,0
8,9,40654.892537,55.496853,4755.825280,0
...,...,...,...,...,...
1990,1991,34237.575419,34.101654,2658.090632,0
1992,1993,30803.806165,23.250084,623.024153,0
1994,1995,24254.700791,37.751622,2225.284643,0
1996,1997,69516.127573,23.162104,3503.176156,0


In [8]:
def cluster_sample(dataset,number_sample):
    interval = len(dataset) / number_sample
    cluster = []
    id_cluster = 0
    count = 0
    for _ in dataset.iterrows():
        cluster.append(id_cluster)
        count += 1
        if count > interval:
            count = 0
            id_cluster += 1
    
    dataset['Cluster'] = cluster
    random.seed(1)
    selected_cluster = random.randint(0,number_sample)
    return dataset[dataset['Cluster'] == selected_cluster]

In [9]:
df_amostragem_grupos = cluster_sample(df,2)
df_amostragem_grupos

Unnamed: 0,i#clientid,income,age,loan,c#default,Cluster
0,1,66155.925095,59.017015,8106.532131,0,0
1,2,34415.153966,48.117153,6564.745018,0,0
2,3,57317.170063,63.108049,8020.953296,0,0
3,4,42709.534201,45.751972,6103.642260,0,0
4,5,66952.688845,18.584336,8770.099235,1,0
...,...,...,...,...,...,...
996,997,49104.768240,35.538517,9452.217947,0,0
997,998,65776.232413,39.798191,2805.863745,0,0
998,999,36192.149452,21.402403,7236.173930,1,0
999,1000,62165.861186,19.602543,4739.948954,0,0


In [10]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(test_size = 0.5)
for x,y in split.split(df,df['c#default']):
    df_x = df.iloc[x]
    df_amostragem_estratificada = df.iloc[y]

In [11]:
df_amostragem_estratificada

Unnamed: 0,i#clientid,income,age,loan,c#default,Cluster
1537,1538,65824.515657,40.621920,2643.106432,0,1
1827,1828,24112.499394,35.971338,3285.499948,0,1
689,690,57187.700893,59.471918,9390.672261,0,0
227,228,26090.725877,48.078520,4255.626392,0,0
938,939,62799.750611,35.361492,6752.586071,0,0
...,...,...,...,...,...,...
1650,1651,67151.318612,51.655099,3941.698673,0,1
434,435,23086.255409,24.849960,1256.401160,0,0
129,130,39441.444764,46.753896,1034.758838,0,0
1543,1544,44827.233772,56.298495,2639.916846,0,1


In [12]:
def amostragem_reservatorio(dataset, amostras):
    stream = []
    for i in range(len(dataset)):
        stream.append(i)
  
    i = 0
    tamanho = len(dataset)

    reservatorio = [0] * amostras
    for i in range(amostras):
        reservatorio[i] = stream[i]

    while (i < tamanho):
        j = random.randrange(i + 1)
        if (j < amostras): 
            reservatorio[j] = stream[i]
    i += 1

    return dataset.iloc[reservatorio]

In [14]:
df_amostragem_reservatorio = amostragem_reservatorio(df, 1000)
df_amostragem_reservatorio

# Testando

In [15]:
df['c#default'].mean()

0.1415

In [16]:
df_aleatorio_Simple['c#default'].mean()

0.143

In [17]:
df_amstra_estratificada['c#default'].mean()

0.148

In [18]:
df_amostragem_grupos['c#default'].mean()

0.14885114885114886

In [19]:
df_amostragem_estratificada['c#default'].mean()

0.142

In [26]:
df_amostragem_reservatorio['c#default'].mean()

In [21]:
df['age'].mean(), df['income'].mean(), df['loan'].mean()

(40.80755937840458, 45331.60001779331, 4444.369694688262)

In [22]:
df_aleatorio_Simple['age'].mean(), df_aleatorio_Simple['income'].mean(), df_aleatorio_Simple['loan'].mean()

(40.49552561124429, 45563.26865376901, 4449.4469004423645)

In [23]:
df_amstra_estratificada['age'].mean(), df_amstra_estratificada['income'].mean(), df_amstra_estratificada['loan'].mean()

(40.91117381141754, 45691.498750669496, 4506.787976426329)

In [24]:
df_amostragem_grupos['age'].mean(), df_amostragem_grupos['income'].mean(), df_amostragem_grupos['loan'].mean()

(41.0432231120503, 44846.749259861404, 4390.1614937442055)

In [25]:
df_amostragem_estratificada['age'].mean(), df_amostragem_estratificada['income'].mean(), df_amostragem_estratificada['loan'].mean()

(40.96864657884068, 45528.66014690375, 4516.67962852732)

In [None]:
df_amostragem_reservatorio['age'].mean(), df_amostragem_reservatorio['income'].mean(), df_amostragem_reservatorio['loan'].mean()