### Lidando com dados no Pandas

In [1]:
import numpy as np
import pandas as pd

# Load the insurance dataset from a CSV file (path is relative, one level up in data/)
df = pd.read_csv("../data/insurance.csv")

In [2]:
# Show the first 5 rows
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Show the last 5 rows
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(1338, 7)

In [5]:
# Per-column summary: non-null counts and dtypes, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Report the dataset size; df.shape unpacks into (rows, columns)
n_linhas, n_colunas = df.shape
print(f"Temos {n_linhas} linhas e {n_colunas} colunas.")

Temos 1338 linhas e 7 colunas.


In [7]:
# Column access using square brackets
print(df['age'])

# Column access using dot notation (prints the same Series as above)
print(df.age)

0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int64
0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int64


In [8]:
# Show rows 3 through 7 of the 'age' column (the slice end, 8, is excluded)
df['age'][3:8]

3    33
4    32
5    31
6    46
7    37
Name: age, dtype: int64

In [9]:
# Show the 'age' value of row 4
df['age'][4]

np.int64(32)

In [10]:
# Select several columns at once by passing a list of column names
df[['age', 'sex', 'bmi']]

Unnamed: 0,age,sex,bmi
0,19,female,27.900
1,18,male,33.770
2,28,male,33.000
3,33,male,22.705
4,32,male,28.880
...,...,...,...
1333,50,male,30.970
1334,18,female,31.920
1335,18,female,36.850
1336,21,female,25.800


In [11]:
# Select the first row by its index label (0)
df.loc[0]

age                19
sex            female
bmi              27.9
children            0
smoker            yes
region      southwest
charges     16884.924
Name: 0, dtype: object

In [12]:
# Select the rows labelled 0 and 2 (a list picks those individual labels, not the range 0..2)
df.loc[[0, 2]]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
2,28,male,33.0,3,no,southeast,4449.462


In [13]:
# Select the cell at row 0 of the 'bmi' column
df.loc[0, 'bmi']

np.float64(27.9)

In [14]:
# Select the people older than 63
# NOTE(review): the original comment said "older than 40", but the filter uses 63 — confirm the intended threshold
df.loc[df['age'] > 63]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
62,64,male,24.7,1,no,northwest,30166.61817
94,64,female,31.3,2,yes,southwest,47291.055
199,64,female,39.33,0,no,northeast,14901.5167
328,64,female,33.8,1,yes,southwest,47928.03
335,64,male,34.5,0,no,southwest,13822.803
378,64,female,30.115,3,no,northwest,16455.70785
398,64,male,25.6,2,no,southwest,14988.432
402,64,female,32.965,0,no,northwest,14692.66935
418,64,male,39.16,1,no,southeast,14418.2804
420,64,male,33.88,0,yes,southeast,46889.2612


In [15]:
# Select men older than 30: build each condition separately,
# then combine them element-wise with & inside .loc
acima_de_30 = df['age'] > 30
sexo_masculino = df['sex'] == 'male'
df.loc[acima_de_30 & sexo_masculino]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
8,37,male,29.830,2,no,northeast,6406.41070
18,56,male,40.300,0,no,southwest,10602.38500
24,37,male,28.025,2,no,northwest,6203.90175
...,...,...,...,...,...,...,...
1324,31,male,25.935,1,no,northwest,4239.89265
1325,61,male,33.535,0,no,northeast,13143.33665
1327,51,male,30.030,1,no,southeast,9377.90470
1329,52,male,38.600,2,no,southwest,10325.20600


In [16]:
# Select the first row by its integer position (0)
df.iloc[0]

age                19
sex            female
bmi              27.9
children            0
smoker            yes
region      southwest
charges     16884.924
Name: 0, dtype: object

In [17]:
# Select the rows at positions 10 and 3, returned in that order
df.iloc[[10, 3]]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
10,25,male,26.22,0,no,northeast,2721.3208
3,33,male,22.705,0,no,northwest,21984.47061


In [18]:
# Print the value at row position 10, column position 4
print(df.iloc[10, 4])

no


### Manipulando dados

In [19]:
# Add a column holding the same constant value in every row
df['Ano_Analise'] = 2023

# Add a column whose value depends on a condition:
# charges above the threshold are flagged "Sim", every other row "Não".
# The threshold is named so the cut-off is explicit and easy to tune.
LIMITE_COBRANCA_ABUSIVA = 20000

df['Cobranca_Abusiva'] = np.where(
    df["charges"] > LIMITE_COBRANCA_ABUSIVA,
    "Sim",
    "Não"
)

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Ano_Analise,Cobranca_Abusiva
0,19,female,27.9,0,yes,southwest,16884.924,2023,Não
1,18,male,33.77,1,no,southeast,1725.5523,2023,Não
2,28,male,33.0,3,no,southeast,4449.462,2023,Não
3,33,male,22.705,0,no,northwest,21984.47061,2023,Sim
4,32,male,28.88,0,no,northwest,3866.8552,2023,Não


In [20]:
# drop() returns a new DataFrame without the column; the result is only displayed
# here and not assigned, so df itself still has 'Ano_Analise'
df.drop(columns="Ano_Analise")

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Cobranca_Abusiva
0,19,female,27.900,0,yes,southwest,16884.92400,Não
1,18,male,33.770,1,no,southeast,1725.55230,Não
2,28,male,33.000,3,no,southeast,4449.46200,Não
3,33,male,22.705,0,no,northwest,21984.47061,Sim
4,32,male,28.880,0,no,northwest,3866.85520,Não
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,Não
1334,18,female,31.920,0,no,northeast,2205.98080,Não
1335,18,female,36.850,0,no,southeast,1629.83350,Não
1336,21,female,25.800,0,no,southwest,2007.94500,Não


In [21]:
# Check the current columns of df
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Ano_Analise,Cobranca_Abusiva
0,19,female,27.9,0,yes,southwest,16884.924,2023,Não
1,18,male,33.77,1,no,southeast,1725.5523,2023,Não
2,28,male,33.0,3,no,southeast,4449.462,2023,Não
3,33,male,22.705,0,no,northwest,21984.47061,2023,Sim
4,32,male,28.88,0,no,northwest,3866.8552,2023,Não


In [22]:
# Remove the column for good by rebinding df to the frame returned by drop()
# (reassignment is preferred over inplace=True).
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and drop() simply does nothing instead of raising KeyError.
df = df.drop(columns="Ano_Analise", errors="ignore")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Cobranca_Abusiva
0,19,female,27.9,0,yes,southwest,16884.924,Não
1,18,male,33.77,1,no,southeast,1725.5523,Não
2,28,male,33.0,3,no,southeast,4449.462,Não
3,33,male,22.705,0,no,northwest,21984.47061,Sim
4,32,male,28.88,0,no,northwest,3866.8552,Não


In [23]:
# Replace "Sim" with 1 and "Não" with 0.
# Values missing from the mapping are kept as they are, so running this cell a
# second time leaves the already-converted 0/1 column untouched (passing the
# dict straight to .map() would turn those values into NaN on a re-run).
codigos_flag = {"Sim": 1, "Não": 0}
df["Cobranca_Abusiva"] = df["Cobranca_Abusiva"].map(
    lambda valor: codigos_flag.get(valor, valor)
)

# Create a column with the value 2023
df["Ano_Analise"] = 2023

# Update the value from 2023 to 2022 (assigning to an existing column overwrites it)
df["Ano_Analise"] = 2022

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Cobranca_Abusiva,Ano_Analise
0,19,female,27.9,0,yes,southwest,16884.924,0,2022
1,18,male,33.77,1,no,southeast,1725.5523,0,2022
2,28,male,33.0,3,no,southeast,4449.462,0,2022
3,33,male,22.705,0,no,northwest,21984.47061,1,2022
4,32,male,28.88,0,no,northwest,3866.8552,0,2022


### Analisando dados

In [24]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,bmi,children,charges,Cobranca_Abusiva,Ano_Analise
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265,0.204036,2022.0
std,14.04996,6.098187,1.205493,12110.011237,0.403146,0.0
min,18.0,15.96,0.0,1121.8739,0.0,2022.0
25%,27.0,26.29625,0.0,4740.28715,0.0,2022.0
50%,39.0,30.4,1.0,9382.033,0.0,2022.0
75%,51.0,34.69375,2.0,16639.912515,0.0,2022.0
max,64.0,53.13,5.0,63770.42801,1.0,2022.0


In [25]:
# Number of rows for each value of 'sex'
df['sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [26]:
# Share of rows for each value of 'sex' (proportions instead of counts)
df['sex'].value_counts(normalize=True)

sex
male      0.505232
female    0.494768
Name: proportion, dtype: float64

In [27]:
# Mean age for each sex
df.groupby("sex")["age"].mean()

sex
female    39.503021
male      38.917160
Name: age, dtype: float64

In [28]:
# Mean age and mean charges for smokers and non-smokers
df.groupby("smoker")[["age", "charges"]].mean()

Unnamed: 0_level_0,age,charges
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
no,39.385338,8434.268298
yes,38.514599,32050.231832


In [29]:
# Standard deviation of charges for each (smoker, sex) combination
df.groupby(["smoker", "sex"])["charges"].std()

smoker  sex   
no      female     6060.775970
        male       5908.108989
yes     female    11907.536381
        male      11202.670862
Name: charges, dtype: float64