### Pivotamento de tabelas (pivot tables)

In [32]:
import pandas as pd
import numpy as np

In [33]:
stud = pd.read_csv("../data/students/data.csv", sep=';')
stud.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [34]:
# Qual a proporção de dropouts
stud["Target"].value_counts(1)

Target
Graduate    0.499322
Dropout     0.321203
Enrolled    0.179476
Name: proportion, dtype: float64

In [35]:
# Qual a média de inflação para cada tipo de target?
stud.groupby("Target")["Inflation rate"].mean()

Target
Dropout     1.283955
Enrolled    1.211713
Graduate    1.197918
Name: Inflation rate, dtype: float64

In [36]:
stud.pivot_table(index='Target', columns=['Marital status'], values='Inflation rate', aggfunc=np.mean)

  stud.pivot_table(index='Target', columns=['Marital status'], values='Inflation rate', aggfunc=np.mean)


Marital status,1,2,3,4,5,6
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dropout,1.282601,1.300559,1.4,1.269048,1.354545,0.875
Enrolled,1.220694,1.003846,2.55,1.59375,-0.333333,1.4
Graduate,1.193499,1.216216,-0.8,1.163636,2.181818,-0.3


In [37]:
stud.pivot_table(index='Target',
                columns=['Daytime/evening attendance\t', 'Marital status'],
                values = 'Previous qualification (grade)',
                aggfunc=np.mean)

  stud.pivot_table(index='Target',


Daytime/evening attendance,0,0,0,0,0,0,1,1,1,1,1,1
Marital status,1,2,3,4,5,6,1,2,3,4,5,6
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Dropout,128.471774,130.476471,,127.733333,120.0,122.5,131.537736,129.937838,135.0,130.753333,143.55,133.1
Enrolled,128.863415,133.296,,135.4,,133.1,131.172607,129.177778,154.0,137.4125,128.7,
Graduate,132.313333,131.133784,120.0,138.388235,120.0,133.1,134.495393,130.505405,,126.59375,132.1375,


### Formatação Condicional

In [38]:
df = pd.DataFrame({
    'A': [9, -7, 5],
    'B': [-1, 3, -4]
})

df

Unnamed: 0,A,B
0,9,-1
1,-7,3
2,5,-4


In [39]:
# Definindo uma função para aplicar a coloração
def color_positive_negative(val):
    color = 'green' if val > 0 else 'red'
    return 'color: %s' % color

# Aplicando a coloração ao DataFrame
styled_df = df.style.applymap(color_positive_negative)
display(styled_df)

  styled_df = df.style.applymap(color_positive_negative)


Unnamed: 0,A,B
0,9,-1
1,-7,3
2,5,-4


In [40]:
pivot_df = stud.pivot_table(index='Marital status',
                columns=['Target'],
                values = 'Previous qualification (grade)',
                aggfunc=np.mean)

pivot_df

  pivot_df = stud.pivot_table(index='Marital status',


Target,Dropout,Enrolled,Graduate
Marital status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,131.216639,131.041111,134.381687
2,130.142458,131.157692,130.819595
3,135.0,154.0,120.0
4,129.890476,136.40625,132.669697
5,141.409091,128.7,128.827273
6,127.8,133.1,133.1


In [41]:
def highlight_max_min(data):
    styles = data.copy()
    for col in data.columns:
        max_val = data[col].max()
        min_val = data[col].min()
        styles[col] = ['background-color: lightgreen' if v == max_val else 'background-color: yellow'
                       if v == min_val else '' for v in data[col]]
    return styles

styled_df = pivot_df.style.apply(highlight_max_min, axis=None)
display(styled_df)

Target,Dropout,Enrolled,Graduate
Marital status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,131.216639,131.041111,134.381687
2,130.142458,131.157692,130.819595
3,135.0,154.0,120.0
4,129.890476,136.40625,132.669697
5,141.409091,128.7,128.827273
6,127.8,133.1,133.1


### Função agg no pandas

In [42]:
stud.drop(columns='Target').agg("mean", axis=0)

Marital status                                       1.178571
Application mode                                    18.669078
Application order                                    1.727848
Course                                            8856.642631
Daytime/evening attendance\t                         0.890823
Previous qualification                               4.577758
Previous qualification (grade)                     132.613314
Nacionality                                          1.873192
Mother's qualification                              19.561935
Father's qualification                              22.275316
Mother's occupation                                 10.960895
Father's occupation                                 11.032324
Admission grade                                    126.978119
Displaced                                            0.548373
Educational special needs                            0.011528
Debtor                                               0.113698
Tuition 

In [43]:
stud.drop(columns='Target').agg(["sum", "mean", "min"])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
sum,5214.0,82592.0,7644.0,39181790.0,3941.0,20252.0,586681.3,8287.0,86542.0,98546.0,...,609.0,2397.0,27571.0,35672.0,19624.0,45258.430117,665.0,51168.6,5432.8,8.71
mean,1.178571,18.669078,1.727848,8856.643,0.890823,4.577758,132.613314,1.873192,19.561935,22.275316,...,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,11.566139,1.228029,0.001969
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06


In [44]:
def amplitude(series):
    return series.max() - series.min()

stud.drop(columns="Target").agg(['sum', 'mean', 'min', 'max', amplitude])

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
sum,5214.0,82592.0,7644.0,39181790.0,3941.0,20252.0,586681.3,8287.0,86542.0,98546.0,...,609.0,2397.0,27571.0,35672.0,19624.0,45258.430117,665.0,51168.6,5432.8,8.71
mean,1.178571,18.669078,1.727848,8856.643,0.890823,4.577758,132.613314,1.873192,19.561935,22.275316,...,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,11.566139,1.228029,0.001969
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
max,6.0,57.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,12.0,19.0,23.0,33.0,20.0,18.571429,12.0,16.2,3.7,3.51
amplitude,5.0,56.0,9.0,9958.0,1.0,42.0,95.0,108.0,43.0,43.0,...,12.0,19.0,23.0,33.0,20.0,18.571429,12.0,8.6,4.5,7.57


### Unindo diferentes datasets

In [45]:
# Criando o dataframe "compras"
 

compras = pd.DataFrame({
    'id_cliente': ['AA00', 'AA00', 'BB01', 'BB01', 'BB01', 'CC02', 'CC02', 'CC02', 'CC02', 'CC02'],
    'data_compra': ['2023-01-01', '2023-01-05', '2023-01-10', '2023-01-15', '2023-01-20', '2023-01-25', '2023-01-30', '2023-02-01', '2023-02-05', '2023-02-10'], 
    'valor_compra': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]

}) 

# Criando o dataframe "geolocalizacao" 
geolocalizacao = pd.DataFrame({
    'id_cliente': ['AA00', 'BB01', 'CC02', 'DD03', 'EE04'],
    'estado': ['São Paulo', 'São Paulo', 'Minas Gerais', 'Bahia', 'Pernambuco']
}) 

In [46]:
compras

Unnamed: 0,id_cliente,data_compra,valor_compra
0,AA00,2023-01-01,100
1,AA00,2023-01-05,150
2,BB01,2023-01-10,200
3,BB01,2023-01-15,250
4,BB01,2023-01-20,300
5,CC02,2023-01-25,350
6,CC02,2023-01-30,400
7,CC02,2023-02-01,450
8,CC02,2023-02-05,500
9,CC02,2023-02-10,550


In [47]:
geolocalizacao

Unnamed: 0,id_cliente,estado
0,AA00,São Paulo
1,BB01,São Paulo
2,CC02,Minas Gerais
3,DD03,Bahia
4,EE04,Pernambuco


In [48]:
compras_group = compras.groupby('id_cliente')['valor_compra'].sum().reset_index()
compras_group

Unnamed: 0,id_cliente,valor_compra
0,AA00,250
1,BB01,750
2,CC02,2250


In [49]:
compras_group_geo = compras_group.merge(geolocalizacao, on='id_cliente', how='left')
compras_group_geo

Unnamed: 0,id_cliente,valor_compra,estado
0,AA00,250,São Paulo
1,BB01,750,São Paulo
2,CC02,2250,Minas Gerais


In [50]:
compras_group_geo = compras_group_geo.groupby('estado')['valor_compra'].sum().reset_index()
compras_group_geo

Unnamed: 0,estado,valor_compra
0,Minas Gerais,2250
1,São Paulo,1000


In [51]:
df_A = pd.DataFrame({
    'key': [1, 2, 3],
    'value_A': ['A1', 'A2', 'A3']
})
df_A 

Unnamed: 0,key,value_A
0,1,A1
1,2,A2
2,3,A3


In [52]:
df_B = pd.DataFrame({
    'key': [1, 3, 4],
    'value_B': ['B1', 'B2', 'B3']
})
df_B

Unnamed: 0,key,value_B
0,1,B1
1,3,B2
2,4,B3


In [53]:
left_join = df_A.merge(df_B, on='key', how='left')
print("\nLeft Join:")
left_join


Left Join:


Unnamed: 0,key,value_A,value_B
0,1,A1,B1
1,2,A2,
2,3,A3,B2


In [54]:
right_join = df_A.merge(df_B, on='key', how='right')
print("\nRight Join:")
right_join


Right Join:


Unnamed: 0,key,value_A,value_B
0,1,A1,B1
1,3,A3,B2
2,4,,B3


In [55]:
inner_join = df_A.merge(df_B, on='key', how='inner')
print("\nInner Join:")
inner_join


Inner Join:


Unnamed: 0,key,value_A,value_B
0,1,A1,B1
1,3,A3,B2


In [56]:
outer_join = df_A.merge(df_B, on='key', how='outer')
print("\nOuter Join:")
outer_join


Outer Join:


Unnamed: 0,key,value_A,value_B
0,1,A1,B1
1,2,A2,
2,3,A3,B2
3,4,,B3


In [57]:
# Criando o DataFrame de compras de junho e julho

jun_jul = pd.DataFrame({
    'id_cliente': ['AA01', 'BB02', 'AA01', 'CC03', 'BB02', 'DD04'],
    'dt_compra': ['2023-06-05', '2023-06-15', '2023-06-25', '2023-07-05', '2023-07-15', '2023-07-25'],
    'vl_compra': [200.50, 155.75, 333.60, 450.00, 300.10, 250.00]
})


# Criando o DataFrame de compras de agosto e setembro

ago_set = pd.DataFrame({
    'id_cliente': ['EE05', 'EE05', 'FF06', 'GG07'],
    'dt_compra': ['2023-08-05', '2023-08-15', '2023-08-25', '2023-09-05'],
    'vl_compra': [205.55, 233.75, 550.65, 320.82]
})

In [58]:
jun_jul

Unnamed: 0,id_cliente,dt_compra,vl_compra
0,AA01,2023-06-05,200.5
1,BB02,2023-06-15,155.75
2,AA01,2023-06-25,333.6
3,CC03,2023-07-05,450.0
4,BB02,2023-07-15,300.1
5,DD04,2023-07-25,250.0


In [59]:
ago_set

Unnamed: 0,id_cliente,dt_compra,vl_compra
0,EE05,2023-08-05,205.55
1,EE05,2023-08-15,233.75
2,FF06,2023-08-25,550.65
3,GG07,2023-09-05,320.82


In [60]:
df_concat = pd.concat([jun_jul, ago_set])
df_concat

Unnamed: 0,id_cliente,dt_compra,vl_compra
0,AA01,2023-06-05,200.5
1,BB02,2023-06-15,155.75
2,AA01,2023-06-25,333.6
3,CC03,2023-07-05,450.0
4,BB02,2023-07-15,300.1
5,DD04,2023-07-25,250.0
0,EE05,2023-08-05,205.55
1,EE05,2023-08-15,233.75
2,FF06,2023-08-25,550.65
3,GG07,2023-09-05,320.82


### Intervalos discretos com cut e qcut

In [61]:
df = pd.DataFrame({'value': [1, 2, 3, 4, 5]})
df['bin'] = pd.cut(df['value'], bins=[0,3,5], labels=['Low', 'High'])
df

Unnamed: 0,value,bin
0,1,Low
1,2,Low
2,3,Low
3,4,High
4,5,High


In [63]:
df = pd.DataFrame({'value': [1, 2, 3, 4, 5]})
df['bin'] = pd.cut(df['value'], bins=[1,4,5], labels=['Low', 'High'])
df

Unnamed: 0,value,bin
0,1,
1,2,Low
2,3,Low
3,4,Low
4,5,High


In [65]:
df = pd.DataFrame({'value': [1, 2, 3, 4, 5]})
df['equals_bins'] = pd.cut(df['value'], bins=3)
df

Unnamed: 0,value,equals_bins
0,1,"(0.996, 2.333]"
1,2,"(0.996, 2.333]"
2,3,"(2.333, 3.667]"
3,4,"(3.667, 5.0]"
4,5,"(3.667, 5.0]"


In [66]:
# Criando campo novo sem rótulo para intervalos
df = pd.DataFrame({'value': [1, 2, 3, 4, 5]})
df['quantile_bin'] = pd.qcut(df['value'], q=2)
df

Unnamed: 0,value,quantile_bin
0,1,"(0.999, 3.0]"
1,2,"(0.999, 3.0]"
2,3,"(0.999, 3.0]"
3,4,"(3.0, 5.0]"
4,5,"(3.0, 5.0]"


In [67]:
# Criando campo novo sem rótulo para intervalos
df = pd.DataFrame({'value': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
df['custom_quantile_bin'] = pd.qcut(df['value'], q=[0, 0.1, 0.5, 1])
df

Unnamed: 0,value,custom_quantile_bin
0,1,"(0.999, 1.9]"
1,2,"(1.9, 5.5]"
2,3,"(1.9, 5.5]"
3,4,"(1.9, 5.5]"
4,5,"(1.9, 5.5]"
5,6,"(5.5, 10.0]"
6,7,"(5.5, 10.0]"
7,8,"(5.5, 10.0]"
8,9,"(5.5, 10.0]"
9,10,"(5.5, 10.0]"


### Método sample

In [68]:
# Amostra de 10 estudantes
amostra = stud.sample(n=10)
amostra

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
3129,1,1,1,9773,1,1,121.0,1,1,38,...,0,6,6,6,12.833333,0,11.1,0.6,2.02,Graduate
4071,1,1,2,9254,1,1,130.0,1,1,38,...,0,6,13,4,10.0,0,7.6,2.6,0.32,Dropout
3122,1,17,5,9670,1,1,125.0,1,19,30,...,0,5,6,4,11.75,1,7.6,2.6,0.32,Enrolled
816,1,17,1,9070,1,1,136.0,1,1,37,...,0,6,8,6,11.5,0,10.8,1.4,1.74,Enrolled
1677,5,39,1,9991,0,1,110.0,1,1,1,...,0,5,6,5,12.6,0,8.9,1.4,3.51,Graduate
1595,1,1,1,9670,1,1,127.0,1,1,37,...,0,5,6,4,11.0,0,7.6,2.6,0.32,Enrolled
3326,2,43,1,9991,0,1,140.0,1,37,37,...,7,10,12,8,12.0,0,9.4,-0.8,-3.12,Graduate
1749,1,43,1,9070,1,1,108.0,1,38,38,...,10,15,15,10,12.4,0,12.7,3.7,-1.7,Dropout
1269,1,39,1,9670,1,1,140.0,1,37,37,...,0,6,9,0,0.0,0,10.8,1.4,1.74,Dropout
375,2,51,1,8014,0,19,133.1,1,37,37,...,5,11,14,10,12.8,0,11.1,0.6,2.02,Enrolled


In [69]:
amostra_10_perc = stud.sample(frac=0.1)
amostra_10_perc

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
2115,1,17,2,9670,1,1,129.0,1,1,19,...,0,6,0,0,0.000000,0,8.9,1.4,3.51,Dropout
3996,1,1,1,9853,1,1,128.0,1,1,1,...,0,6,13,2,12.000000,0,12.7,3.7,-1.70,Dropout
3785,1,44,1,9003,1,39,150.0,1,1,38,...,0,6,17,3,11.666667,0,15.5,2.8,-4.06,Enrolled
1078,1,43,1,171,1,1,116.0,1,3,1,...,0,0,0,0,0.000000,0,13.9,-0.3,0.79,Dropout
1606,1,1,5,9254,1,1,101.0,1,3,19,...,0,6,6,6,13.000000,0,12.4,0.5,1.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3919,1,17,1,9500,1,1,121.0,1,1,1,...,0,8,8,7,12.910000,0,10.8,1.4,1.74,Graduate
832,1,43,1,9130,1,1,120.0,6,1,3,...,0,5,7,2,12.500000,2,10.8,1.4,1.74,Enrolled
1754,1,17,5,9119,1,1,123.0,1,3,1,...,0,5,5,0,0.000000,0,8.9,1.4,3.51,Dropout
735,1,17,2,9670,1,1,110.0,1,3,3,...,0,6,8,5,12.000000,0,11.1,0.6,2.02,Graduate


### Método nunique e unique

In [70]:
# Retorna os valores únicos
stud.Target.unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [71]:
# Retorna a quantidade de valores únicos
stud.Target.nunique()

3

In [75]:
# Lista de colunas com tipo de dado 'object' (geralmente texto ou string)
categorical_features = stud.select_dtypes(include=['object']).columns.tolist()
categorical_features

['Target']

In [76]:
# Lista de colunas com tipo de dado 'object' (geralmente texto ou string)
numerical_features = stud.select_dtypes(exclude=['object']).columns.tolist()
numerical_features

['Marital status',
 'Application mode',
 'Application order',
 'Course',
 'Daytime/evening attendance\t',
 'Previous qualification',
 'Previous qualification (grade)',
 'Nacionality',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Admission grade',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'Age at enrollment',
 'International',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)',
 'Unemployment rate',
 'Inflation r

In [79]:
# Supondo que df é o seu DataFrame e que seu limite é 10 valores únicos
unique_counts = stud.nunique()

categorical_features = unique_counts[unique_counts <= 10].index.to_list()
numerical_features = unique_counts[unique_counts > 10].index.to_list()

In [82]:
unique_counts

Marital status                                      6
Application mode                                   18
Application order                                   8
Course                                             17
Daytime/evening attendance\t                        2
Previous qualification                             17
Previous qualification (grade)                    101
Nacionality                                        21
Mother's qualification                             29
Father's qualification                             34
Mother's occupation                                32
Father's occupation                                46
Admission grade                                   620
Displaced                                           2
Educational special needs                           2
Debtor                                              2
Tuition fees up to date                             2
Gender                                              2
Scholarship holder          

In [80]:
categorical_features

['Marital status',
 'Application order',
 'Daytime/evening attendance\t',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'International',
 'Curricular units 2nd sem (without evaluations)',
 'Unemployment rate',
 'Inflation rate',
 'GDP',
 'Target']

In [81]:
numerical_features

['Application mode',
 'Course',
 'Previous qualification',
 'Previous qualification (grade)',
 'Nacionality',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Admission grade',
 'Age at enrollment',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)']

### Funções com apply

In [83]:
df = pd.DataFrame({'A': [1,2,3,4,5]})

df['quadrado'] = df['A'].apply(lambda x: x**2)

df

Unnamed: 0,A,quadrado
0,1,1
1,2,4
2,3,9
3,4,16
4,5,25


In [85]:
stud['new_GDP'] = stud.GDP.apply(lambda x: x*100)
stud[['GDP', 'new_GDP']].head()

Unnamed: 0,GDP,new_GDP
0,1.74,174.0
1,0.79,79.0
2,1.74,174.0
3,-3.12,-312.0
4,0.79,79.0


In [86]:
# Construindo dataframe
df = pd.DataFrame({
    'A': [1,2,3],
    'B': [4,5,6],
    'C': [7,8,9]
})

df

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [88]:
# Média das colunas
df.apply(lambda col: col.mean(), axis=1)

0    4.0
1    5.0
2    6.0
dtype: float64

In [89]:
# Soma das linhas
df.apply(lambda row: row.sum(), axis=0)

A     6
B    15
C    24
dtype: int64

### Tabelas de frequência cruzada

In [91]:
prod = pd.read_csv('../data/bike_store/products.csv')
prod.head()

Unnamed: 0,product_id,product_name,brand_id,category_id,model_year,list_price
0,1,Trek 820 - 2016,9,6,2016,379.99
1,2,Ritchey Timberwolf Frameset - 2016,5,6,2016,749.99
2,3,Surly Wednesday Frameset - 2016,8,6,2016,999.99
3,4,Trek Fuel EX 8 29 - 2016,9,6,2016,2899.99
4,5,Heller Shagamaw Frame - 2016,3,6,2016,1320.99


In [92]:
pd.crosstab(prod['model_year'], prod['brand_id'])

brand_id,1,2,3,4,5,6,7,8,9
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016,12,0,1,3,1,0,0,4,5
2017,16,10,0,0,0,0,23,8,28
2018,90,0,2,0,0,3,0,13,96
2019,0,0,0,0,0,0,0,0,6


In [93]:
pd.crosstab(prod['model_year'], prod['brand_id'], margins=True)

brand_id,1,2,3,4,5,6,7,8,9,All
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016,12,0,1,3,1,0,0,4,5,26
2017,16,10,0,0,0,0,23,8,28,85
2018,90,0,2,0,0,3,0,13,96,204
2019,0,0,0,0,0,0,0,0,6,6
All,118,10,3,3,1,3,23,25,135,321


In [94]:
# Percentual da linha
pd.crosstab(prod['model_year'], prod['brand_id']).apply(lambda r: r/r.sum(), axis=1)

brand_id,1,2,3,4,5,6,7,8,9
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016,0.461538,0.0,0.038462,0.115385,0.038462,0.0,0.0,0.153846,0.192308
2017,0.188235,0.117647,0.0,0.0,0.0,0.0,0.270588,0.094118,0.329412
2018,0.441176,0.0,0.009804,0.0,0.0,0.014706,0.0,0.063725,0.470588
2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [95]:
# Percentual da coluna
pd.crosstab(prod['model_year'], prod['brand_id']).apply(lambda r: r/r.sum(), axis=0)

brand_id,1,2,3,4,5,6,7,8,9
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016,0.101695,0.0,0.333333,1.0,1.0,0.0,0.0,0.16,0.037037
2017,0.135593,1.0,0.0,0.0,0.0,0.0,1.0,0.32,0.207407
2018,0.762712,0.0,0.666667,0.0,0.0,1.0,0.0,0.52,0.711111
2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044444


### Valores ausente com isna()

In [96]:
df = pd.DataFrame({
    'A': [1, 2, None, 4, 5],
    'B': ['A', None, 'C', None, 'E']
})

df

Unnamed: 0,A,B
0,1.0,A
1,2.0,
2,,C
3,4.0,
4,5.0,E


In [97]:
df.isna()

Unnamed: 0,A,B
0,False,False
1,False,True
2,True,False
3,False,True
4,False,False


In [98]:
# Ausentes por coluna (soma)
df.isna().sum()

A    1
B    2
dtype: int64

In [100]:
# Ausentes por coluna (percentual)
df.isna().mean()

A    0.2
B    0.4
dtype: float64

In [101]:
# Filtra missing na coluna A
df[df['A'].isna()]

Unnamed: 0,A,B
2,,C


In [102]:
# Filtra missing na coluna B
df[df['B'].isna()]

Unnamed: 0,A,B
1,2.0,
3,4.0,


### Filtrando dados com query()

In [103]:
prod.query('list_price > 5500')

Unnamed: 0,product_id,product_name,brand_id,category_id,model_year,list_price
49,50,Trek Silque SLR 7 Women's - 2017,9,7,2017,5999.99
50,51,Trek Silque SLR 8 Women's - 2017,9,7,2017,6499.99
148,149,Trek Domane SLR 8 Disc - 2018,9,7,2018,7499.99
154,155,Trek Domane SLR 9 Disc - 2018,9,7,2018,11999.99
155,156,Trek Domane SL Frameset - 2018,9,7,2018,6499.99
156,157,Trek Domane SL Frameset Women's - 2018,9,7,2018,6499.99
168,169,Trek Emonda SLR 8 - 2018,9,7,2018,6499.99


In [104]:
prod.query('list_price > 5500 & model_year == 2018')

Unnamed: 0,product_id,product_name,brand_id,category_id,model_year,list_price
148,149,Trek Domane SLR 8 Disc - 2018,9,7,2018,7499.99
154,155,Trek Domane SLR 9 Disc - 2018,9,7,2018,11999.99
155,156,Trek Domane SL Frameset - 2018,9,7,2018,6499.99
156,157,Trek Domane SL Frameset Women's - 2018,9,7,2018,6499.99
168,169,Trek Emonda SLR 8 - 2018,9,7,2018,6499.99


In [105]:
preco = 5550
prod.query('list_price > @preco & model_year == 2018')

Unnamed: 0,product_id,product_name,brand_id,category_id,model_year,list_price
148,149,Trek Domane SLR 8 Disc - 2018,9,7,2018,7499.99
154,155,Trek Domane SLR 9 Disc - 2018,9,7,2018,11999.99
155,156,Trek Domane SL Frameset - 2018,9,7,2018,6499.99
156,157,Trek Domane SL Frameset Women's - 2018,9,7,2018,6499.99
168,169,Trek Emonda SLR 8 - 2018,9,7,2018,6499.99
