In [2]:
import pandas as pd
import numpy as np

In [6]:
df = pd.DataFrame({'k1': ['um', 'dois']*3 + ['dois'],
                   'k2': [1,1,2,3,3,4,4]
                  })
df

Unnamed: 0,k1,k2
0,um,1
1,dois,1
2,um,2
3,dois,3
4,um,3
5,dois,4
6,dois,4


In [7]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [8]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,um,1
1,dois,1
2,um,2
3,dois,3
4,um,3
5,dois,4


In [9]:
dict_a = {'alimentos': ['bacon', 'ovo', 'laranja', 'toucinho', 'picanha', 'limão'],
          'qtd': [10, 1, 1, 20, 30, 20]
         }
df2 = pd.DataFrame(data=dict_a)
df2

Unnamed: 0,alimentos,qtd
0,bacon,10
1,ovo,1
2,laranja,1
3,toucinho,20
4,picanha,30
5,limão,20


In [10]:
dict_alimento_tipo = {
    'bacon': 'porco',
    'ovo': 'ave',
    'laranja': 'fruta',
    'toucinho': 'porco',
    'picanha': 'boi',
    'limão': 'fruta'
}

In [12]:
df2['tipo'] = df2['alimentos'].map(dict_alimento_tipo)
df2

Unnamed: 0,alimentos,qtd,tipo
0,bacon,10,porco
1,ovo,1,ave
2,laranja,1,fruta
3,toucinho,20,porco
4,picanha,30,boi
5,limão,20,fruta


In [13]:
s1 = pd.Series([1., -999, -1000, -999, -1000., 3.])

In [14]:
s1

0       1.0
1    -999.0
2   -1000.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [15]:
s1.replace([-999,-1000], 0)

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
5    3.0
dtype: float64

In [17]:
idades = [20,22,25,27,21,23,37,31,61,45,41,32]

In [18]:
intervalos = [18,25,35,60,100]

In [19]:
categorias = pd.cut(idades,intervalos)

In [20]:
categorias

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [21]:
categorias.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [25]:
categorias.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [26]:
label_categ = ['jovem', 'jovem adulto', 'adulto', 'senior']

In [29]:
classes = pd.cut(idades, intervalos, labels=label_categ)

In [30]:
classes

['jovem', 'jovem', 'jovem', 'jovem adulto', 'jovem', ..., 'jovem adulto', 'senior', 'adulto', 'adulto', 'jovem adulto']
Length: 12
Categories (4, object): ['jovem' < 'jovem adulto' < 'adulto' < 'senior']

In [31]:
pd.value_counts(classes)

jovem           5
jovem adulto    3
adulto          3
senior          1
dtype: int64

In [32]:
#tratamento de outliers
darray = pd.DataFrame(np.random.randn(1000,4))

In [33]:
darray.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.073627,-0.033275,0.026741,0.018519
std,1.010277,1.032154,1.017073,0.975117
min,-3.062247,-3.237608,-3.031508,-2.91341
25%,-0.758451,-0.735487,-0.672954,-0.61595
50%,-0.058262,-0.016436,0.045986,-0.041536
75%,0.558396,0.692082,0.744549,0.632976
max,3.489405,2.959996,2.949937,3.323118


In [34]:
#encontrar na coluna de índice 2 valores com módulo maior que 3
col = darray[2]

In [36]:
col[np.abs(col) > 3]

5   -3.031508
Name: 2, dtype: float64

In [37]:
#linhas com valores maiores que 3 ou -3
darray[(np.abs(darray) > 3).any(1)]

Unnamed: 0,0,1,2,3
5,-3.062247,-1.013566,-3.031508,-0.409121
251,0.7739,-3.237608,0.28944,0.866802
525,0.652107,1.4657,1.12845,3.323118
754,0.134021,-3.197658,-0.269621,-0.441218
865,3.489405,0.242932,1.297426,-0.495738


In [40]:
#eliminar valores fora do intervalo -3 a 3:
print(np.abs(darray) > 3)
print(np.sign(darray) * 3)
darray[np.abs(darray) > 3] = np.sign(darray) * 3 # o que for True fica limitado a e 3 ou -3, de acordo com o sinal
darray

         0      1      2      3
0    False  False  False  False
1    False  False  False  False
2    False  False  False  False
3    False  False  False  False
4    False  False  False  False
..     ...    ...    ...    ...
995  False  False  False  False
996  False  False  False  False
997  False  False  False  False
998  False  False  False  False
999  False  False  False  False

[1000 rows x 4 columns]
       0    1    2    3
0   -3.0  3.0  3.0  3.0
1   -3.0 -3.0 -3.0  3.0
2   -3.0 -3.0  3.0 -3.0
3   -3.0  3.0 -3.0 -3.0
4   -3.0  3.0 -3.0  3.0
..   ...  ...  ...  ...
995  3.0 -3.0  3.0 -3.0
996  3.0 -3.0  3.0  3.0
997 -3.0  3.0 -3.0 -3.0
998 -3.0 -3.0  3.0 -3.0
999 -3.0  3.0 -3.0  3.0

[1000 rows x 4 columns]


Unnamed: 0,0,1,2,3
0,-1.030804,0.385893,0.232725,1.598431
1,-0.813593,-0.806688,-0.155347,0.467307
2,-1.183622,-1.425484,0.175987,-0.404262
3,-0.089002,0.081261,-0.547435,-0.552994
4,-1.279639,1.110969,-0.953672,0.925623
...,...,...,...,...
995,0.409468,-0.459771,1.085293,-0.585430
996,2.436777,-0.789396,1.211573,0.425739
997,-1.059426,0.449552,-2.365681,-0.091691
998,-0.837116,-0.585860,0.046492,-0.841410


In [41]:
darray.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.074054,-0.03284,0.026773,0.018196
std,1.008484,1.030855,1.016979,0.974074
min,-3.0,-3.0,-3.0,-2.91341
25%,-0.758451,-0.735487,-0.672954,-0.61595
50%,-0.058262,-0.016436,0.045986,-0.041536
75%,0.558396,0.692082,0.744549,0.632976
max,3.0,2.959996,2.949937,3.0
