In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({'k1': ['one']*3 + ['two']*4,
                   'k2': [1,1,2,3,3,4,4]})

In [3]:
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [4]:
df.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [5]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [6]:
df['v1'] = np.arange(7)

In [7]:
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [8]:
df.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [9]:
df.drop_duplicates(['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [10]:
df.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


In [11]:
df.drop_duplicates(['k1','k2'], keep='first')

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [13]:
df2 = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon',
                             'pastrami', 'honey ham', 'nova lox'],
                    'onunces': [4,3,12,6,7.5,8,3,5,6]})

In [14]:
df2

Unnamed: 0,food,onunces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [15]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [16]:
df2['animal'] = df2['food'].apply(lambda x: meat_to_animal[x.lower()])

In [17]:
df2

Unnamed: 0,food,onunces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [18]:
s = pd.Series([1., -999., 2., -999., -1000., 3.])

In [19]:
s

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [20]:
s2 = s.replace(-999, np.nan)

In [22]:
s2

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [23]:
s2.replace(np.nan, 0)

0       1.0
1       0.0
2       2.0
3       0.0
4   -1000.0
5       3.0
dtype: float64

In [24]:
s2.fillna(0)

0       1.0
1       0.0
2       2.0
3       0.0
4   -1000.0
5       3.0
dtype: float64

In [25]:
df3 = pd.DataFrame({'id': [1,2,3,4,5,6], 'raw_grade': ['a','b','b','a','a','e']})

In [26]:
df3

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [27]:
df3['grade'] = df3['raw_grade'].astype('category')

In [28]:
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [29]:
df3['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [32]:
df3['grade'].cat.categories

Index(['a', 'b', 'e'], dtype='object')

In [33]:
df3['grade'].cat.categories = ['very good', 'good', 'very bad']

In [34]:
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [36]:
df3['grade'] = df3['grade'].cat.set_categories(['very bad','bad','medium','good','very good'])

In [37]:
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [38]:
df3['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [39]:
df3.sort_values(by='grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [40]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]

In [41]:
cats = pd.cut(ages, bins)

In [42]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [43]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [44]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [45]:
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [46]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [47]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [48]:
data = np.random.randn(20)

In [49]:
data

array([  3.85034869e+00,  -7.74915656e-01,   8.63852114e-01,
         4.34326572e-01,  -3.56942483e-01,   1.91401259e+00,
         5.43456196e-01,   2.27256764e+00,  -1.06009164e+00,
         2.23114822e+00,   2.37916072e-01,   1.15734629e+00,
         1.36970324e+00,  -1.73523605e-03,   1.94979836e+00,
         1.13216077e+00,   2.42792624e-01,  -2.50150191e-01,
         1.22815074e+00,   6.38226818e-01])

In [50]:
pd.cut(data, 4, precision=2)

[(2.62, 3.85], (-1.07, 0.17], (0.17, 1.4], (0.17, 1.4], (-1.07, 0.17], ..., (0.17, 1.4], (0.17, 1.4], (-1.07, 0.17], (0.17, 1.4], (0.17, 1.4]]
Length: 20
Categories (4, interval[float64]): [(-1.07, 0.17] < (0.17, 1.4] < (1.4, 2.62] < (2.62, 3.85]]

In [51]:
data2 = np.random.randn(1000)

In [52]:
data2[:5]

array([ 1.41117939,  1.21486215, -1.87213721, -1.59562858,  0.10912874])

In [53]:
cats = pd.qcut(data2, 4)

In [54]:
cats

[(0.744, 3.181], (0.744, 3.181], (-3.268, -0.691], (-3.268, -0.691], (0.00349, 0.744], ..., (0.00349, 0.744], (-3.268, -0.691], (0.00349, 0.744], (0.00349, 0.744], (-3.268, -0.691]]
Length: 1000
Categories (4, interval[float64]): [(-3.268, -0.691] < (-0.691, 0.00349] < (0.00349, 0.744] < (0.744, 3.181]]

In [55]:
pd.qcut?