Attributes of Pandas objects

In [1]:
import pandas as pd
import numpy as np
# Seed the legacy global NumPy RNG so that every random draw in this notebook
# is reproducible after "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

# 8x3 frame of standard-normal draws with an explicit 1-based integer index.
df = pd.DataFrame(np.random.randn(8, 3), index=[1, 2, 3, 4, 5, 6, 7, 8], columns=['A', 'B', 'C'])
# Lower-case the column labels with the vectorised string accessor on the Index.
df.columns = df.columns.str.lower()
df

Unnamed: 0,a,b,c
1,-0.390794,0.445238,-2.282914
2,0.754265,-0.795424,1.013301
3,0.076105,-0.566549,0.821417
4,0.444199,-0.367382,1.031378
5,-0.763342,-0.372069,-0.070167
6,0.134723,-0.710616,-0.417648
7,0.65153,-0.400741,1.361412
8,0.797227,-0.836956,-1.560932


In [2]:
# .array exposes the extension array that backs column 'a'.
df['a'].array

<PandasArray>
[-0.39079403073851343,   0.7542646886888844,  0.07610490588479413,
   0.4441986966000867,  -0.7633417861390356,  0.13472285226947736,
   0.6515300296472424,   0.7972273576686147]
Length: 8, dtype: float64

In [3]:
# 50 random integers drawn from 0..6 (the upper bound 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)
data

array([2, 5, 3, 6, 5, 1, 5, 2, 4, 2, 1, 2, 3, 2, 6, 1, 0, 2, 4, 2, 4, 4,
       1, 3, 6, 3, 6, 4, 3, 0, 4, 2, 1, 2, 2, 5, 2, 4, 2, 6, 4, 3, 5, 1,
       5, 3, 4, 3, 1, 4])

In [9]:
# Wrap the raw array in a Series: pandas attaches a default integer index
# (printed on the left) to the values (printed on the right).
s = pd.Series(data=data)
# Count how often each distinct value occurs, most frequent first.
s.value_counts(sort=True)

2    12
4    10
3     8
1     7
5     6
6     5
0     2
dtype: int64

In [18]:
# 3 and 7 both occur three times, so mode() returns both tied values.
s5 = pd.Series(data=[1, 1,
                     3, 3, 3,
                     5, 5,
                     7, 7, 7,
                     8, 8])
s5.mode()

0    3
1    7
dtype: int64

In [25]:
# Two independent columns of random integers:
#   A is drawn from 0..6, B from -10..14 (upper bounds are exclusive).
df5 = pd.DataFrame(
    {
        "A": np.random.randint(low=0, high=7, size=50),
        "B": np.random.randint(low=-10, high=15, size=50),
    }
)

In [26]:
# Mode of each column. A column with fewer tied modes than another is padded
# with NaN, as happens for B in the saved output.
df5.mode()

Unnamed: 0,A,B
0,2,5.0
1,3,


Altering Labels

In [27]:
# Five standard-normal draws labelled 'a' through 'e'.
s = pd.Series(np.random.randn(5), index=list('abcde'))
s

a    0.983284
b    1.112316
c    1.684558
d   -0.291741
e    2.441232
dtype: float64

In [29]:
# reindex returns a new Series in the requested label order; a label that is
# absent from s ('f') gets NaN. s itself is left untouched.
s.reindex(index=['e', 'b', 'f', 'd'])

e    2.441232
b    1.112316
f         NaN
d   -0.291741
dtype: float64

In [31]:
s # still has its original labels a-e: the reindex result was never assigned back

a    0.983284
b    1.112316
c    1.684558
d   -0.291741
e    2.441232
dtype: float64

In [32]:
# Three Series with partially overlapping labels. The DataFrame constructor
# aligns them on the union of their indexes and fills the gaps with NaN.
df = pd.DataFrame(
    {
        'one': pd.Series(np.random.randn(3), index=list('abc')),
        'two': pd.Series(np.random.randn(4), index=list('abcd')),
        'three': pd.Series(np.random.randn(3), index=list('bcd')),
    }
)

In [33]:
# Reorder rows and columns; the unknown row label 'f' becomes an all-NaN row.
(
    df
    .reindex(['c', 'f', 'b'], axis='index')
    .reindex(['three', 'two', 'one'], axis='columns')
)

Unnamed: 0,three,two,one
c,-0.837545,0.193789,-0.311827
f,,,
b,0.873671,-0.760864,1.061171


In [34]:
# Row-only reindex, written with the `index=` keyword instead of `axis='index'`.
df.reindex(index=['c', 'f', 'b'])

Unnamed: 0,one,two,three
c,-0.311827,0.193789,-0.837545
f,,,
b,1.061171,-0.760864,0.873671


dropping labels from axis

In [35]:
# Show the frame as it stands before any labels are dropped.
df

Unnamed: 0,one,two,three
a,2.004031,0.852024,
b,1.061171,-0.760864,0.873671
c,-0.311827,0.193789,-0.837545
d,,0.380606,-0.774782


In [36]:
# Drop rows 'a' and 'd'. drop() returns a new frame; df is not modified.
df.drop(index=['a', 'd'])

Unnamed: 0,one,two,three
b,1.061171,-0.760864,0.873671
c,-0.311827,0.193789,-0.837545


In [37]:
# Drop column 'one'. drop() returns a new frame; df is not modified.
df.drop(columns=['one'])

Unnamed: 0,two,three
a,0.852024,
b,-0.760864,0.873671
c,0.193789,-0.837545
d,0.380606,-0.774782


renaming

In [39]:
# The float Series labelled a-e, shown here before its labels are renamed.
s

a    0.983284
b    1.112316
c    1.684558
d   -0.291741
e    2.441232
dtype: float64

In [40]:
# A callable passed to rename() is applied to every index label.
s.rename(lambda label: label.upper())

A    0.983284
B    1.112316
C    1.684558
D   -0.291741
E    2.441232
dtype: float64

In [41]:
# Mapping-based rename: labels missing from a mapping (row 'c', column
# 'three') are kept as they are.
df.rename(
    index=dict(a='apple', b='banana', d='durian'),
    columns=dict(one='foo', two='bar'),
)

Unnamed: 0,foo,bar,three
apple,2.004031,0.852024,
banana,1.061171,-0.760864,0.873671
c,-0.311827,0.193789,-0.837545
durian,,0.380606,-0.774782


.dt and .str accessors

In [53]:
# Four consecutive daily timestamps, all at 09:10:12, starting on 2013-01-01.
s = pd.Series(pd.date_range(start='20130101 09:10:12', periods=4))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [52]:
# The .dt accessor extracts a datetime component from every element;
# each timestamp in s is at 09:10:12, so the hour is 9 throughout.
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [51]:
# Print several datetime components from one cell.
print(s.dt.second)
print(s.dt.day)
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# The ISO week number is now read from the frame returned by isocalendar().
print(s.dt.isocalendar().week)

0    12
1    12
2    12
3    12
dtype: int64
0    1
1    2
2    3
3    4
dtype: int64
0    1
1    1
2    1
3    1
dtype: int64


In [54]:
# Attach a timezone to the naive timestamps: the wall-clock time stays the
# same and only the UTC offset is added.
stz = s.dt.tz_localize(tz='US/Eastern')
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [55]:
# Treat the naive timestamps as UTC, then convert them to US/Eastern.
# Unlike tz_localize alone, this shifts the wall-clock time.
(
    s.dt.tz_localize(tz='UTC')
     .dt.tz_convert(tz='US/Eastern')
)

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

.str accessor

In [58]:
# A Series using the dedicated nullable "string" dtype; the np.nan entry is
# stored as <NA>. The .str accessor then offers vectorised string methods
# that are applied element-wise.
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    dtype="string",
)

In [57]:
# Element-wise lower-casing; the missing entry stays <NA> instead of raising.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

sorting

In [60]:
# Rebuild a small frame from three Series with partially overlapping labels;
# labels that are missing from a Series show up as NaN.
df = pd.DataFrame(
    {
        'one': pd.Series(np.random.randn(3), index=list('abc')),
        'two': pd.Series(np.random.randn(4), index=list('abcd')),
        'three': pd.Series(np.random.randn(3), index=list('bcd')),
    }
)
df

Unnamed: 0,one,two,three
a,-0.244818,-0.660639,
b,0.038707,-1.341393,-1.249187
c,-0.113314,-0.245681,-0.51008
d,,-0.114093,-1.198689


In [63]:
# Shuffle both axes on purpose so the sorting examples have something to do.
unsorted_df = (
    df
    .reindex(['a', 'd', 'c', 'b'], axis='index')
    .reindex(['three', 'two', 'one'], axis='columns')
)
unsorted_df

Unnamed: 0,three,two,one
a,,-0.660639,-0.244818
d,-1.198689,-0.114093,
c,-0.51008,-0.245681,-0.113314
b,-1.249187,-1.341393,0.038707


In [64]:
# Sort rows by index label in ascending order: a, b, c, d.
unsorted_df.sort_index(axis=0)

Unnamed: 0,three,two,one
a,,-0.660639,-0.244818
b,-1.249187,-1.341393,0.038707
c,-0.51008,-0.245681,-0.113314
d,-1.198689,-0.114093,


In [66]:
# Sort rows by index label in descending order: d, c, b, a.
unsorted_df.sort_index(axis=0, ascending=False)

Unnamed: 0,three,two,one
d,-1.198689,-0.114093,
c,-0.51008,-0.245681,-0.113314
b,-1.249187,-1.341393,0.038707
a,,-0.660639,-0.244818


In [68]:
# Sort the column labels alphabetically (one, three, two); row order is kept.
unsorted_df.sort_index(axis='columns')

Unnamed: 0,one,three,two
a,-0.244818,,-0.660639
d,,-1.198689,-0.114093
c,-0.113314,-0.51008,-0.245681
b,0.038707,-1.249187,-1.341393


In [69]:
# Select the 'three' column as a Series and sort it by its index labels.
unsorted_df.loc[:, 'three'].sort_index()

a         NaN
b   -1.249187
c   -0.510080
d   -1.198689
Name: three, dtype: float64

sort by values

In [70]:
# Small integer frame for the sort-by-value examples; column 'one' contains
# ties so that a secondary sort key has a visible effect.
df1 = pd.DataFrame(
    dict(
        one=[2, 1, 1, 1],
        two=[1, 3, 2, 4],
        three=[5, 4, 3, 2],
    )
)

In [72]:
# Order the rows by column 'two'; whole rows move together.
df1.sort_values('two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [74]:
# Sort by "one" first and break ties with "two". Selecting all three columns
# beforehand is unnecessary because df1 already holds exactly those columns
# in that order.
df1.sort_values(by=['one', 'two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [77]:
# Blank out the entry labelled 2 so that the string Series holds two missing
# values for the NA-placement examples. The string dtype stores it as <NA>.
s.loc[2] = pd.NA
s

0       A
1       B
2    <NA>
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [79]:
# Ascending sort of the strings (upper-case sorts before lower-case);
# missing values are placed last, which is the default.
s.sort_values(na_position='last')

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [78]:
# Same ascending sort, with the missing values moved to the top.
s.sort_values(ascending=True, na_position='first')

2    <NA>
5    <NA>
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: string

In [80]:
# Build a two-level MultiIndex from (letter, number) pairs. The pairs are
# deliberately not in sorted order.
idx = pd.MultiIndex.from_tuples(
    list(zip(['a', 'a', 'a', 'b', 'b', 'b'],
             [1, 2, 2, 2, 1, 1]))
)

In [82]:
# Name the two levels so they can be referred to by name later on.
idx = idx.set_names(['first', 'second'])
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           names=['first', 'second'])

In [84]:
# One column 'A' counting down 6..1, indexed by the two-level MultiIndex.
df_multi = pd.DataFrame(
    data={'A': np.arange(6, 0, -1)},
    index=idx,
)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [85]:
# sort_values accepts index level names alongside column names: order by the
# 'second' index level first, then by column 'A' within each group.
df_multi.sort_values(['second', 'A'], ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
