In [2]:
import numpy as np
import pandas as pd

In [3]:
# Build an 8x3 frame of standard-normal samples with lower-cased column labels.
df = (
    pd.DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'])
    .rename(columns=str.lower)
)
df

Unnamed: 0,a,b,c
0,-0.705592,0.976007,-0.382187
1,-1.090178,-2.125555,-0.225922
2,-0.734502,-0.516757,-0.51911
3,0.139371,0.808358,-0.874819
4,0.47035,-1.271218,0.7047
5,0.094886,0.047937,-0.272556
6,-1.45106,-1.682724,-0.091887
7,-0.057641,-0.930105,-1.276871


In [4]:
# Bracket lookup of column 'a', then its backing array via `.array`.
df['a'].array

<PandasArray>
[ -0.7055923468499904,   -1.090178083499323,  -0.7345015905247565,
  0.13937138425454879,  0.47035030268123906,  0.09488573978879021,
  -1.4510598698660138, -0.05764099182948282]
Length: 8, dtype: float64

In [5]:
# Draw 50 random integers from 0 through 6 (the upper bound 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)
data

array([1, 6, 0, 5, 0, 2, 3, 6, 3, 1, 1, 4, 0, 2, 4, 1, 6, 2, 2, 0, 0, 1,
       6, 5, 6, 4, 2, 2, 2, 4, 2, 4, 6, 6, 5, 0, 3, 1, 0, 1, 0, 6, 5, 1,
       3, 1, 4, 3, 1, 4])

In [6]:
# Wrap the random integers in a Series and tally how often each value occurs.
s = pd.Series(data=data)
s.value_counts()

1    10
6     8
0     8
2     8
4     7
3     5
5     4
dtype: int64

In [7]:
# 3 and 7 each occur three times, so this Series has two modes.
s5 = pd.Series(data=[1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()

0    3
1    7
dtype: int64

In [9]:
# Two independent random integer columns. mode() is computed per column, and a
# column with fewer modes than another is padded with NaN.
df5 = pd.DataFrame(
    dict(
        A=np.random.randint(0, 7, size=50),
        B=np.random.randint(-10, 15, size=50),
    )
)

df5.mode()

Unnamed: 0,A,B
0,1,-4.0
1,5,
2,6,


In [11]:
# reindex conforms the Series to a new list of labels: existing labels are
# reordered, and a label missing from the original ('f') gets NaN.
s = pd.Series(np.random.randn(5), index=list('abcde'))
print(s)
s.reindex(index=['e', 'b', 'f', 'd'])

a    1.383423
b    1.349725
c   -0.310120
d    0.987694
e   -1.075781
dtype: float64


e   -1.075781
b    1.349725
f         NaN
d    0.987694
dtype: float64

In [12]:
# Three Series with partially overlapping labels; the resulting frame has one
# row per label that appears in any of them, with NaN where a Series has none.
df = pd.DataFrame(
    dict(
        one=pd.Series(np.random.randn(3), index=list('abc')),
        two=pd.Series(np.random.randn(4), index=list('abcd')),
        three=pd.Series(np.random.randn(3), index=list('bcd')),
    )
)

In [13]:
# Pick rows c, f, b and columns three, two, one (in that order) in one call.
df.reindex(
    index=['c', 'f', 'b'],
    columns=['three', 'two', 'one'],
)

Unnamed: 0,three,two,one
c,-0.453974,0.712448,0.490871
f,,,
b,-0.954507,1.228449,-0.412847


In [14]:
# Same row selection, written with the `index=` keyword instead of `axis=`.
df.reindex(index=['c', 'f', 'b'])

Unnamed: 0,one,two,three
c,0.490871,0.712448,-0.453974
f,,,
b,-0.412847,1.228449,-0.954507


In [15]:
# drop returns a new frame without the given row labels; `df` is not modified.
# (A bare `df` line used to precede this call; it was not the last expression
# of the cell, so it displayed nothing and has been removed.)
df.drop(index=['a', 'd'])

Unnamed: 0,one,two,three
b,-0.412847,1.228449,-0.954507
c,0.490871,0.712448,-0.453974


# Accessors

In [24]:
# Four consecutive daily timestamps; the `.dt` accessor used in the following
# cells exposes date/time properties and methods of such a Series.
s = pd.Series(pd.date_range(start='20130101 09:10:12', periods=4))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [18]:
# Hour component of each timestamp in the Series.
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [19]:
# Attach the US/Eastern zone to the naive timestamps; the wall-clock time is kept.
stz = s.dt.tz_localize(tz="US/Eastern")
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [20]:
# Time zone object attached to the localized Series.
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [21]:
# Treat the naive timestamps as UTC, then express the same instants in US/Eastern.
(
    s.dt.tz_localize('UTC')
    .dt.tz_convert('US/Eastern')
)

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [22]:
# Text Series using the dedicated "string" dtype, with one missing entry.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    dtype="string",
)

In [23]:
# The .str accessor applies string methods element-wise to the Series;
# here lower() is applied to each value and the missing entry stays <NA>.

s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

# Sorting

## By index

In [26]:
# Frame assembled from three Series whose labels only partly overlap, so some
# cells come out as NaN.
df = pd.DataFrame(
    dict(
        one=pd.Series(np.random.randn(3), index=list('abc')),
        two=pd.Series(np.random.randn(4), index=list('abcd')),
        three=pd.Series(np.random.randn(3), index=list('bcd')),
    )
)

In [27]:
# Put rows and columns in a non-sorted order so the sort calls have work to do.
unsorted_df = df.reindex(
    index=list('adcb'),
    columns=['three', 'two', 'one'],
)

In [28]:
# Display the frame in its unsorted row/column order.
unsorted_df

Unnamed: 0,three,two,one
a,,0.633399,-0.108516
d,0.327108,0.506221,
c,-2.576694,0.175545,-1.709815
b,-1.709173,-1.941317,0.731643


In [29]:
# Order the rows by their index labels (ascending is the default).
unsorted_df.sort_index(axis=0)

Unnamed: 0,three,two,one
a,,0.633399,-0.108516
b,-1.709173,-1.941317,0.731643
c,-2.576694,0.175545,-1.709815
d,0.327108,0.506221,


In [30]:
# Order the rows by their index labels, highest label first.
unsorted_df.sort_index(axis=0, ascending=False)

Unnamed: 0,three,two,one
d,0.327108,0.506221,
c,-2.576694,0.175545,-1.709815
b,-1.709173,-1.941317,0.731643
a,,0.633399,-0.108516


In [32]:
# Order the columns by their labels; the row order is left as it is.
unsorted_df.sort_index(axis='columns')

Unnamed: 0,one,three,two
a,-0.108516,,0.633399
d,,0.327108,0.506221
c,-1.709815,-2.576694,0.175545
b,0.731643,-1.709173,-1.941317


In [33]:
# Take the single column 'three' as a Series and order it by index label.
unsorted_df.loc[:, 'three'].sort_index()

a         NaN
b   -1.709173
c   -2.576694
d    0.327108
Name: three, dtype: float64

In [34]:
# Small integer frame used for the sort-by-value examples.
df1 = pd.DataFrame(
    dict(
        one=[2, 1, 1, 1],
        two=[1, 3, 2, 4],
        three=[5, 4, 3, 2],
    )
)

In [35]:
# Order the rows by the values in column 'two'.
df1.sort_values('two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [36]:
# Rebuild the datetime Series here: the name `s` was last bound to the string
# Series, so on a fresh Restart & Run All this cell would otherwise blank out
# a string instead of a timestamp and the sorted output would no longer be dates.
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))
# Blank out the third entry so the sorts have a missing value (NaT) to place.
s.loc[2] = pd.NaT

In [37]:
# Sort by value; missing entries go to the end (the default position).
s.sort_values(na_position='last')

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
3   2013-01-04 09:10:12
2                   NaT
dtype: datetime64[ns]

In [38]:
# Sort by value, placing missing entries at the start instead of the end.
s.sort_values(na_position='first')

2                   NaT
0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [39]:
# Two-level index with repeated entries, deliberately not in sorted order.
idx = pd.MultiIndex.from_tuples(
    [
        ('a', 1), ('a', 2), ('a', 2),
        ('b', 2), ('b', 1), ('b', 1),
    ]
)

In [40]:
# Name the two index levels so they can be referred to by name later.
idx = idx.set_names(['first', 'second'])

In [41]:
# One column counting down from 6 to 1, indexed by the two-level index.
df_multi = pd.DataFrame(
    data={'A': np.arange(6, 0, -1)},
    index=idx,
)

In [42]:
# Display the multi-indexed frame before sorting.
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [43]:
# `by=` mixes an index level name with a column label here: rows are ordered
# by the 'second' index level first, then by column 'A'.
df_multi.sort_values(by=['second', 'A'], ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
