# Chapter 5: Getting Started with pandas

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 5.1: Introduction to pandas Data Structures

### Series

In [2]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.array

<PandasArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
pd.Series([1,'no',2])

0     1
1    no
2     2
dtype: object

In [6]:
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [8]:
obj2["a"]

-5

In [9]:
obj3=obj2

In [10]:
obj3

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
obj3["a"]

-5

In [12]:
obj3["a"]="hello"

In [13]:
obj3

d        4
b        7
a    hello
c        3
dtype: object

In [14]:
obj2

d        4
b        7
a    hello
c        3
dtype: object

In [15]:
obj2[["c","b","d"]]

c    3
b    7
d    4
dtype: object

In [16]:
obj2[obj2>0]

TypeError: '>' not supported between instances of 'str' and 'int'

In [None]:
obj2["a"]=-8

In [None]:
obj2

d     4
b     7
a    -8
c     3
dtype: object

In [None]:
obj2[obj2>0]

d    4
b    7
c    3
dtype: object

In [None]:
obj2*2

d      8
b     14
a    -16
c      6
dtype: object

In [None]:
np.exp(obj2)

TypeError: loop of ufunc does not support argument 0 of type int which has no callable exp method

In [None]:
obj2

d     4
b     7
a    -8
c     3
dtype: object

In [None]:
np.exp(obj2)

TypeError: loop of ufunc does not support argument 0 of type int which has no callable exp method

In [None]:
obj2=obj2.astype(int)

In [None]:
obj2

d    4
b    7
a   -8
c    3
dtype: int64

In [None]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.000335
c      20.085537
dtype: float64

In [None]:
"b" in obj2

True

In [None]:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}


In [None]:
obj3=pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [None]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [None]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [None]:
states=['California','Ohio','Oregon','Texas']
obj4=pd.Series(sdata,index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [None]:
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [None]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [None]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [None]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [None]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [None]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [None]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [None]:
obj4.name="population"
obj4.index.name="state"
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [None]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [None]:
obj.index=['Bob','Steve','Jeff','Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [21]:
data={"state":["Ohio","Ohio","Ohio","Nevada","Nevada","Nevada"],
"year":[2000,2001,2002,2001,2002,2003],
"pop":[1.5,1.7,3.6,2.4,2.9,3.2]
}

In [26]:
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [27]:
frame=pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [28]:
frame.head(n=2)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7


In [29]:
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [30]:
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [31]:
frame2=pd.DataFrame(data,columns=['year','state','pop','debt'])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [32]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [33]:
frame2['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [34]:
frame2.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [None]:
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [None]:
frame2.iloc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [35]:
frame2['debt']=16.5
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [36]:
frame2['debt']=np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [37]:
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [None]:
frame2['debt']

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
Name: debt, dtype: float64

In [38]:
frame2['debt']=val
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [39]:
val2=pd.Series([-1.2, -1.5, -1.7,-1.2,-1.5,-1.7],index=["one","two","three","four","five","six"])
val2

one     -1.2
two     -1.5
three   -1.7
four    -1.2
five    -1.5
six     -1.7
dtype: float64

In [43]:
val3 = pd.Series([-1.2, -1.5, -1.7, -1.2, -1.5, -1.7],
                 index=[0, 1, "three", "four", "five", "six"])
val3


0       -1.2
1       -1.5
three   -1.7
four    -1.2
five    -1.5
six     -1.7
dtype: float64

In [40]:
frame2['debt']=val2
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [44]:
frame2['debt']=val3
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,-1.2,True
1,2001,Ohio,1.7,-1.5,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False
5,2003,Nevada,3.2,,False


In [47]:
frame2['eastern']=frame2['state']=="Ohio"
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,-1.2,True
1,2001,Ohio,1.7,-1.5,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False
5,2003,Nevada,3.2,,False


In [51]:
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,-1.2
1,2001,Ohio,1.7,-1.5
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [52]:
populations={"Ohio":{2000:1.5,2001:1.7,2002:3.6},
"Nevada":{2001:2.4,2002:2.9,2003:3.2}}

In [53]:
populations

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
 'Nevada': {2001: 2.4, 2002: 2.9, 2003: 3.2}}

In [64]:
frame3=pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [None]:
frame3.T

Unnamed: 0,2000,2001,2002,2003
Ohio,1.5,1.7,3.6,
Nevada,,2.4,2.9,3.2


In [None]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [None]:
frame3.dtypes

Ohio      float64
Nevada    float64
dtype: object

In [None]:
frame3.T.T.dtypes

Ohio      float64
Nevada    float64
dtype: object

In [None]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [None]:
pd.DataFrame(populations,index=[2001,2002,2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [None]:
frame3["Ohio"][:-1]

2000    1.5
2001    1.7
2002    3.6
Name: Ohio, dtype: float64

In [None]:
frame3["Nevada"][:2]

2000    NaN
2001    2.4
Name: Nevada, dtype: float64

In [None]:
pdata = pd.DataFrame({'Ohio': frame3["Ohio"][:-1],
'Nevada':frame3["Nevada"][:2]})
pdata


Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,


In [None]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [68]:
frame3.index.name="year"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [66]:
frame3.columns.name="state"

In [None]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [None]:
frame3['Ohio']

year
2000    1.5
2001    1.7
2002    3.6
2003    NaN
Name: Ohio, dtype: float64

In [None]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,-1.2
1,2001,Ohio,1.7,-1.5
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [None]:
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9],
       [nan, 3.2]])

In [None]:
frame3.shape

(4, 2)

In [None]:
x=frame3.to_numpy
x

<bound method DataFrame.to_numpy of state  Ohio  Nevada
year               
2000    1.5     NaN
2001    1.7     2.4
2002    3.6     2.9
2003    NaN     3.2>

In [None]:
x=np.array(x)

In [None]:
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, -1.2],
       [2001, 'Ohio', 1.7, -1.5],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [None]:
y=frame2.values
y

array([[2000, 'Ohio', 1.5, -1.2],
       [2001, 'Ohio', 1.7, -1.5],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [None]:
y[:,1]

array(['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], dtype=object)

In [54]:
frame2.index

RangeIndex(start=0, stop=6, step=1)

### Index Objects

In [56]:
object=pd.Series(np.arange(3),index=["a","b","c"])

In [57]:
index=object.index
index

Index(['a', 'b', 'c'], dtype='object')

In [59]:
index[1:]

Index(['b', 'c'], dtype='object')

In [60]:
index[1]="d"

TypeError: Index does not support mutable operations

In [61]:
labels=pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [62]:
obj2=pd.Series([1.5,-2.5,0],index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [63]:
obj2.index is labels

True

In [69]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9
2003,,3.2


In [70]:
frame3.index

Int64Index([2000, 2001, 2002, 2003], dtype='int64', name='year')

In [71]:
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [72]:
"Ohio" in frame3.columns

True

In [73]:
2000 in frame3.index

True

In [74]:
pd.Index(["foo", "foo", "bar", "bar"])


Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [80]:
a=pd.DataFrame(pd.Series([-1,-2,-3],index=["foo","foo","bar"]),columns=["word"])
a

Unnamed: 0,word
foo,-1
foo,-2
bar,-3


In [85]:
a.loc["foo"]

Unnamed: 0,word
foo,-1
foo,-2


## 5.2: Essential Functionality

### Reindexing

In [86]:
obj=pd.Series([4.5,7.2,-5.3,3.6],index=["d","b","a","c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [87]:
obj.reindex(["a","b","c","d","e"])

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [88]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [89]:
obj2=obj.reindex(["a","b","c","d","e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [90]:
obj3=pd.Series(["blue","purple","yellow"],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [91]:
obj3.reindex(np.arange(6),method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [93]:
frame=pd.DataFrame(np.arange(9).reshape((3,3)),index=["a","c","d"],columns=["Ohio","Texas","California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [94]:
frame2=frame.reindex(index=["a","b","c","d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [95]:
states=["Texas","Utah","California"]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [96]:
frame.reindex(states,axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [98]:
frame.loc[["a","d","c"],["Texas","California"]]

Unnamed: 0,Texas,California
a,1,2
d,7,8
c,4,5


### Dropping Entries from an Axis

In [99]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [100]:
obj.drop("a")

b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [101]:
obj.drop(["a","d"])

b    1.0
c    2.0
e    4.0
dtype: float64

In [104]:
data=pd.DataFrame(np.arange(16).reshape((4,4)),
index=["Ohio","California","Utah","New York"],
columns=["one","two","three","four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
California,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [105]:
data.drop(index=["California","New York"])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Utah,8,9,10,11


In [106]:
data.drop(columns=["two","four"])

Unnamed: 0,one,three
Ohio,0,2
California,4,6
Utah,8,10
New York,12,14


In [107]:
data.drop(["two","four"],axis=1)

Unnamed: 0,one,three
Ohio,0,2
California,4,6
Utah,8,10
New York,12,14


In [108]:
data.drop(["two","four"],axis="columns")

Unnamed: 0,one,three
Ohio,0,2
California,4,6
Utah,8,10
New York,12,14


### Indexing, Selection, and Filtering

In [109]:
obj=pd.Series(np.arange(4),index=["a","b","c","d"])
obj

a    0
b    1
c    2
d    3
dtype: int64

In [110]:
obj["b"]

1

In [111]:
obj[1]

1

In [112]:
obj[2:4]

c    2
d    3
dtype: int64

In [113]:
obj[["b","a","d"]]

b    1
a    0
d    3
dtype: int64

In [114]:
obj[[1,2]]

b    1
c    2
dtype: int64

In [115]:
obj[obj<2]

a    0
b    1
dtype: int64

In [116]:
obj.loc[["b","a","d"]]

b    1
a    0
d    3
dtype: int64

In [117]:
obj1=pd.Series([1,2,3],index=[2,0,1])
obj2=pd.Series([1,2,3],index=["a","b","c"])

In [118]:
obj1

2    1
0    2
1    3
dtype: int64

In [119]:
obj2

a    1
b    2
c    3
dtype: int64

In [120]:
obj1[[0,1,2]]

0    2
1    3
2    1
dtype: int64

In [121]:
obj1.loc[[0,1,2]]

0    2
1    3
2    1
dtype: int64

In [122]:
obj1.iloc[[0,1,2]]

2    1
0    2
1    3
dtype: int64

In [123]:
obj2[[0,1,2]]

a    1
b    2
c    3
dtype: int64

In [124]:
obj2.loc[[0,1,2]]

KeyError: "None of [Int64Index([0, 1, 2], dtype='int64')] are in the [index]"

In [None]:
#loc works with index label
#iloc works with index location

In [125]:
obj1.iloc[[0,1,2]]

2    1
0    2
1    3
dtype: int64

In [127]:
obj1.loc[[0,1,2]]

0    2
1    3
2    1
dtype: int64

In [128]:
obj2.loc["b":"c"] #inclusive of both

b    2
c    3
dtype: int64

In [130]:
obj2[[1]]=7
obj2

a    1
b    7
c    3
dtype: int64

In [131]:
obj2["b":"c"]=9
obj2

a    1
b    9
c    9
dtype: int64

In [3]:
data=pd.DataFrame(np.arange(16).reshape((4,4)),
index=["Ohio","Colorado","Utah","New York"],
columns=["one","two","three","four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [9]:
data.loc['Colorado':'New York',"one":"three"]
# dataframe.loc['start_row':'end_row,'start_col':'end_col'] #inclusive!

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [14]:
data.iloc[1:3,1:3]

Unnamed: 0,two,three
Colorado,5,6
Utah,9,10


In [133]:
data["two"]

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [134]:
data[["three","one"]]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [136]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [142]:
data[2:]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [143]:
data[data["three"]>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [144]:
data["three"]>5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [146]:
data[[1,2]]

KeyError: "None of [Int64Index([1, 2], dtype='int64')] are in the [columns]"

In [147]:
data<5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [148]:
data[data<5]=0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### Selection on Dataframe with loc and iloc

In [149]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [150]:
data["Colorado"]

KeyError: 'Colorado'

In [151]:
data["one"]

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int64

In [152]:
data[["one"]]

Unnamed: 0,one
Ohio,0
Colorado,0
Utah,8
New York,12


In [153]:
data.loc["Colorado"]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [154]:
data.loc[["Colorado"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7


In [155]:
data.loc[["Colorado","New York"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [156]:
data.loc["Colorado","two"]

5

In [159]:
data.loc["Colorado",["two"]]

two    5
Name: Colorado, dtype: int64

In [160]:
data.loc[["Colorado"],["two"]]

Unnamed: 0,two
Colorado,5


In [162]:
data.loc[:,"two"]

Ohio         0
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [163]:
data.loc[:,["two"]]

Unnamed: 0,two
Ohio,0
Colorado,5
Utah,9
New York,13


In [164]:
data.loc["Ohio","two"]

0

In [165]:
data.loc[["Ohio"],"two"]

Ohio    0
Name: two, dtype: int64

In [166]:
data.loc["Ohio",["two"]]

two    0
Name: Ohio, dtype: int64

In [167]:
data.loc[["Ohio"],["two"]]

Unnamed: 0,two
Ohio,0


In [168]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [169]:
data.iloc[[2]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11


In [170]:
data.iloc[2,1]

9

In [173]:
data.iloc[[2],1]

Utah    9
Name: two, dtype: int64

In [174]:
data.iloc[2,[1]]

two    9
Name: Utah, dtype: int64

In [172]:
data.iloc[[2],[1]]

Unnamed: 0,two
Utah,9


In [171]:
data.iloc[[2,1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [175]:
data.iloc[[2,1],]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [176]:
data.iloc[[2,1],:]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,0,5,6,7


In [177]:
data.iloc[[2,1],3]

Utah        11
Colorado     7
Name: four, dtype: int64

In [178]:
data.iloc[[2,1],[3]]

Unnamed: 0,four
Utah,11
Colorado,7


In [179]:
data.iloc[[2,1],[3,1]]

Unnamed: 0,four,two
Utah,11,9
Colorado,7,5


In [180]:
data.loc[:"Utah","two"]

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [182]:
data.loc[:"Utah",["two"]]

Unnamed: 0,two
Ohio,0
Colorado,5
Utah,9


In [184]:
data.loc[:,"two"]

Ohio         0
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [185]:
data.loc[:,["two"]]

Unnamed: 0,two
Ohio,0
Colorado,5
Utah,9
New York,13


In [186]:
data.loc[[:,"two"]]

SyntaxError: invalid syntax (2262762751.py, line 1)

In [187]:
data.loc[[,"two"]]

SyntaxError: invalid syntax (3943716549.py, line 1)

In [191]:
data.iloc[:,[2]]

Unnamed: 0,three
Ohio,0
Colorado,6
Utah,10
New York,14


In [192]:
data.iloc[:,:3]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [193]:
data.three>5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [194]:
data.iloc[:,:3][data.three>5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [195]:
data.loc[data.three>5]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [197]:
data.iloc[data.three>5]

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

#### Integer indexing pitfalls

In [198]:
ser=pd.Series(np.arange(3))

In [199]:
ser

0    0
1    1
2    2
dtype: int64

In [200]:
ser[-1]

KeyError: -1

In [204]:
ser2=ser.copy()
ser2=ser2.astype('float64')
ser2

0    0.0
1    1.0
2    2.0
dtype: float64

In [209]:
ser2.index=["a","b","c"]
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [210]:
ser2[-1]

2.0

In [211]:
ser.iloc[-1]

2

In [None]:
# data selection is always label oriented
# data slicing is index oriented

In [212]:
ser[:2]

0    0
1    1
dtype: int64

#### Pitfalls with chained indexing

In [214]:
data.loc[:,"one"]=1

In [215]:
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [217]:
data.iloc[2]=5
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,1,5,6,7
Utah,5,5,5,5
New York,1,13,14,15


In [218]:
data.loc[data["four"]>5]

Unnamed: 0,one,two,three,four
Colorado,1,5,6,7
New York,1,13,14,15


In [224]:
data[["four"]][data["four"]>5]

Unnamed: 0,four
Colorado,7
New York,15


In [225]:
data.loc[data["four"]>5]=3
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [228]:
data.loc[data.three==5]["three"]=6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.three==5]["three"]=6


In [229]:
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [242]:
data=data.drop("Utah",axis=1)

In [248]:
data.loc["Utah"]=5
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [259]:
data.loc[data.loc[:,"three"]==5,"three"]=6
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colorado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


In [258]:
data.loc[:,"three"]==5

Ohio        False
Colorado    False
Utah         True
New York    False
Name: three, dtype: bool

In [272]:
data.loc[data.three==5,"three"]=6
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0.0,0
Colorado,3,3,3.0,3
Utah,5,5,6.0,5
New York,3,3,3.0,3


In [273]:
data.loc[data.three==6,"three"]

Utah    6.0
Name: three, dtype: float64

In [263]:
data.dtypes

one        int64
two        int64
three    float64
four       int64
dtype: object

In [274]:
data.three["Utah"]

6.0

In [279]:
data.loc[data.loc[:,"three"]==6,["three"]]=9

In [280]:
data

Unnamed: 0,one,two,three,four
Ohio,1,0,0.0,0
Colorado,3,3,3.0,3
Utah,5,5,9.0,5
New York,3,3,3.0,3


### Arithmetic and Data Alignment

In [281]:
s1=pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2=pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=["a","c","e","f","g"])

In [282]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [283]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [284]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [285]:
df1=pd.DataFrame(np.arange(9.).reshape((3,3)),index=["Ohio","Texas","Colorado"],columns=list("bcd"))
df2=pd.DataFrame(np.arange(12.).reshape((4,3)),index=["Utah","Ohio","Texas","Oregon"],columns=list("bde"))

In [286]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [287]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [288]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [289]:
df1=pd.DataFrame({"A":[1,2]})
df2=pd.DataFrame({"B":[3,4]})

In [290]:
df1

Unnamed: 0,A
0,1
1,2


In [291]:
df2

Unnamed: 0,B
0,3
1,4


In [292]:
df1+df2

Unnamed: 0,A,B
0,,
1,,


In [294]:
df1=pd.DataFrame(np.arange(3),index=["a","b","c"],columns=["one"])
df1

Unnamed: 0,one
a,0
b,1
c,2


In [298]:
df2 = pd.DataFrame(np.arange(4,7), index=["d", "e", "f"], columns=["two"])
df2


Unnamed: 0,two
d,4
e,5
f,6


In [299]:
df1+df2

Unnamed: 0,one,two
a,,
b,,
c,,
d,,
e,,
f,,


In [301]:
df1 = pd.DataFrame(np.arange(3), index=["a", "b", "c"], columns=["one"])
df2 = pd.DataFrame(np.arange(4,7), index=["d", "b", "f"], columns=["one"])
df1+df2

Unnamed: 0,one
a,
b,6.0
c,
d,
f,


#### Arithmetic methods with fill values

In [305]:
df1=pd.DataFrame(np.arange(12.).reshape((3,4)),columns=list("abcd"))
df2=pd.DataFrame(np.arange(20.).reshape((4,5)),columns=list("abcde"))

In [306]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [307]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [310]:
df2.loc[1,"b"]=np.NaN
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [311]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [314]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [315]:
df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [316]:
df1.radd(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [317]:
df1.div(1)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [318]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [320]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [319]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [322]:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#### Operations between DataFrame and Series

In [323]:
arr=np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [324]:
arr[0]

array([0., 1., 2., 3.])

In [325]:
arr-arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [326]:
frame=pd.DataFrame(np.arange(12.).reshape((4,3)),columns=list("bde"),index=["Utah","Ohio","Texas","Oregon"])
series=frame.iloc[0]

In [327]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [328]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [329]:
frame-series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [330]:
series2 = pd.Series(np.arange(3), index=["b", "e", "f"])
series2

b    0
e    1
f    2
dtype: int64

In [338]:
frame-series2

Unnamed: 0,b,d,e,f
Utah,0.0,,1.0,
Ohio,3.0,,4.0,
Texas,6.0,,7.0,
Oregon,9.0,,10.0,


In [339]:
frame.add(series2)

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [340]:
series3=frame["d"]
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [344]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [343]:
frame.sub(series3,axis="index")

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function Application and Mapping

In [345]:
frame=pd.DataFrame(np.random.standard_normal((4,3)),columns=list("bde"),index=["Utah","Ohio","Texas","Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,-1.22238,-0.601846,-1.88643
Texas,-1.165027,0.908346,-1.379789
Oregon,-1.45318,0.304028,-0.002463


In [346]:
frame.abs()

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,1.22238,0.601846,1.88643
Texas,1.165027,0.908346,1.379789
Oregon,1.45318,0.304028,0.002463


In [347]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,1.22238,0.601846,1.88643
Texas,1.165027,0.908346,1.379789
Oregon,1.45318,0.304028,0.002463


In [351]:
frame.max()

b    0.584288
d    0.908346
e    0.012434
dtype: float64

In [350]:
frame.max(axis="columns")

Utah      0.584288
Ohio     -0.601846
Texas     0.908346
Oregon    0.304028
dtype: float64

In [352]:
def f1(x):
    return x.max()-x.min()

In [353]:
f1(frame)

b    2.037468
d    1.510191
e    1.898864
dtype: float64

In [354]:
frame.apply(f1)

b    2.037468
d    1.510191
e    1.898864
dtype: float64

In [356]:
frame.apply(f1,axis="columns")

Utah      0.571855
Ohio      1.284584
Texas     2.288134
Oregon    1.757207
dtype: float64

In [359]:
frame

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,-1.22238,-0.601846,-1.88643
Texas,-1.165027,0.908346,-1.379789
Oregon,-1.45318,0.304028,-0.002463


In [358]:
frame.sum()

b   -3.256299
d    0.900821
e   -3.256248
dtype: float64

In [360]:
frame.sum(axis="columns")

Utah      0.887016
Ohio     -3.710656
Texas    -1.636470
Oregon   -1.151615
dtype: float64

In [361]:
def f2(x):
    return pd.Series([x.min(),x.max()],index=["min","max"])

In [362]:
f2(frame)

min    b   -1.453180
d   -0.601846
e   -1.886430
dtyp...
max    b    0.584288
d    0.908346
e    0.012434
dtyp...
dtype: object

In [363]:
frame.apply(f2)

Unnamed: 0,b,d,e
min,-1.45318,-0.601846,-1.88643
max,0.584288,0.908346,0.012434


In [364]:
def add_one(x):
    return x+1

In [365]:
add_one(frame)

Unnamed: 0,b,d,e
Utah,1.584288,1.290294,1.012434
Ohio,-0.22238,0.398154,-0.88643
Texas,-0.165027,1.908346,-0.379789
Oregon,-0.45318,1.304028,0.997537


In [366]:
frame

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,-1.22238,-0.601846,-1.88643
Texas,-1.165027,0.908346,-1.379789
Oregon,-1.45318,0.304028,-0.002463


In [367]:
frame.apply(add_one) 

Unnamed: 0,b,d,e
Utah,1.584288,1.290294,1.012434
Ohio,-0.22238,0.398154,-0.88643
Texas,-0.165027,1.908346,-0.379789
Oregon,-0.45318,1.304028,0.997537


In [368]:
frame.applymap(add_one)

Unnamed: 0,b,d,e
Utah,1.584288,1.290294,1.012434
Ohio,-0.22238,0.398154,-0.88643
Texas,-0.165027,1.908346,-0.379789
Oregon,-0.45318,1.304028,0.997537


In [375]:
def my_format(x):
    return f"{x:.2f}"

In [376]:
frame

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,-1.22238,-0.601846,-1.88643
Texas,-1.165027,0.908346,-1.379789
Oregon,-1.45318,0.304028,-0.002463


In [377]:
my_format(frame)

TypeError: unsupported format string passed to DataFrame.__format__

In [378]:
frame.apply(my_format)

TypeError: unsupported format string passed to Series.__format__

In [379]:
frame

Unnamed: 0,b,d,e
Utah,0.584288,0.290294,0.012434
Ohio,-1.22238,-0.601846,-1.88643
Texas,-1.165027,0.908346,-1.379789
Oregon,-1.45318,0.304028,-0.002463


In [380]:
frame.applymap(my_format)

Unnamed: 0,b,d,e
Utah,0.58,0.29,0.01
Ohio,-1.22,-0.6,-1.89
Texas,-1.17,0.91,-1.38
Oregon,-1.45,0.3,-0.0


In [382]:
frame["e"].map(my_format)

Utah       0.01
Ohio      -1.89
Texas     -1.38
Oregon    -0.00
Name: e, dtype: object

First major difference: DEFINITION

map is defined on Series ONLY
applymap is defined on DataFrames ONLY
apply is defined on BOTH

Third major difference: BEHAVIOR

map is elementwise for Series
applymap is elementwise for DataFrames
apply also works elementwise but is suited to more complex operations and aggregation. The behaviour and return value depends on the function.

Fourth major difference (the most important one): USE CASE

map is meant for mapping values from one domain to another, so is optimised for performance (e.g., df['A'].map({1:'a', 2:'b', 3:'c'}))
applymap is good for elementwise transformations across multiple rows/columns (e.g., df[['A', 'B', 'C']].applymap(str.strip))
apply is for applying any function that cannot be vectorised (e.g., df['sentences'].apply(nltk.sent_tokenize)).

### Sorting and Ranking

In [383]:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [385]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [387]:
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int64

In [388]:
frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    index=["three", "one"],
    columns=["d", "a", "b", "c"])

frame


Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [389]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [391]:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [399]:
pd.DataFrame.sort_index?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m.[0m[0msort_index[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis'[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevel[0m[0;34m:[0m [0;34m'Level | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mascending[0m[0;34m:[0m [0;34m'bool | int | Sequence[bool | int]'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minplace[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkind[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'quicksort'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mna_position[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'last'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort_remaining[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m

In [400]:
frame.sort_index(axis="columns",ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [401]:
obj = pd.Series([4, 7, -3, 2])

obj

0    4
1    7
2   -3
3    2
dtype: int64

In [402]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [403]:
obj.sort_values(ascending=False)

1    7
0    4
3    2
2   -3
dtype: int64

In [404]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [405]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [407]:
obj.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [413]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 2, 1]})

frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,2
3,2,1


In [414]:
frame.sort_values("b")

Unnamed: 0,b,a
2,-3,2
3,2,1
0,4,0
1,7,1


In [415]:
frame.sort_values(["a","b"])

Unnamed: 0,b,a
0,4,0
3,2,1
1,7,1
2,-3,2


In [416]:
obj=pd.Series([7, -5, 7, 4, 2, 0, 4])

In [417]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [418]:
obj.sort_values()

1   -5
5    0
4    2
3    4
6    4
0    7
2    7
dtype: int64

In [419]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [420]:
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [421]:
obj.rank(ascending=False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [422]:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],"c": [-2, 5, 8, -2.5]})


In [423]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [424]:
frame.rank()

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [425]:
frame.rank(axis="columns")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis Indexes with Duplicate Labels

In [426]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])

obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [429]:
obj.index.is_unique

False

In [430]:
obj["a"]

a    0
a    1
dtype: int64

In [431]:
obj["c"]

4

In [432]:
df=pd.DataFrame(np.random.standard_normal((5, 3)),index=["a", "a", "b", "b", "c"])
df

Unnamed: 0,0,1,2
a,0.776815,-1.459511,-0.925699
a,0.704129,0.531786,-0.029929
b,0.031382,-0.549132,1.500923
b,0.59462,1.308208,0.193661
c,-1.416985,0.211253,-0.963458


In [433]:
df.loc["b"]

Unnamed: 0,0,1,2
b,0.031382,-0.549132,1.500923
b,0.59462,1.308208,0.193661


In [434]:
df.loc["c"]

0   -1.416985
1    0.211253
2   -0.963458
Name: c, dtype: float64

In [436]:
df.loc[["c"]]

Unnamed: 0,0,1,2
c,-1.416985,0.211253,-0.963458


## 5.3: Summarizing and Computing Descriptive Statistics

In [437]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],index=list("abcd"),columns=["one","two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [438]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [439]:
df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [440]:
df.sum(axis="index", skipna=False)

one   NaN
two   NaN
dtype: float64

In [442]:
df.sum(axis="columns", skipna=False)


a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [444]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [443]:
df.mean(axis="columns")


a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [445]:
df.max()

one    7.1
two   -1.3
dtype: float64

In [446]:
df.idxmax()

one    b
two    d
dtype: object

In [447]:
df.cumsum()


Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [448]:
df.describe()


Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [449]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [451]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### Correlation and Covariance

In [452]:
pd.read_pickle?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mread_pickle[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfilepath_or_buffer[0m[0;34m:[0m [0;34m'FilePath | ReadPickleBuffer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcompression[0m[0;34m:[0m [0;34m'CompressionOptions'[0m [0;34m=[0m [0;34m'infer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstorage_options[0m[0;34m:[0m [0;34m'StorageOptions'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Load pickled pandas object (or any object) from file.


   Loading pickled data received from untrusted sources can be
   unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

Parameters
----------
filepath_or_buffer : str, path object, or file-like object
    String, path object (implementing ``os.PathLike[str]``), or file-like
    object implementing a binary ``readlines()`` function.

    .. versionchanged:: 1.0.0
       Accept URL. U

In [456]:
os.getcwd()

'/Users/karimbadr/bookclub-py4da/Karim'

In [459]:
os.chdir("..")
os.getcwd()


'/Users/karimbadr/bookclub-py4da'

In [461]:
price = pd.read_pickle("examples/yahoo_price.pkl")

In [463]:
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [464]:
volume = pd.read_pickle("examples/yahoo_volume.pkl")
volume.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400


In [465]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [467]:
returns["MSFT"].corr(returns["IBM"])


0.49976361144151144

In [468]:
returns["MSFT"].cov(returns["IBM"])


8.870655479703546e-05

In [469]:
returns.MSFT.corr(returns.IBM)

0.49976361144151144

In [470]:
returns.corr()


Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [471]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [473]:
returns.corrwith(returns["IBM"])

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [474]:
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

In [475]:
returns.corrwith(volume,axis="columns")

Date
2010-01-04         NaN
2010-01-05    0.737298
2010-01-06    0.017069
2010-01-07    0.507614
2010-01-08   -0.779646
                ...   
2016-10-17   -0.881606
2016-10-18   -0.303369
2016-10-19   -0.970723
2016-10-20   -0.304414
2016-10-21    0.927824
Length: 1714, dtype: float64

In [495]:
a=volume.iloc[[1]]
a

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-05,150476200,6031900,6841400,49749600


In [496]:
b=returns.iloc[[1]]
b

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-05,0.001729,-0.004404,-0.01208,0.000323


In [500]:
returns.iloc[[1]].corrwith(volume.iloc[[1]],axis="columns")


Date
2010-01-05    0.737298
dtype: float64

In [499]:
a.corrwith(b,axis="columns")

Date
2010-01-05    0.737298
dtype: float64

### Unique Values, Value Counts, and Membership

In [501]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [502]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [503]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [504]:
pd.value_counts(obj.to_numpy(), sort=False)


c    3
a    3
d    1
b    2
dtype: int64

In [505]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [506]:
mask=obj.isin(["b","c"])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [507]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [508]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [510]:
unique_vals = pd.Series(["c", "b", "a"])
unique_vals

0    c
1    b
2    a
dtype: object

In [511]:
pd.Index(unique_vals)


Index(['c', 'b', 'a'], dtype='object')

In [512]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [513]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4], "Qu2": [
                    2, 3, 1, 2, 3], "Qu3": [1, 5, 2, 4, 4]})


In [514]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [515]:
data["Qu1"]

0    1
1    3
2    4
3    3
4    4
Name: Qu1, dtype: int64

In [516]:
data["Qu1"].value_counts()

3    2
4    2
1    1
Name: Qu1, dtype: int64

In [517]:
data["Qu1"].value_counts().sort_index()


1    1
3    2
4    2
Name: Qu1, dtype: int64

In [518]:
data.apply(pd.value_counts)


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [519]:
data.apply(pd.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [520]:
data = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [0, 0, 1, 0,0]})


In [521]:
data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [522]:
data.value_counts()


a  b
1  0    2
2  0    2
1  1    1
dtype: int64