# Getting Started with pandas

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [4]:
import numpy as np
import matplotlib.pyplot as plt

# Fix the legacy global NumPy seed so random examples are repeatable.
np.random.seed(12345)
# Default size for every matplotlib figure.
plt.rc("figure", figsize=(10, 6))
# Remember the row limit in effect before it is overridden below.
PREVIOUS_MAX_ROWS = pd.get_option("display.max_rows")
# Keep DataFrame/Series reprs compact.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_colwidth", 80)
# Print NumPy floats with 4 decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

## Series

In [10]:
# A Series built from a plain list gets a default integer index.
obj = pd.Series([4, 7, -5, 3])
# Show the Series, its backing array, and its index, separated by blank lines.
print(obj, obj.array, obj.index, sep="\n\n")

0    4
1    7
2   -5
3    3
dtype: int64

<PandasArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

RangeIndex(start=0, stop=4, step=1)


In [11]:
# Attach an explicit string label to each value.
obj2 = pd.Series([4, 7, -5, 3], index=list("dbac"))
print(obj2, obj2.index, sep="\n\n")

d    4
b    7
a   -5
c    3
dtype: int64

Index(['d', 'b', 'a', 'c'], dtype='object')


In [14]:
# Look up a single value by its label.
print(obj2.loc["a"], end="\n\n")
# Label-based assignment changes obj2 in place.
obj2.loc["d"] = 6
# A list of labels selects a sub-Series in the requested order; then show the whole Series.
print(obj2.loc[["c", "a", "d"]], obj2, sep="\n\n")

-5

c    3
a   -5
d    6
dtype: int64

d    6
b    7
a   -5
c    3
dtype: int64


In [16]:
# Boolean filtering, scalar multiplication, and NumPy functions all keep
# each value attached to its index label.
print(obj2[obj2 > 0])
print()
print(obj2 * 2)
# `np` is already imported in the notebook's first cell, so no import is needed here.
# The last expression is rendered as the cell's output.
np.exp(obj2)

d    6
b    7
c    3
dtype: int64

d    12
b    14
a   -10
c     6
dtype: int64


d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [18]:
# Membership tests on a Series look at its index labels, like the keys of a dict.
print("b" in obj2.index)
"e" in obj2.index

True


False

In [19]:
# State -> value mapping; the dict keys become the index labels of the Series.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(sdata)
print(obj3)

# Convert the Series back into a plain dict.
obj3.to_dict()

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [22]:
# Passing an explicit index selects and orders the dict entries by those labels.
# "California" has no entry in sdata, so it becomes NaN; "Utah" is left out.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
# A bare `obj4` in the middle of a cell displays nothing, so print it explicitly.
print(obj4)
print()
# Boolean mask marking the missing entries.
print(pd.isna(obj4))

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
# Missing-value masks: the top-level isna, its inverse notna, and the isna method.
print(pd.isna(obj4), pd.notna(obj4), obj4.isna(), sep="\n")

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [26]:
# Arithmetic aligns the two Series on their index labels; a label present in
# only one operand ("California", "Utah") yields NaN in the sum.
print(obj3, obj4, obj3 + obj4, sep="\n\n")

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [29]:
# Name the index axis and the Series itself; both names appear in the printed output.
obj4.index.name = "state"
obj4.name = "population"
print(obj4)

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [30]:
# Show obj with its current labels.
print(obj)
# Assigning a new list to .index relabels the Series in place; the values are untouched.
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
print(obj)

0    4
1    7
2   -5
3    3
dtype: int64
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


## DataFrame

In [33]:
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

# Full frame, then the first five rows (head) and the last five rows (tail).
print(frame, frame.head(), frame.tail(), sep="\n\n")

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9

    state  year  pop
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [34]:
# Passing columns= fixes the column order of the resulting DataFrame.
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [35]:
# "debt" is not a key of data, so that column is created and filled with NaN.
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
print(frame2, end="\n\n")
frame2.columns

   year   state  pop debt
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN
5  2003  Nevada  3.2  NaN



Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [38]:
# A column can be retrieved as a Series with dict-style or attribute-style access.
print(frame2["state"], frame2.year, sep="\n\n")

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64


In [40]:
# Rows are retrieved by label with loc and by integer position with iloc.
print(frame2.loc[1], frame2.iloc[2], sep="\n\n")

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object


In [42]:
# A scalar is broadcast to every row of the column.
frame2["debt"] = 16.5
print(frame2)
# Assigning an array of six floats replaces the column row by row.
frame2["debt"] = np.arange(6.)
frame2

   year   state  pop  debt
0  2000    Ohio  1.5  16.5
1  2001    Ohio  1.7  16.5
2  2002    Ohio  3.6  16.5
3  2001  Nevada  2.4  16.5
4  2002  Nevada  2.9  16.5
5  2003  Nevada  3.2  16.5


Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [43]:
# Assigning a Series aligns it to the DataFrame's index by label.
# None of val's labels ("two", "four", "five") occur in frame2's index (0-5),
# so no value is matched and the whole "debt" column becomes missing.
# NOTE(review): if partial alignment was meant to be shown, val would need labels
# taken from frame2.index — confirm the intent.
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
frame2["debt"] = val
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [44]:
# Assigning to a column name that does not exist yet creates the column;
# it holds True for rows whose state is "Ohio" and False otherwise.
frame2["eastern"] = frame2["state"] == "Ohio"
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False
5,2003,Nevada,3.2,,False


In [46]:
# del removes the column from frame2 in place, as with a dict key.
del frame2["eastern"]
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [48]:
# Nested dict: outer keys become columns, inner keys become row labels.
# A year missing for one state (2000 for Nevada) is filled with NaN.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}

frame3 = pd.DataFrame(populations)
# Print the frame and its transpose (rows and columns swapped).
print(frame3, frame3.transpose(), sep="\n\n")

      Ohio  Nevada
2000   1.5     NaN
2001   1.7     2.4
2002   3.6     2.9

        2000  2001  2002
Ohio     1.5   1.7   3.6
Nevada   NaN   2.4   2.9


In [49]:
# With an explicit index, rows are produced for exactly those labels;
# 2003 has no data in populations, so its row is empty.
pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [50]:
# Build a frame from a dict of Series slices: all but the last Ohio entry
# and the first two Nevada entries.
pdata = {
    "Ohio": frame3["Ohio"].iloc[:-1],
    "Nevada": frame3["Nevada"].iloc[:2],
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [51]:
# Name the row axis and the column axis; both names are shown when the frame is rendered.
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [52]:
# Return the frame's values as a two-dimensional NumPy array.
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [53]:
# frame2 mixes numeric and string columns, so the resulting array has dtype=object.
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [55]:
# The labels of a Series live in an Index object, which supports slicing.
obj = pd.Series(np.arange(3), index=list("abc"))
index = obj.index
print(index, index[1:], sep="\n")

Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')


## Index

In [56]:
# Build an Index up front and hand it to the Series constructor.
labels = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(labels, obj2, sep="\n")
# Check whether the Series holds the very same Index object that was passed in.
obj2.index is labels

Int64Index([0, 1, 2], dtype='int64')
0    1.5
1   -2.5
2    0.0
dtype: float64


True

In [58]:
# An Index supports membership tests, both for column labels and for row labels.
print(
    frame3,
    frame3.columns,
    "Ohio" in frame3.columns,
    2003 in frame3.index,
    sep="\n",
)

state  Ohio  Nevada
year               
2000    1.5     NaN
2001    1.7     2.4
2002    3.6     2.9
Index(['Ohio', 'Nevada'], dtype='object', name='state')
True
False


In [59]:
# Index: a pandas Index may contain duplicate labels.
pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [60]:
# Series: float values labelled d, b, a, c (in that order), used by the reindex examples.
obj = pd.Series({"d": 4.5, "b": 7.2, "a": -5.3, "c": 3.6})
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [61]:
# reindex builds a new Series ordered by the given labels;
# "e" is absent from obj, so it gets NaN.
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [62]:
# Colors labelled only at positions 0, 2 and 4.
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
# Reindex onto 0..5; method="ffill" fills each new label with the last preceding value.
# (The bare `obj3` statement that used to sit here displayed nothing and was removed.)
obj3.reindex(np.arange(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [63]:
# DataFrame: a 3x3 integer frame with row labels a, c, d.
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=["a", "c", "d"],
                     columns=["Ohio", "Texas", "California"])
# Reindex the rows; "b" does not exist in frame, so its row is all NaN
# (which also turns the values into floats in the rendered result).
# (The bare `frame` statement that used to sit here displayed nothing and was removed.)
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [64]:
# Reindex the columns: "Utah" is not a column of frame, so that column is all missing.
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [65]:
# Alternative spelling: pass the labels positionally and name the axis to reindex.
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [66]:
# loc with a list of row labels and a list of column labels selects and reorders both axes.
frame.loc[["a", "d", "c"], ["California", "Texas"]]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


In [68]:
obj = pd.Series(np.arange(5.), index=list("abcde"))
print(obj, end="\n\n")
# drop returns a new Series without the given label.
new_obj = obj.drop("c")
print(new_obj, end="\n\n")
# The result of this drop is discarded, so the print below still shows all five entries.
# NOTE(review): if the dropped result was meant to be shown, print the return value instead.
obj.drop(["d", "c"])
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64


In [69]:
# 4x4 frame holding 0..15 row by row, used by the drop examples.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [70]:
# Drop rows by label; the result is a new frame (data still has these rows in later cells).
data.drop(index=["Colorado", "Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [71]:
# Drop a column by label.
data.drop(columns=["two"])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [72]:
# axis=1 and axis="columns" both make drop operate on column labels.
# Only the last expression of a cell is rendered, so the result of the first call is discarded.
data.drop("two", axis=1)
data.drop(["two", "four"], axis="columns")

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [77]:
# Ways to select from a Series with string labels.
# Integer positions go through .iloc: using plain integers inside [] on a
# non-integer index relies on a positional fallback that pandas has deprecated.
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
print(obj)
print()
# Single label.
print(obj["b"])
print()
# Single integer position.
print(obj.iloc[1])
print()
# An integer slice inside [] is positional.
print(obj[2:4])
print()
# List of labels, returned in the requested order.
print(obj[["b", "a", "d"]])
print()
# List of integer positions.
print(obj.iloc[[1, 3]])
print()
# Boolean mask.
print(obj[obj < 2])

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

1.0

1.0

c    2.0
d    3.0
dtype: float64

b    1.0
a    0.0
d    3.0
dtype: float64

b    1.0
d    3.0
dtype: float64

a    0.0
b    1.0
dtype: float64


In [78]:
# Label-based selection with loc; the result follows the order of the requested labels.
obj.loc[["b", "a", "d"]]

b    1.0
a    0.0
d    3.0
dtype: float64

In [81]:
# Same values, once with an integer index and once with a string index.
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(obj1)
print()
print(obj2)
print()
# With an integer index, the integers inside [] are matched against the labels.
print(obj1[[0, 1, 2]])
print()
# With a string index, the integers are treated as positions here.
# NOTE(review): pandas 2.1+ deprecates this positional fallback; prefer .iloc.
print(obj2[[0, 1, 2]])

2    1
0    2
1    3
dtype: int64

a    1
b    2
c    3
dtype: int64

0    2
1    3
2    1
dtype: int64

a    1
b    2
c    3
dtype: int64


In [84]:
# iloc always selects by integer position, whatever the index holds.
print(obj1.iloc[[0, 1, 2]], obj2.iloc[[0, 1, 2]], sep="\n")

2    1
0    2
1    3
dtype: int64
a    1
b    2
c    3
dtype: int64


In [85]:
# A label slice with loc includes its end label ("c" is part of the result).
print(obj2.loc["b":"c"])

b    2
c    3
dtype: int64


In [86]:
# Assigning through a label slice sets every selected element of obj2 in place.
obj2.loc["b":"c"] = 5
obj2

a    1
b    5
c    5
dtype: int64

In [87]:
# Rebuild the 4x4 example frame (0..15 row by row) for the selection examples.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
# A single column name returns that column as a Series. It is not the last
# expression of the cell, so its result is not rendered.
# (The bare `data` statement that used to sit here displayed nothing and was removed.)
data["two"]
# A list of column names returns a DataFrame with those columns in the given order.
data[["three", "one"]]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [88]:
# A slice inside [] selects rows; this result is discarded because it is not
# the last expression of the cell.
data[:2]
# A boolean Series inside [] keeps the rows where it is True.
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [89]:
# Elementwise comparison yields a boolean DataFrame with the same labels.
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [90]:
# Use the boolean DataFrame as a mask: every cell below 5 is set to 0 in place.
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [91]:
# Select a single row by label; the row comes back as a Series indexed by column name.
# (The bare `data` statement that used to precede this displayed nothing and was removed.)
data.loc["Colorado"]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [92]:
# A list of row labels selects several rows and returns a DataFrame.
data.loc[["Colorado", "New York"]]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
New York,12,13,14,15


In [93]:
# One row label plus a list of column labels returns a Series for that row.
data.loc["Colorado", ["two", "three"]]

two      5
three    6
Name: Colorado, dtype: int64

In [94]:
# Positional selection with iloc: one row, a list of rows, one row with chosen
# columns, and a rows-by-columns block.
print(
    data.iloc[2],
    data.iloc[[2, 1]],
    data.iloc[2, [3, 0, 1]],
    data.iloc[[1, 2], [3, 0, 1]],
    sep="\n\n",
)

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

          one  two  three  four
Utah        8    9     10    11
Colorado    0    5      6     7

four    11
one      8
two      9
Name: Utah, dtype: int64

          four  one  two
Colorado     7    0    5
Utah        11    8    9


In [97]:
# First: a label slice on the rows (up to and including "Utah") for column "two".
# Second: the first three columns, filtered to rows where column "three" exceeds 5.
print(
    data.loc[:"Utah", "two"],
    data.iloc[:, :3][data.three > 5],
    sep="\n",
)

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
          one  two  three
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14
