# Introduction to Pandas

NumPy arrays are like Python’s built-in `list` type, but NumPy arrays provide much more efficient storage and data operations as the arrays grow larger in size

In [1]:
import numpy as np
import random
import pandas as pd

In [2]:
index = [
    ("California", 2000),
    ("California", 2010),
    ("New York", 2000),
    ("New York", 2010),
    ("Texas", 2000),
    ("Texas", 2010),
]

populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]

In [3]:
pop = pd.Series(populations, index=index)

In [4]:
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [5]:
index = pd.MultiIndex.from_tuples(index)

In [6]:
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [7]:
pop = pop.reindex(index)

In [8]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [9]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [10]:
pop["California"]

2000    33871648
2010    37253956
dtype: int64

In [11]:
pop_df = pop.unstack()

In [12]:
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [13]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [14]:
pop_df = pd.DataFrame(
    {"total": pop, "under18": [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]}
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [15]:
f_u18 = pop_df["under18"] / pop_df["total"]

In [16]:
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [17]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [18]:
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
    columns=["data1", "data2"],
)

In [19]:
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.353095,0.365637
a,2,0.658287,0.548377
b,1,0.598256,0.676103
b,2,0.835672,0.687217


In [20]:
data = {
    ("California", 2000): 33871648,
    ("California", 2010): 37253956,
    ("Texas", 2000): 20851820,
    ("Texas", 2010): 25145561,
    ("New York", 2000): 18976457,
    ("New York", 2010): 19378102,
}

In [21]:
pd.Series(data)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [22]:
pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [23]:
pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [24]:
pd.MultiIndex.from_product([["a", "b"], [1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [25]:
pd.MultiIndex(levels=[["a", "b"], [1, 2]], labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [26]:
pop.index.names = ["state", "year"]
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [27]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=["year", "visit"])
columns = pd.MultiIndex.from_product(
    [["Bob", "Guido", "Sue"], ["HR", "Temp"]], names=["subject", "type"]
)

In [28]:
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,36.9,36.0,37.1,21.0,36.8
2013,2,47.0,37.2,29.0,38.3,38.0,37.6
2014,1,34.0,35.9,46.0,37.6,40.0,37.6
2014,2,44.0,36.5,49.0,35.0,33.0,35.9


In [29]:
health_data["Guido"]

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,36.0,37.1
2013,2,29.0,38.3
2014,1,46.0,37.6
2014,2,49.0,35.0


In [30]:
health_data.xs((2013))

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,25.0,36.9,36.0,37.1,21.0,36.8
2,47.0,37.2,29.0,38.3,38.0,37.6


In [31]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [32]:
pop["California", 2000]

33871648

In [33]:
pop["California"]

year
2000    33871648
2010    37253956
dtype: int64

In [34]:
pop.loc["California":"New York"]

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [35]:
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [36]:
pop[["California", "Texas"]]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [37]:
pop.loc[["California", "Texas"], 2000]

state       year
California  2000    33871648
Texas       2000    20851820
dtype: int64

In [38]:
pop.loc[:, 2010]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [39]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,36.9,36.0,37.1,21.0,36.8
2013,2,47.0,37.2,29.0,38.3,38.0,37.6
2014,1,34.0,35.9,46.0,37.6,40.0,37.6
2014,2,44.0,36.5,49.0,35.0,33.0,35.9


In [40]:
health_data["Guido", "HR"]

year  visit
2013  1        36.0
      2        29.0
2014  1        46.0
      2        49.0
Name: (Guido, HR), dtype: float64

In [41]:
health_data[2013]

KeyError: 2013

In [None]:
health_data.loc[2013, "Guido"]

In [None]:
health_data.loc[:, "Guido"]

In [None]:
health_data

In [None]:
health_data.loc[:, :]["HR"]

In [None]:
health_data.iloc[:2, :2]

In [None]:
health_data.loc[:, "Guido"]

In [None]:
health_data.loc[:, ("Bob")]

In [None]:
health_data.loc[:, ("Bob", "HR")]

In [None]:
health_data.loc[:, (:, 'HR')]

In [42]:
idx = pd.IndexSlice

In [43]:
health_data.loc[idx[:, 1], idx[:, "HR"]]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,25.0,36.0,21.0
2014,1,34.0,46.0,40.0


In [44]:
health_data.loc[:, idx[:, "HR"]]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,25.0,36.0,21.0
2013,2,47.0,29.0,38.0
2014,1,34.0,46.0,40.0
2014,2,44.0,49.0,33.0


In [45]:
health_data.loc[2013, idx[:, "HR"]]

subject,Bob,Guido,Sue
type,HR,HR,HR
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,25.0,36.0,21.0
2,47.0,29.0,38.0


In [46]:
health_data.loc[idx[:, 1], idx[:, "HR"]]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,25.0,36.0,21.0
2014,1,34.0,46.0,40.0


In [47]:
index = pd.MultiIndex.from_product([["a", "c", "b"], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ["char", "int"]
data

char  int
a     1      0.740913
      2      0.849341
c     1      0.259568
      2      0.691510
b     1      0.821836
      2      0.556731
dtype: float64

In [48]:
data["a"]

int
1    0.740913
2    0.849341
dtype: float64

In [49]:
data["a":"b"]

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [50]:
data = data.sort_index()

In [51]:
data["a":"b"]

char  int
a     1      0.740913
      2      0.849341
b     1      0.821836
      2      0.556731
dtype: float64

In [52]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [53]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [54]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [55]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [56]:
pop_flat = pop.reset_index(name="population")

In [57]:
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [58]:
pop_flat.set_index(["state", "year"])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [59]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,36.9,36.0,37.1,21.0,36.8
2013,2,47.0,37.2,29.0,38.3,38.0,37.6
2014,1,34.0,35.9,46.0,37.6,40.0,37.6
2014,2,44.0,36.5,49.0,35.0,33.0,35.9


In [60]:
data_mean = health_data.mean(level="year")

In [61]:
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,36.0,37.05,32.5,37.7,29.5,37.2
2014,39.0,36.2,47.5,36.3,36.5,36.75


In [62]:
data_mean.mean(axis=1, level="type")

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,32.666667,37.316667
2014,41.0,36.416667


In [63]:
health_data.mean(axis=1, level="subject")

Unnamed: 0_level_0,subject,Bob,Guido,Sue
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,30.95,36.55,28.9
2013,2,42.1,33.65,37.8
2014,1,34.95,41.8,38.8
2014,2,40.25,42.0,34.45


In [64]:
df = pd.DataFrame(
    {"key": ["A", "B", "C", "A", "B", "C"], "data": range(6), "data2": range(12, 18)},
    columns=["key", "data", "data2"],
)

In [65]:
df

Unnamed: 0,key,data,data2
0,A,0,12
1,B,1,13
2,C,2,14
3,A,3,15
4,B,4,16
5,C,5,17


In [66]:
df.groupby("key")

<pandas.core.groupby.DataFrameGroupBy object at 0x00000000096D9C88>

In [67]:
df.groupby("key")["data"]

<pandas.core.groupby.SeriesGroupBy object at 0x00000000096D9F60>

In [68]:
df.groupby("key")["data"].mean()

key
A    1.5
B    2.5
C    3.5
Name: data, dtype: float64

In [69]:
df.groupby("key").mean()

Unnamed: 0_level_0,data,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.5,13.5
B,2.5,14.5
C,3.5,15.5


In [70]:
for (key, group_data) in df.groupby("key"):
    print(key, group_data)

A   key  data  data2
0   A     0     12
3   A     3     15
B   key  data  data2
1   B     1     13
4   B     4     16
C   key  data  data2
2   C     2     14
5   C     5     17


In [71]:
df.groupby("key")["data2"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2.0,13.5,2.12132,12.0,12.75,13.5,14.25,15.0
B,2.0,14.5,2.12132,13.0,13.75,14.5,15.25,16.0
C,2.0,15.5,2.12132,14.0,14.75,15.5,16.25,17.0


In [72]:
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        "key": ["A", "B", "C", "A", "B", "C"],
        "data1": range(6),
        "data2": rng.randint(0, 10, 6),
    },
    columns=["key", "data1", "data2"],
)

In [73]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [74]:
df.groupby("key").aggregate({"data1": "max", "data2": "min"})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,3
B,4,0
C,5,3


In [75]:
df.groupby("key").filter(lambda g: g["data1"].max() > 4)

Unnamed: 0,key,data1,data2
2,C,2,3
5,C,5,9


In [76]:
df.groupby("key").transform(lambda g: g - g.mean())

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [77]:
df.groupby("key").apply(lambda g: g["data2"] * g["data2"])

key   
A    0    25
     3     9
B    1     0
     4    49
C    2     9
     5    81
Name: data2, dtype: int32

In [78]:
L = [0, 1, 0, 1, 2, 0]

In [79]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [80]:
df.groupby(L).sum()

Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [81]:
df2 = df.set_index("key")

In [82]:
df2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9


In [83]:
mapping = {"A": "vowel", "B": "consonant", "C": "consonant"}

In [84]:
df2.groupby(mapping).sum()

Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


In [89]:
di = pd.DataFrame({"data": [2, 5, 7, 1, 2, 5, 23, 46, 23, 6, 13, 4]}, index=range(12))

In [90]:
di

Unnamed: 0,data
0,2
1,5
2,7
3,1
4,2
5,5
6,23
7,46
8,23
9,6


In [120]:
np.where((di.iloc[:, 0] < 3) & (di.iloc[:, 0] > 0))[0].astype(int).tolist()

[0, 3, 4]

In [121]:
df.index.get_loc((di["data"] < 3) & (di["data"] > 0))

TypeError: '0      True
1     False
2     False
3      True
4      True
5     False
6     False
7     False
8     False
9     False
10    False
11    False
Name: data, dtype: bool' is an invalid key

In [107]:
di.bool

<bound method NDFrame.bool of     data
0      2
1      5
2      7
3      1
4      2
5      5
6     23
7     46
8     23
9      6
10    13
11     4>

In [None]:
np.allclose()

### Understanding Data Types in Python

In [67]:
X = np.random.randint(0, 10, (3, 2))

In [68]:
X = np.array([[1, 2], [3, 4], [5, 6]])

In [69]:
dist_sq = np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1)

In [48]:
y = np.arange(3)

In [49]:
y

array([0, 1, 2])

In [53]:
y1 = y[:, np.newaxis]

In [54]:
y2 = y[np.newaxis, :]

In [56]:
y1

array([[0],
       [1],
       [2]])

In [59]:
y2

array([[0, 1, 2]])

In [60]:
y1 - y2

array([[ 0, -1, -2],
       [ 1,  0, -1],
       [ 2,  1,  0]])

In [41]:
X[:, np.newaxis, :].shape

(3, 1, 2)

In [42]:
np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]), axis=-1)

array([[ 0, -4, -8],
       [ 4,  0, -4],
       [ 8,  4,  0]])

In [43]:
X[:, np.newaxis, :]

array([[[1, 2]],

       [[3, 4]],

       [[5, 6]]])

In [44]:
X[np.newaxis, :, :].shape

(1, 3, 2)

In [45]:
X[np.newaxis, :, :]

array([[[1, 2],
        [3, 4],
        [5, 6]]])

In [25]:
(X[:, np.newaxis, :] - X[np.newaxis, :, :]).shape

(3, 3, 2)

In [26]:
X[:, np.newaxis, :] - X[np.newaxis, :, :]

array([[[ 0,  0],
        [ 2, -4],
        [-2, -4]],

       [[-2,  4],
        [ 0,  0],
        [-4,  0]],

       [[ 2,  4],
        [ 4,  0],
        [ 0,  0]]])

## The Basics of NumPy Arrays

### NumPy Array Attributes

In [1]:
import numpy as np

np.random.seed(0)  # seed for reproducibility
x1 = np.random.randint(10, size=6)  # One-dimensional array
x2 = np.random.randint(10, size=(3, 4))  # Two-dimensional array
x3 = np.random.randint(10, size=(3, 4, 5))  # Three-dimensional array

In [2]:
print("x3 ndim: ", x3.ndim)
print("x3 shape:", x3.shape)
print("x3 size: ", x3.size)

x3 ndim:  3
x3 shape: (3, 4, 5)
x3 size:  60


In [3]:
print("dtype:", x3.dtype)

dtype: int32


In [5]:
print("itemsize:", x3.itemsize, "bytes")
print("nbytes:", x3.nbytes, "bytes")  #  itemsize * size

itemsize: 4 bytes
nbytes: 240 bytes


In [1]:
import pandas as pd

nrows, ncols = 10000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))

NameError: name 'np' is not defined

In [None]:
df1.shape, df2.shape, df3.shape, df4.shape

In [None]:
%timeit df1 + df2 + df3 + df4

In [2]:
%timeit pd.eval('df1 + df2 + df3 + df4')

UndefinedVariableError: name 'df1' is not defined

In [4]:
import numpy as np
import pandas as pd

In [7]:
nrows, ncols = 10000, 100

In [8]:
rng = np.random.RandomState(42)

In [9]:
x = pd.DataFrame(rng.rand(nrows, ncols))

In [12]:
%timeit len(x)

1.07 µs ± 12.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [13]:
%timeit x.shape[0]

1.85 µs ± 12.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [17]:
%timeit list(x)

5.77 µs ± 28.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [25]:
s = pd.Series(rng.rand(nrows))

In [27]:
type(s)

pandas.core.series.Series

In [30]:
%timeit len(s)

1.41 µs ± 7.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [31]:
%timeit s.size

819 ns ± 1.83 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
