# pandas
* https://pandas.pydata.org/docs/
- 10 minutes to pandas: https://pandas.pydata.org/docs/user_guide/10min.html#min

In [2]:
!pip install pandas
!pip install numpy



In [None]:
# cleanup
# NOTE: run this cell by hand, and only when you are finished with the notebook.
# It uninstalls pandas and numpy, so executing it before the import cell that
# follows makes `import numpy` / `import pandas` fail.
!pip uninstall pandas -y
!pip uninstall numpy -y

In [3]:
import numpy as np
import pandas as pd

# from pandas import Series, DataFrame


def print_df(df: pd.DataFrame) -> None:
    """Render ``df`` as an HTML table in the notebook output area.

    Unlike leaving ``df`` as the last expression of a cell, this can be called
    several times within one cell, e.g. to show a frame before and after a
    modification.

    Args:
        df: The DataFrame to display; it is converted with
            ``DataFrame.to_html()`` and shown as rich HTML output.
    """
    # Import from the public ``IPython.display`` module rather than the
    # internal ``IPython.core.display`` one.
    from IPython.display import HTML, display_html

    display_html(HTML(df.to_html()))

# Object creation

In [None]:
# A Series is a one-dimensional labeled array that can hold data of any type.
# The NaN entry marks a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# A DataFrame is a two-dimensional, table-like data structure with labeled rows
# and columns. Start by building its row labels: six consecutive days.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Build a 6x4 DataFrame of standard-normal samples from a NumPy array, using
# the DatetimeIndex `dates` as row labels and the letters A-D as column labels.
# The generator is seeded so that "Restart Kernel -> Run All" always produces
# the same numbers.
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


In [9]:
# Build a DataFrame from a dict of objects: each key becomes a column label and
# each value supplies that column's data. The scalar values ("A", "B", "F") are
# repeated for every row.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data

In [11]:
# DataFrame.head()
# DataFrame.tail()
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701


In [12]:
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


In [13]:
# DataFrame.index
# DataFrame.columns
df.index, df.columns

(DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
                '2013-01-05', '2013-01-06'],
               dtype='datetime64[ns]', freq='D'),
 Index(['A', 'B', 'C', 'D'], dtype='object'))

In [17]:
# DataFrame.to_numpy()
df.to_numpy(), type(df.to_numpy()), df.dtypes

(array([[ 1.53409455,  0.55085286, -0.6396385 ,  0.33903004],
        [-0.24709457, -0.34155224, -1.25417032,  2.15800147],
        [ 0.932847  , -0.72495461, -2.04913643,  0.64267122],
        [-0.57287127,  0.48830562, -0.05130515, -0.63746685],
        [ 0.85720501,  0.19889025, -0.52006958, -0.24870062],
        [ 1.04006442,  0.98276074,  0.55929494,  0.76145063]]),
 numpy.ndarray,
 A    float64
 B    float64
 C    float64
 D    float64
 dtype: object)

In [18]:
# DataFrame.describe()
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.590708,0.192384,-0.659171,0.502498
std,0.8168,0.627281,0.912131,0.971384
min,-0.572871,-0.724955,-2.049136,-0.637467
25%,0.02898,-0.206442,-1.100537,-0.101768
50%,0.895026,0.343598,-0.579854,0.490851
75%,1.01326,0.535216,-0.168496,0.731756
max,1.534095,0.982761,0.559295,2.158001


In [19]:
# transpose
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.534095,-0.247095,0.932847,-0.572871,0.857205,1.040064
B,0.550853,-0.341552,-0.724955,0.488306,0.19889,0.982761
C,-0.639638,-1.25417,-2.049136,-0.051305,-0.52007,0.559295
D,0.33903,2.158001,0.642671,-0.637467,-0.248701,0.761451


In [21]:
# DataFrame.sort_index()
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.33903,-0.639638,0.550853,1.534095
2013-01-02,2.158001,-1.25417,-0.341552,-0.247095
2013-01-03,0.642671,-2.049136,-0.724955,0.932847
2013-01-04,-0.637467,-0.051305,0.488306,-0.572871
2013-01-05,-0.248701,-0.52007,0.19889,0.857205
2013-01-06,0.761451,0.559295,0.982761,1.040064


In [22]:
# DataFrame.sort_values()
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-06,1.040064,0.982761,0.559295,0.761451


# Selection

## Getitem

In [24]:
df["A"], df.A

(2013-01-01    1.534095
 2013-01-02   -0.247095
 2013-01-03    0.932847
 2013-01-04   -0.572871
 2013-01-05    0.857205
 2013-01-06    1.040064
 Freq: D, Name: A, dtype: float64,
 2013-01-01    1.534095
 2013-01-02   -0.247095
 2013-01-03    0.932847
 2013-01-04   -0.572871
 2013-01-05    0.857205
 2013-01-06    1.040064
 Freq: D, Name: A, dtype: float64)

In [25]:
# slice using :
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671


In [27]:
df["2013-01-02":"2013-01-04"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467


## Selection by label

In [29]:
# select a row matching a label
dates[0], df.loc[dates[0]]

(Timestamp('2013-01-01 00:00:00'),
 A    1.534095
 B    0.550853
 C   -0.639638
 D    0.339030
 Name: 2013-01-01 00:00:00, dtype: float64)

In [30]:
# select all rows for the given column labels
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,1.534095,0.550853
2013-01-02,-0.247095,-0.341552
2013-01-03,0.932847,-0.724955
2013-01-04,-0.572871,0.488306
2013-01-05,0.857205,0.19889
2013-01-06,1.040064,0.982761


In [31]:
df.loc["2013-01-02":"2013-01-04", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-0.247095,-0.341552
2013-01-03,0.932847,-0.724955
2013-01-04,-0.572871,0.488306


In [32]:
# select a single row and column
dates[0], df.loc[dates[0], "A"]

(Timestamp('2013-01-01 00:00:00'), np.float64(1.5340945526056085))

In [33]:
df.at[dates[0], "A"]

np.float64(1.5340945526056085)

## Selection by position

In [41]:
# DataFrame.iloc()
# DataFrame.iat()

# 4th row
print_df(df)
df.iloc[3]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


A   -0.572871
B    0.488306
C   -0.051305
D   -0.637467
Name: 2013-01-04 00:00:00, dtype: float64

In [42]:
# integer slice
print_df(df)
df.iloc[3:5, 0:2]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


Unnamed: 0,A,B
2013-01-04,-0.572871,0.488306
2013-01-05,0.857205,0.19889


In [43]:
# list of integer position locations
print_df(df)
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


Unnamed: 0,A,C
2013-01-02,-0.247095,-1.25417
2013-01-03,0.932847,-2.049136
2013-01-05,0.857205,-0.52007


In [44]:
# slicing rows
print_df(df)
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


Unnamed: 0,A,B,C,D
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671


In [45]:
# slicing columns
print_df(df)
df.iloc[:, 1:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


Unnamed: 0,B,C
2013-01-01,0.550853,-0.639638
2013-01-02,-0.341552,-1.25417
2013-01-03,-0.724955,-2.049136
2013-01-04,0.488306,-0.051305
2013-01-05,0.19889,-0.52007
2013-01-06,0.982761,0.559295


In [47]:
# cell value
print_df(df)
df.iloc[1, 1], df.iat[1, 1]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


(np.float64(-0.34155223797193085), np.float64(-0.34155223797193085))

## Boolean indexing

In [None]:
# where df.A > 0
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


In [None]:
# selecting values from DataFrame where boolean condition is met
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,,0.33903
2013-01-02,,,,2.158001
2013-01-03,0.932847,,,0.642671
2013-01-04,,0.488306,,
2013-01-05,0.857205,0.19889,,
2013-01-06,1.040064,0.982761,0.559295,0.761451


In [50]:
# isin(): keep only the rows whose value is one of the listed labels.
# Work on a copy of df that has an extra string column "E".
df2 = df.assign(E=["one", "one", "two", "three", "four", "three"])
print_df(df2)

df2.loc[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-01,1.534095,0.550853,-0.639638,0.33903,one
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,one
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,two
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,three
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,four
2013-01-06,1.040064,0.982761,0.559295,0.761451,three


Unnamed: 0,A,B,C,D,E
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,two
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,four


## Setting

In [None]:
# Show the frame before the change.
print_df(df)
# Add column "F" from a Series. Assignment aligns the data on index labels,
# and this Series starts one day after df's first row.
df["F"] = pd.Series(range(1, 7), index=pd.date_range(start="20130102", periods=6))
print_df(df)

Unnamed: 0,A,B,C,D
2013-01-01,1.534095,0.550853,-0.639638,0.33903
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001
2013-01-03,0.932847,-0.724955,-2.049136,0.642671
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467
2013-01-05,0.857205,0.19889,-0.52007,-0.248701
2013-01-06,1.040064,0.982761,0.559295,0.761451


Unnamed: 0,A,B,C,D,F
2013-01-01,1.534095,0.550853,-0.639638,0.33903,
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,1.0
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,2.0
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,3.0
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,4.0
2013-01-06,1.040064,0.982761,0.559295,0.761451,5.0


In [52]:
# set values by label
print_df(df)
df.at[dates[0], "A"] = 0
print_df(df)

Unnamed: 0,A,B,C,D,F
2013-01-01,1.534095,0.550853,-0.639638,0.33903,
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,1.0
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,2.0
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,3.0
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,4.0
2013-01-06,1.040064,0.982761,0.559295,0.761451,5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.550853,-0.639638,0.33903,
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,1.0
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,2.0
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,3.0
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,4.0
2013-01-06,1.040064,0.982761,0.559295,0.761451,5.0


In [53]:
# set value by position
print_df(df)
df.iat[0, 1] = 0
print_df(df)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.550853,-0.639638,0.33903,
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,1.0
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,2.0
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,3.0
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,4.0
2013-01-06,1.040064,0.982761,0.559295,0.761451,5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.639638,0.33903,
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,1.0
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,2.0
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,3.0
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,4.0
2013-01-06,1.040064,0.982761,0.559295,0.761451,5.0


In [54]:
# Set a whole column by assigning a NumPy array with one value per row.
print_df(df)
df.loc[:, "D"] = np.full(len(df), 5)
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.639638,0.33903,
2013-01-02,-0.247095,-0.341552,-1.25417,2.158001,1.0
2013-01-03,0.932847,-0.724955,-2.049136,0.642671,2.0
2013-01-04,-0.572871,0.488306,-0.051305,-0.637467,3.0
2013-01-05,0.857205,0.19889,-0.52007,-0.248701,4.0
2013-01-06,1.040064,0.982761,0.559295,0.761451,5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.639638,5.0,
2013-01-02,-0.247095,-0.341552,-1.25417,5.0,1.0
2013-01-03,0.932847,-0.724955,-2.049136,5.0,2.0
2013-01-04,-0.572871,0.488306,-0.051305,5.0,3.0
2013-01-05,0.857205,0.19889,-0.52007,5.0,4.0
2013-01-06,1.040064,0.982761,0.559295,5.0,5.0


In [None]:
# Work on a fresh copy of df.
df2 = df.copy()
print_df(df2)
# Setting with a boolean mask ("where"): every cell that is > 0 is replaced by
# its negation.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.639638,5.0,
2013-01-02,-0.247095,-0.341552,-1.25417,5.0,1.0
2013-01-03,0.932847,-0.724955,-2.049136,5.0,2.0
2013-01-04,-0.572871,0.488306,-0.051305,5.0,3.0
2013-01-05,0.857205,0.19889,-0.52007,5.0,4.0
2013-01-06,1.040064,0.982761,0.559295,5.0,5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.639638,-5.0,
2013-01-02,-0.247095,-0.341552,-1.25417,-5.0,-1.0
2013-01-03,-0.932847,-0.724955,-2.049136,-5.0,-2.0
2013-01-04,-0.572871,-0.488306,-0.051305,-5.0,-3.0
2013-01-05,-0.857205,-0.19889,-0.52007,-5.0,-4.0
2013-01-06,-1.040064,-0.982761,-0.559295,-5.0,-5.0


# Missing data

# Operations

# Merge

# Grouping

# Reshaping

# Time series

# Categoricals

# Plotting

# Importing and exporting data
* https://pandas.pydata.org/docs/reference/io.html

## CSV

In [4]:
# Read a CSV file directly from a URL into a DataFrame (needs network access).
WEB_PAGE_DATA_URL = 'https://raw.githubusercontent.com/gedeck/practical-statistics-for-data-scientists/refs/heads/master/data/web_page_data.csv'
data = pd.read_csv(WEB_PAGE_DATA_URL)
data

Unnamed: 0,Page,Time
0,Page A,0.21
1,Page B,2.53
2,Page A,0.35
3,Page B,0.71
4,Page A,0.67
5,Page B,0.85
6,Page A,2.11
7,Page B,2.46
8,Page A,1.32
9,Page B,1.49


# Gotchas