In [1]:
import pandas as pd
import numpy as np

In [2]:
index = pd.date_range("1/1/2000", periods=8)
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [3]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    0.226653
b    0.536675
c    1.472243
d   -1.098187
e    0.333052
dtype: float64

In [4]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])
df

Unnamed: 0,A,B,C
2000-01-01,0.12108,0.678015,-0.946181
2000-01-02,-0.011672,0.097127,0.771051
2000-01-03,-1.329823,-0.866895,-1.810622
2000-01-04,1.397277,0.048422,1.327762
2000-01-05,1.99328,-1.455339,1.473374
2000-01-06,0.878855,0.5326,-1.753983
2000-01-07,2.483546,-0.568352,-0.304539
2000-01-08,0.691885,0.046702,0.886395


In [5]:
long_series = pd.Series(np.random.randn(1000))

long_series.head()

0    1.289646
1   -1.068033
2   -0.277597
3    0.345562
4    0.223636
dtype: float64

In [6]:
long_series.tail(3)

997   -0.150359
998    1.369789
999    0.257525
dtype: float64

In [7]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.12108,0.678015,-0.946181
2000-01-02,-0.011672,0.097127,0.771051


In [8]:
df.columns = [x.lower() for x in df.columns]
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [9]:
df

Unnamed: 0,a,b,c
2000-01-01,0.12108,0.678015,-0.946181
2000-01-02,-0.011672,0.097127,0.771051
2000-01-03,-1.329823,-0.866895,-1.810622
2000-01-04,1.397277,0.048422,1.327762
2000-01-05,1.99328,-1.455339,1.473374
2000-01-06,0.878855,0.5326,-1.753983
2000-01-07,2.483546,-0.568352,-0.304539
2000-01-08,0.691885,0.046702,0.886395


In [10]:
s

a    0.226653
b    0.536675
c    1.472243
d   -1.098187
e    0.333052
dtype: float64

In [11]:
s.array

<NumpyExtensionArray>
[0.22665273101563216,  0.5366748785428379,  1.4722429958286873,
 -1.0981873168171383, 0.33305235062824184]
Length: 5, dtype: float64

In [12]:
s.index.array

<NumpyExtensionArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [13]:
s.to_numpy()

array([ 0.22665273,  0.53667488,  1.472243  , -1.09818732,  0.33305235])

In [14]:
np.asarray(s)

array([ 0.22665273,  0.53667488,  1.472243  , -1.09818732,  0.33305235])

In [15]:
ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))
ser


0   2000-01-01 00:00:00+01:00
1   2000-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [16]:
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object)

In [17]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [18]:
df.to_numpy()

array([[ 0.12108027,  0.67801528, -0.94618061],
       [-0.01167243,  0.09712694,  0.77105124],
       [-1.32982273, -0.86689492, -1.81062194],
       [ 1.39727723,  0.04842226,  1.32776197],
       [ 1.99328019, -1.4553394 ,  1.47337378],
       [ 0.87885505,  0.53260025, -1.75398331],
       [ 2.48354577, -0.56835242, -0.30453878],
       [ 0.69188511,  0.04670164,  0.88639474]])

In [19]:
pd.set_option("compute.use_bottleneck", True)
pd.set_option("compute.use_numexpr", True)

In [20]:
df = pd.DataFrame(

    {

        "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]),

        "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]),

        "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]),

    }

)



df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [21]:
row = df.iloc[1]
row

one      1.393615
two      1.504289
three   -0.084983
Name: b, dtype: float64

In [22]:
column = df["two"]
column

a   -0.698858
b    1.504289
c    0.993717
d   -0.314475
Name: two, dtype: float64

In [23]:
df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,-1.051306,-2.203147,
b,0.0,0.0,0.0
c,-1.294851,-0.510571,-0.171045
d,,-1.818764,-1.609817


In [24]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-1.051306,-2.203147,
b,0.0,0.0,0.0
c,-1.294851,-0.510571,-0.171045
d,,-1.818764,-1.609817


In [25]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,1.041167,0.0,
b,-0.110674,0.0,-1.589272
c,-0.894953,0.0,-1.249745
d,,0.0,-1.380325


In [26]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,1.041167,0.0,
b,-0.110674,0.0,-1.589272
c,-0.894953,0.0,-1.249745
d,,0.0,-1.380325


In [27]:
# Furthermore, you can align a level of a MultiIndexed DataFrame with a Series.
dfmi = df.copy()

dfmi.index = pd.MultiIndex.from_tuples(

    [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"]

)

In [28]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.342308,-0.698858,
1,b,1.393615,1.504289,-0.084983
1,c,0.098764,0.993717,-0.256028
2,a,,-0.314475,-1.6948


In [29]:
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.041167,0.0,
1,b,-0.110674,0.0,-1.589272
1,c,-0.894953,0.0,-1.249745
2,a,,0.384384,-0.995942


In [30]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [31]:
div, rem = divmod(s, 3)
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [32]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [33]:
idx = pd.Index(np.arange(10))
idx

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')

In [34]:
div, rem = divmod(idx, 3)
div

Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int32')

In [35]:
rem

Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int32')

In [36]:
div, rem = divmod(s, [2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int32

In [37]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int32

In [38]:
# Operations with missing data
df2 = df.copy()
df2

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [39]:
df2.loc["a", "three"] = 1.0
df2

Unnamed: 0,one,two,three
a,0.342308,-0.698858,1.0
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [40]:
df + df2

Unnamed: 0,one,two,three
a,0.684617,-1.397717,
b,2.78723,3.008578,-0.169966
c,0.197528,1.987435,-0.512056
d,,-0.62895,-3.3896


In [41]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,0.684617,-1.397717,1.0
b,2.78723,3.008578,-0.169966
c,0.197528,1.987435,-0.512056
d,,-0.62895,-3.3896


In [42]:
df2.add(df, fill_value=0)

Unnamed: 0,one,two,three
a,0.684617,-1.397717,1.0
b,2.78723,3.008578,-0.169966
c,0.197528,1.987435,-0.512056
d,,-0.62895,-3.3896


In [43]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [44]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [48]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [45]:
df > 0

Unnamed: 0,one,two,three
a,True,False,False
b,True,True,False
c,True,True,False
d,False,False,False


In [46]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [47]:
(df > 0).any()

one       True
two       True
three    False
dtype: bool

In [49]:
(df > 0).any().any()

True

In [50]:
df.empty

False

In [51]:
pd.DataFrame(columns=list("ABC")).empty

True

In [52]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [53]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [54]:
np.nan == np.nan

False

In [55]:
(df + df).equals(df * 2)

True

In [56]:
df1 = pd.DataFrame({"col": ["foo", 0, np.nan]})

df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0])

In [57]:
df1

Unnamed: 0,col
0,foo
1,0
2,


In [58]:
df2

Unnamed: 0,col
2,
1,0
0,foo


In [59]:
df1.equals(df2)

False

In [61]:
df2.sort_index()

Unnamed: 0,col
0,foo
1,0
2,


In [60]:
df1.equals(df2.sort_index())

True

In [62]:
pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [64]:
type(pd.Series(["foo", "bar", "baz"]) == "foo")

pandas.core.series.Series

In [63]:
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [65]:
type(pd.Index(["foo", "bar", "baz"]) == "foo")

numpy.ndarray

In [66]:
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [67]:
pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [68]:
df1 = pd.DataFrame(

    {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}

)



df2 = pd.DataFrame(

    {

        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],

        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],

    }

)

In [69]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [70]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [71]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [72]:
def combiner(x, y):
    """Element-wise choice between ``x`` and ``y``, preferring ``x``.

    Wherever ``x`` is missing (as reported by ``pd.isna``) the value from
    ``y`` is taken; everywhere else the value from ``x`` is kept. The result
    is whatever ``np.where`` returns for the given inputs.
    """
    x_is_missing = pd.isna(x)
    return np.where(x_is_missing, y, x)

In [73]:
df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [74]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [75]:
df.mean(0)

one      0.611562
two      0.371168
three   -0.678604
dtype: float64

In [76]:
df.mean(1)

a   -0.178275
b    0.937640
c    0.278818
d   -1.004638
dtype: float64

In [77]:
df.sum(0, skipna=False)

one           NaN
two      1.484673
three         NaN
dtype: float64

In [78]:
df.sum(axis=1, skipna=True)

a   -0.356550
b    2.812921
c    0.836454
d   -2.009275
dtype: float64

In [81]:
df.mean()

one      0.611562
two      0.371168
three   -0.678604
dtype: float64

In [83]:
df.mean(1)

a   -0.178275
b    0.937640
c    0.278818
d   -1.004638
dtype: float64

In [82]:
df.std()

one      0.688137
two      1.046676
three    0.884198
dtype: float64

In [79]:
ts_stand = (df - df.mean()) / df.std()

ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [85]:
ts_stand

Unnamed: 0,one,two,three
a,-0.391279,-1.022309,
b,1.136477,1.082589,0.671366
c,-0.745198,0.594787,0.47792
d,,-0.655067,-1.149287


In [80]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [87]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [86]:
df.cumsum()

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.735923,0.80543,-0.084983
c,1.834687,1.799148,-0.341011
d,,1.484673,-2.035811


In [88]:
np.mean(df["one"])

0.6115624781508879

In [89]:
df["one"].mean()

0.6115624781508879

In [91]:
np.mean(df["one"].to_numpy())

nan

In [92]:
series = pd.Series(np.random.randn(500))

series[20:500] = np.nan

series[10:20] = 5

series.nunique()

11

In [93]:
series = pd.Series(np.random.randn(1000))

series[::2] = np.nan

series.describe()

count    500.000000
mean       0.002191
std        1.023972
min       -2.545618
25%       -0.703997
50%       -0.021504
75%        0.663194
max        3.580700
dtype: float64

In [95]:
frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"])

frame.iloc[::2] = np.nan

frame

Unnamed: 0,a,b,c,d,e
0,,,,,
1,-0.640765,0.903101,0.938694,0.086676,0.214302
2,,,,,
3,-0.543897,-2.615456,1.157518,1.095664,-0.494296
4,,,,,
...,...,...,...,...,...
995,-1.373696,-0.768637,0.824005,-1.154017,-1.310304
996,,,,,
997,0.358162,0.550522,-0.452393,0.680302,-0.313340
998,,,,,


In [96]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.010974,0.021146,-0.011155,0.069428,-0.033026
std,1.004994,0.988079,0.99635,1.005681,0.95165
min,-2.858664,-2.615456,-2.861155,-3.002902,-3.255303
25%,-0.692149,-0.632836,-0.701961,-0.580114,-0.650566
50%,0.019151,0.019034,-0.053929,0.045881,-0.07688
75%,0.719754,0.69763,0.651049,0.723683,0.618942
max,2.742931,2.545021,3.652229,3.29484,3.373784


In [97]:
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])

count    500.000000
mean       0.002191
std        1.023972
min       -2.545618
5%        -1.703468
25%       -0.703997
50%       -0.021504
75%        0.663194
95%        1.755931
max        3.580700
dtype: float64

In [100]:
s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"])
s

0      a
1      a
2      b
3      b
4      a
5      a
6    NaN
7      c
8      d
9      a
dtype: object

In [101]:
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [102]:
frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})
frame

Unnamed: 0,a,b
0,Yes,0
1,Yes,1
2,No,2
3,No,3


In [103]:
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [104]:
frame.describe(include=["object"])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [105]:
frame.describe(include=["number"])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [106]:
frame.describe(include="all")

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [108]:
s1 = pd.Series(np.random.randn(5))
s1

0    0.416804
1    0.599824
2    1.061644
3    0.475948
4    0.219582
dtype: float64

In [109]:
s1.idxmin(), s1.idxmax()

(4, 2)

In [110]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])

df1

Unnamed: 0,A,B,C
0,-0.963447,-0.353149,-0.749629
1,-1.223804,1.442604,-0.526273
2,-0.138254,0.559683,0.190601
3,-2.039707,-0.957862,0.248311
4,0.400016,-0.040743,0.654431


In [111]:
df1.idxmin(axis=0)

A    3
B    3
C    0
dtype: int64

In [113]:
df1.idxmax(axis=1)

0    B
1    B
2    B
3    C
4    C
dtype: object

In [114]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=["A"], index=list("edcba"))

df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [115]:
df3["A"].idxmin()

'd'

In [116]:
data = np.random.randint(0, 7, size=50)

data

array([6, 5, 1, 1, 3, 6, 4, 4, 2, 5, 2, 5, 1, 4, 3, 0, 5, 0, 1, 3, 1, 1,
       1, 2, 6, 5, 5, 3, 5, 3, 1, 1, 2, 1, 2, 2, 5, 3, 5, 4, 1, 1, 4, 5,
       5, 0, 5, 3, 0, 4])

In [117]:
s = pd.Series(data)

s.value_counts()

5    12
1    12
3     7
4     6
2     6
0     4
6     3
Name: count, dtype: int64

In [118]:
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}

frame = pd.DataFrame(data)

frame

Unnamed: 0,a,b
0,1,x
1,2,x
2,3,y
3,4,y


In [119]:
frame.value_counts()

a  b
1  x    1
2  x    1
3  y    1
4  y    1
Name: count, dtype: int64

In [120]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

s5.mode()

0    3
1    7
dtype: int64

In [126]:
df5 = pd.DataFrame(

    {

        "A": np.random.randint(0, 7, size=50),

        "B": np.random.randint(-10, 15, size=50),

    }

)

df5.head()

Unnamed: 0,A,B
0,3,9
1,4,-8
2,6,8
3,2,5
4,2,11


In [127]:
df5.mode()

Unnamed: 0,A,B
0,4,4


In [128]:
arr = np.random.randn(20)

arr

array([ 1.08398043, -0.12091578, -0.34865767, -0.7361803 , -0.75451588,
       -0.51104198,  2.13189682,  0.48958342,  1.02261704, -2.2568324 ,
        0.75001553, -0.82640328,  0.22246626, -0.71057076, -0.73159076,
        1.06369519,  1.91975033,  1.3028142 , -1.34650138, -0.99014711])

In [142]:
factor = pd.cut(arr, 4)
factor

[(1.035, 2.132], (-1.16, -0.0625], (-1.16, -0.0625], (-1.16, -0.0625], (-1.16, -0.0625], ..., (1.035, 2.132], (1.035, 2.132], (1.035, 2.132], (-2.261, -1.16], (-1.16, -0.0625]]
Length: 20
Categories (4, interval[float64, right]): [(-2.261, -1.16] < (-1.16, -0.0625] < (-0.0625, 1.035] < (1.035, 2.132]]

In [143]:
factor[0]

Interval(1.035, 2.132, closed='right')

In [131]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])

factor

[(1, 5], (-1, 0], (-1, 0], (-1, 0], (-1, 0], ..., (1, 5], (1, 5], (1, 5], (-5, -1], (-1, 0]]
Length: 20
Categories (4, interval[int64, right]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [144]:
arr = np.random.randn(30)

factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1])

factor

[(0.633, 2.087], (0.0851, 0.633], (0.633, 2.087], (0.633, 2.087], (-0.738, 0.0851], ..., (-2.538, -0.738], (-2.538, -0.738], (0.0851, 0.633], (0.633, 2.087], (-0.738, 0.0851]]
Length: 30
Categories (4, interval[float64, right]): [(-2.538, -0.738] < (-0.738, 0.0851] < (0.0851, 0.633] < (0.633, 2.087]]

In [145]:
arr = np.random.randn(20)

factor = pd.cut(arr, [-np.inf, 0, np.inf])

factor

[(0.0, inf], (0.0, inf], (-inf, 0.0], (-inf, 0.0], (-inf, 0.0], ..., (-inf, 0.0], (-inf, 0.0], (-inf, 0.0], (0.0, inf], (-inf, 0.0]]
Length: 20
Categories (2, interval[float64, right]): [(-inf, 0.0] < (0.0, inf]]

In [146]:
def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column

    Splits every ``city_and_code`` value on "," and keeps the first piece.
    The new ``city_name`` column is assigned onto ``df`` itself, and that
    same object is returned.
    """
    pieces = df["city_and_code"].str.split(",")
    df["city_name"] = pieces.str.get(0)
    return df



def add_country_name(df, country_name=None):
    """
    Chicago -> ChicagoUS for city_and_country column

    Appends ``country_name`` directly (no separator is inserted) to every
    value of the ``city_name`` column and stores the result in a new
    ``city_and_country`` column. The column is assigned onto ``df`` itself,
    and that same object is returned.

    Parameters
    ----------
    df : DataFrame with a ``city_name`` column.
    country_name : value appended to each city name, e.g. ``"US"``.

    Raises
    ------
    TypeError
        If ``country_name`` is left at its ``None`` default: there is nothing
        to append, so fail up front with an actionable message instead of an
        opaque error from inside the string concatenation.
    """
    if country_name is None:
        raise TypeError(
            "country_name must be provided, e.g. add_country_name(df, country_name='US')"
        )

    col = "city_name"
    df["city_and_country"] = df[col] + country_name
    return df



df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [147]:
add_country_name(extract_city_name(df_p), country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [148]:
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [155]:
from pathlib import Path

import statsmodels.formula.api as sm

# Locate the CSV relative to the current user's home directory instead of
# hardcoding one machine's absolute path (which also exposed a username).
# Adjust this single constant if the file lives somewhere else.
BASEBALL_CSV = Path.home() / "Downloads" / "baseball.csv"

# The "id" column of the CSV becomes the row index.
bb = pd.read_csv(BASEBALL_CSV, index_col="id")

In [156]:
(

    bb.query("h > 0")

    .assign(ln_h=lambda df: np.log(df.h))

    .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)")

    .fit()

    .summary()

)

0,1,2,3
Dep. Variable:,hr,R-squared:,0.685
Model:,OLS,Adj. R-squared:,0.665
Method:,Least Squares,F-statistic:,34.28
Date:,"Thu, 17 Oct 2024",Prob (F-statistic):,3.48e-15
Time:,19:50:47,Log-Likelihood:,-205.92
No. Observations:,68,AIC:,421.8
Df Residuals:,63,BIC:,432.9
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8484.7720,4664.146,-1.819,0.074,-1.78e+04,835.780
C(lg)[T.NL],-2.2736,1.325,-1.716,0.091,-4.922,0.375
ln_h,-1.3542,0.875,-1.547,0.127,-3.103,0.395
year,4.2277,2.324,1.819,0.074,-0.417,8.872
g,0.1841,0.029,6.258,0.000,0.125,0.243

0,1,2,3
Omnibus:,10.875,Durbin-Watson:,1.999
Prob(Omnibus):,0.004,Jarque-Bera (JB):,17.298
Skew:,0.537,Prob(JB):,0.000175
Kurtosis:,5.225,Cond. No.,14900000.0


In [158]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [157]:
df.apply(lambda x: np.mean(x))

one      0.611562
two      0.371168
three   -0.678604
dtype: float64

In [159]:
df.apply(lambda x: np.mean(x), axis=1)

a   -0.178275
b    0.937640
c    0.278818
d   -1.004638
dtype: float64

In [160]:
df.apply(lambda x: x.max() - x.min())

one      1.294851
two      2.203147
three    1.609817
dtype: float64

In [161]:
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.735923,0.80543,-0.084983
c,1.834687,1.799148,-0.341011
d,,1.484673,-2.035811


In [162]:
df.apply(np.exp)

Unnamed: 0,one,two,three
a,1.408195,0.497152,
b,4.029389,4.500952,0.918528
c,1.103806,2.701258,0.77412
d,,0.730172,0.183636


In [163]:
df.apply("mean")

one      0.611562
two      0.371168
three   -0.678604
dtype: float64

In [164]:
df.apply("mean", axis=1)

a   -0.178275
b    0.937640
c    0.278818
d   -1.004638
dtype: float64

In [165]:
tsdf = pd.DataFrame(

    np.random.randn(1000, 3),

    columns=["A", "B", "C"],

    index=pd.date_range("1/1/2000", periods=1000),

)



tsdf.apply(lambda x: x.idxmax())

A   2001-05-14
B   2002-07-30
C   2000-09-30
dtype: datetime64[ns]

In [166]:
def subtract_and_divide(x, sub, divide=1):
    """Subtract ``sub`` from ``x``, then divide the difference by ``divide``.

    ``divide`` defaults to 1, which leaves the difference unscaled.
    """
    difference = x - sub
    return difference / divide



df_udf = pd.DataFrame(np.ones((2, 2)))

df_udf.apply(subtract_and_divide, args=(5,), divide=3)

Unnamed: 0,0,1
0,-1.333333,-1.333333
1,-1.333333,-1.333333


In [167]:
tsdf = pd.DataFrame(

    np.random.randn(10, 3),

    columns=["A", "B", "C"],

    index=pd.date_range("1/1/2000", periods=10),

)



tsdf.iloc[3:7] = np.nan

In [168]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.66666,0.2246,-2.100259
2000-01-02,1.376065,-1.065364,-1.939103
2000-01-03,0.069356,-0.260523,0.073013
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,-1.190028,-0.936135,0.653488
2000-01-09,0.004559,0.521101,1.135044
2000-01-10,-0.366584,0.192615,-0.625746


In [169]:
tsdf.apply(pd.Series.interpolate)

Unnamed: 0,A,B,C
2000-01-01,-0.66666,0.2246,-2.100259
2000-01-02,1.376065,-1.065364,-1.939103
2000-01-03,0.069356,-0.260523,0.073013
2000-01-04,-0.182521,-0.395646,0.189108
2000-01-05,-0.434398,-0.530768,0.305203
2000-01-06,-0.686275,-0.66589,0.421298
2000-01-07,-0.938152,-0.801012,0.537393
2000-01-08,-1.190028,-0.936135,0.653488
2000-01-09,0.004559,0.521101,1.135044
2000-01-10,-0.366584,0.192615,-0.625746


In [170]:
tsdf.agg(lambda x: np.sum(x))

A   -0.773293
B   -1.323706
C   -2.803564
dtype: float64

In [171]:
tsdf.agg("sum")

A   -0.773293
B   -1.323706
C   -2.803564
dtype: float64

In [172]:
# these are equivalent to a ``.sum()`` because we are aggregating
# on a single function

tsdf.sum()

A   -0.773293
B   -1.323706
C   -2.803564
dtype: float64

In [173]:
tsdf["A"].agg("sum")

-0.773292931368611

In [174]:
tsdf.agg(["sum", "mean"])

Unnamed: 0,A,B,C
sum,-0.773293,-1.323706,-2.803564
mean,-0.128882,-0.220618,-0.467261


In [175]:
tsdf["A"].agg(["sum", "mean"])

sum    -0.773293
mean   -0.128882
Name: A, dtype: float64

In [176]:
tsdf["A"].agg(["sum", lambda x: x.mean()])

sum        -0.773293
<lambda>   -0.128882
Name: A, dtype: float64

In [177]:
def mymean(x):
    """Return ``x.mean()``.

    A named wrapper around the mean, used in an aggregation list so the
    resulting row is labelled ``mymean``.
    """
    average = x.mean()
    return average



tsdf["A"].agg(["sum", mymean])

sum      -0.773293
mymean   -0.128882
Name: A, dtype: float64

In [178]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.66666,0.2246,-2.100259
2000-01-02,1.376065,-1.065364,-1.939103
2000-01-03,0.069356,-0.260523,0.073013
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,-1.190028,-0.936135,0.653488
2000-01-09,0.004559,0.521101,1.135044
2000-01-10,-0.366584,0.192615,-0.625746


In [179]:
tsdf.agg({"A": "mean", "B": "sum"})

A   -0.128882
B   -1.323706
dtype: float64

In [180]:
tsdf.agg({"A": ["mean", "min"], "B": "sum"})

Unnamed: 0,A,B
mean,-0.128882,
min,-1.190028,
sum,,-1.323706


In [181]:
from functools import partial

q_25 = partial(pd.Series.quantile, q=0.25)

q_25.__name__ = "25%"

q_75 = partial(pd.Series.quantile, q=0.75)

q_75.__name__ = "75%"

tsdf.agg(["count", "mean", "std", "min", q_25, "median", q_75, "max"])

Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,-0.128882,-0.220618,-0.467261
std,0.871058,0.655176,1.339662
min,-1.190028,-1.065364,-2.100259
25%,-0.591641,-0.767232,-1.610764
median,-0.181013,-0.033954,-0.276367
75%,0.053157,0.216604,0.508369
max,1.376065,0.521101,1.135044


In [182]:
tsdf = pd.DataFrame(

    np.random.randn(10, 3),

    columns=["A", "B", "C"],

    index=pd.date_range("1/1/2000", periods=10),

)



tsdf.iloc[3:7] = np.nan

tsdf

Unnamed: 0,A,B,C
2000-01-01,-1.095191,0.237926,-0.752956
2000-01-02,0.196347,-0.239966,-0.431888
2000-01-03,-0.152079,0.74823,-0.180869
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,-1.473149,0.174117,-0.969409
2000-01-09,0.617741,-1.414483,-0.047319
2000-01-10,-1.282051,-0.086657,0.7468


In [183]:
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2000-01-01,1.095191,0.237926,0.752956
2000-01-02,0.196347,0.239966,0.431888
2000-01-03,0.152079,0.74823,0.180869
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.473149,0.174117,0.969409
2000-01-09,0.617741,1.414483,0.047319
2000-01-10,1.282051,0.086657,0.7468


In [184]:
tsdf.transform("abs")

Unnamed: 0,A,B,C
2000-01-01,1.095191,0.237926,0.752956
2000-01-02,0.196347,0.239966,0.431888
2000-01-03,0.152079,0.74823,0.180869
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.473149,0.174117,0.969409
2000-01-09,0.617741,1.414483,0.047319
2000-01-10,1.282051,0.086657,0.7468


In [185]:
tsdf.transform(lambda x: x.abs())

Unnamed: 0,A,B,C
2000-01-01,1.095191,0.237926,0.752956
2000-01-02,0.196347,0.239966,0.431888
2000-01-03,0.152079,0.74823,0.180869
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.473149,0.174117,0.969409
2000-01-09,0.617741,1.414483,0.047319
2000-01-10,1.282051,0.086657,0.7468


In [186]:
np.abs(tsdf)

Unnamed: 0,A,B,C
2000-01-01,1.095191,0.237926,0.752956
2000-01-02,0.196347,0.239966,0.431888
2000-01-03,0.152079,0.74823,0.180869
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.473149,0.174117,0.969409
2000-01-09,0.617741,1.414483,0.047319
2000-01-10,1.282051,0.086657,0.7468


In [187]:
tsdf["A"].transform(np.abs)

2000-01-01    1.095191
2000-01-02    0.196347
2000-01-03    0.152079
2000-01-04         NaN
2000-01-05         NaN
2000-01-06         NaN
2000-01-07         NaN
2000-01-08    1.473149
2000-01-09    0.617741
2000-01-10    1.282051
Freq: D, Name: A, dtype: float64

In [188]:
type(tsdf["A"].transform(np.abs))

pandas.core.series.Series

In [189]:
tsdf.transform([np.abs, lambda x: x + 1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2000-01-01,1.095191,-0.095191,0.237926,1.237926,0.752956,0.247044
2000-01-02,0.196347,1.196347,0.239966,0.760034,0.431888,0.568112
2000-01-03,0.152079,0.847921,0.74823,1.74823,0.180869,0.819131
2000-01-04,,,,,,
2000-01-05,,,,,,
2000-01-06,,,,,,
2000-01-07,,,,,,
2000-01-08,1.473149,-0.473149,0.174117,1.174117,0.969409,0.030591
2000-01-09,0.617741,1.617741,1.414483,-0.414483,0.047319,0.952681
2000-01-10,1.282051,-0.282051,0.086657,0.913343,0.7468,1.7468


In [190]:
tsdf["A"].transform([np.abs, lambda x: x + 1])

Unnamed: 0,absolute,<lambda>
2000-01-01,1.095191,-0.095191
2000-01-02,0.196347,1.196347
2000-01-03,0.152079,0.847921
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.473149,-0.473149
2000-01-09,0.617741,1.617741
2000-01-10,1.282051,-0.282051


In [191]:
tsdf.transform({"A": np.abs, "B": lambda x: x + 1})

Unnamed: 0,A,B
2000-01-01,1.095191,1.237926
2000-01-02,0.196347,0.760034
2000-01-03,0.152079,1.74823
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.473149,1.174117
2000-01-09,0.617741,-0.414483
2000-01-10,1.282051,0.913343


In [192]:
tsdf.transform({"A": np.abs, "B": [lambda x: x + 1, "sqrt"]})

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,<lambda>,sqrt
2000-01-01,1.095191,1.237926,0.487776
2000-01-02,0.196347,0.760034,
2000-01-03,0.152079,1.74823,0.865003
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.473149,1.174117,0.417273
2000-01-09,0.617741,-0.414483,
2000-01-10,1.282051,0.913343,


In [193]:
df4 = df.copy()

df4

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [194]:
def f(x):
    """Return the number of characters in the string form of ``x``."""
    text = str(x)
    return len(text)



df4["one"].map(f)

a    18
b    18
c    19
d     3
Name: one, dtype: int64

In [195]:
df4.map(f)

Unnamed: 0,one,two,three
a,18,19,3
b,18,18,18
c,19,18,19
d,3,20,18


In [196]:
# Label-valued Series to translate, plus the lookup Series mapping each label to a number.
s = pd.Series(data=["six", "seven", "six", "seven", "six"], index=list("abcde"))
t = pd.Series(data=[6.0, 7.0], index=["six", "seven"])


In [197]:
s

a      six
b    seven
c      six
d    seven
e      six
dtype: object

In [198]:
t

six      6.0
seven    7.0
dtype: float64

In [199]:
s.map(t)

a    6.0
b    7.0
c    6.0
d    7.0
e    6.0
dtype: float64

In [200]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    0.159485
b   -0.280843
c   -0.938410
d    0.983228
e   -2.152307
dtype: float64

In [201]:
s.reindex(["e", "b", "f", "d"])

e   -2.152307
b   -0.280843
f         NaN
d    0.983228
dtype: float64

In [202]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [203]:
df.reindex(index=["c", "f", "b"], columns=["three", "two", "one"])

Unnamed: 0,three,two,one
c,-0.256028,0.993717,0.098764
f,,,
b,-0.084983,1.504289,1.393615


In [205]:
rs = s.reindex(df.index)
rs

a    0.159485
b   -0.280843
c   -0.938410
d    0.983228
dtype: float64

In [206]:
rs.index is df.index

True

In [207]:
df.reindex(["c", "f", "b"], axis="index")

Unnamed: 0,one,two,three
c,0.098764,0.993717,-0.256028
f,,,
b,1.393615,1.504289,-0.084983


In [208]:
df.reindex(["three", "two", "one"], axis="columns")

Unnamed: 0,three,two,one
a,,-0.698858,0.342308
b,-0.084983,1.504289,1.393615
c,-0.256028,0.993717,0.098764
d,-1.6948,-0.314475,


In [209]:
df2 = df.reindex(["a", "b", "c"], columns=["one", "two"])
df2

Unnamed: 0,one,two
a,0.342308,-0.698858
b,1.393615,1.504289
c,0.098764,0.993717


In [210]:
# Subtract each column's mean from that column of df2.
# NOTE(review): df3 is assigned here but the cell displays df2 — confirm whether df3 was meant to be shown.
df3 = df2 - df2.mean()

df2

Unnamed: 0,one,two
a,0.342308,-0.698858
b,1.393615,1.504289
c,0.098764,0.993717


In [211]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [212]:
df.reindex_like(df2)

Unnamed: 0,one,two
a,0.342308,-0.698858
b,1.393615,1.504289
c,0.098764,0.993717


In [213]:
# Two overlapping positional slices of one Series: labels a-d and labels b-e.
s = pd.Series(np.random.randn(5), index=list("abcde"))
s1 = s.iloc[:4]
s2 = s.iloc[1:]

In [214]:
s1

a    0.952527
b    0.035430
c   -0.513971
d    0.116693
dtype: float64

In [215]:
s2

b    0.035430
c   -0.513971
d    0.116693
e   -1.028005
dtype: float64

In [216]:
s1.align(s2)

(a    0.952527
 b    0.035430
 c   -0.513971
 d    0.116693
 e         NaN
 dtype: float64,
 a         NaN
 b    0.035430
 c   -0.513971
 d    0.116693
 e   -1.028005
 dtype: float64)

In [217]:
s1.align(s2)[0]

a    0.952527
b    0.035430
c   -0.513971
d    0.116693
e         NaN
dtype: float64

In [218]:
s1.align(s2)[1]

a         NaN
b    0.035430
c   -0.513971
d    0.116693
e   -1.028005
dtype: float64

In [219]:
s1

a    0.952527
b    0.035430
c   -0.513971
d    0.116693
dtype: float64

In [220]:
s2

b    0.035430
c   -0.513971
d    0.116693
e   -1.028005
dtype: float64

In [221]:
s1.align(s2, join="inner")

(b    0.035430
 c   -0.513971
 d    0.116693
 dtype: float64,
 b    0.035430
 c   -0.513971
 d    0.116693
 dtype: float64)

In [222]:
s1.align(s2, join="left")

(a    0.952527
 b    0.035430
 c   -0.513971
 d    0.116693
 dtype: float64,
 a         NaN
 b    0.035430
 c   -0.513971
 d    0.116693
 dtype: float64)

In [223]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [224]:
df2

Unnamed: 0,one,two
a,0.342308,-0.698858
b,1.393615,1.504289
c,0.098764,0.993717


In [225]:
df.align(df2, join="inner")

(        one       two
 a  0.342308 -0.698858
 b  1.393615  1.504289
 c  0.098764  0.993717,
         one       two
 a  0.342308 -0.698858
 b  1.393615  1.504289
 c  0.098764  0.993717)

In [226]:
df.align(df2, join="inner", axis=0)

(        one       two     three
 a  0.342308 -0.698858       NaN
 b  1.393615  1.504289 -0.084983
 c  0.098764  0.993717 -0.256028,
         one       two
 a  0.342308 -0.698858
 b  1.393615  1.504289
 c  0.098764  0.993717)

In [227]:
df2.iloc[0]

one    0.342308
two   -0.698858
Name: a, dtype: float64

In [228]:
df.align(df2.iloc[0], axis=1)

(        one     three       two
 a  0.342308       NaN -0.698858
 b  1.393615 -0.084983  1.504289
 c  0.098764 -0.256028  0.993717
 d       NaN -1.694800 -0.314475,
 one      0.342308
 three         NaN
 two     -0.698858
 Name: a, dtype: float64)

In [229]:
df.align(df2.iloc[0], axis=0)

(          one       two     three
 a    0.342308 -0.698858       NaN
 b    1.393615  1.504289 -0.084983
 c    0.098764  0.993717 -0.256028
 d         NaN -0.314475 -1.694800
 one       NaN       NaN       NaN
 two       NaN       NaN       NaN,
 a           NaN
 b           NaN
 c           NaN
 d           NaN
 one    0.342308
 two   -0.698858
 Name: a, dtype: float64)

In [230]:
# Eight daily observations, plus a sparse selection keeping only positions 0, 3 and 6.
rng = pd.date_range(start="1/3/2000", periods=8)
ts = pd.Series(data=np.random.randn(len(rng)), index=rng)
ts2 = ts.iloc[[0, 3, 6]]

In [231]:
ts

2000-01-03    0.017226
2000-01-04   -1.160472
2000-01-05   -0.929044
2000-01-06   -0.498242
2000-01-07   -0.253988
2000-01-08    0.083784
2000-01-09   -0.308428
2000-01-10    0.370830
Freq: D, dtype: float64

In [232]:
ts2

2000-01-03    0.017226
2000-01-06   -0.498242
2000-01-09   -0.308428
Freq: 3D, dtype: float64

In [233]:
ts2.reindex(ts.index)

2000-01-03    0.017226
2000-01-04         NaN
2000-01-05         NaN
2000-01-06   -0.498242
2000-01-07         NaN
2000-01-08         NaN
2000-01-09   -0.308428
2000-01-10         NaN
Freq: D, dtype: float64

In [234]:
ts2.reindex(ts.index, method="ffill")

2000-01-03    0.017226
2000-01-04    0.017226
2000-01-05    0.017226
2000-01-06   -0.498242
2000-01-07   -0.498242
2000-01-08   -0.498242
2000-01-09   -0.308428
2000-01-10   -0.308428
Freq: D, dtype: float64

In [237]:
ts2.reindex(ts.index).ffill()

2000-01-03    0.017226
2000-01-04    0.017226
2000-01-05    0.017226
2000-01-06   -0.498242
2000-01-07   -0.498242
2000-01-08   -0.498242
2000-01-09   -0.308428
2000-01-10   -0.308428
Freq: D, dtype: float64

In [235]:
ts2.reindex(ts.index, method="bfill")

2000-01-03    0.017226
2000-01-04   -0.498242
2000-01-05   -0.498242
2000-01-06   -0.498242
2000-01-07   -0.308428
2000-01-08   -0.308428
2000-01-09   -0.308428
2000-01-10         NaN
Freq: D, dtype: float64

In [236]:
ts2.reindex(ts.index, method="nearest")

2000-01-03    0.017226
2000-01-04    0.017226
2000-01-05   -0.498242
2000-01-06   -0.498242
2000-01-07   -0.498242
2000-01-08   -0.308428
2000-01-09   -0.308428
2000-01-10   -0.308428
Freq: D, dtype: float64

In [238]:
ts2.reindex(ts.index, method="ffill", limit=1)

2000-01-03    0.017226
2000-01-04    0.017226
2000-01-05         NaN
2000-01-06   -0.498242
2000-01-07   -0.498242
2000-01-08         NaN
2000-01-09   -0.308428
2000-01-10   -0.308428
Freq: D, dtype: float64

In [239]:
ts2.reindex(ts.index, method="ffill", tolerance="1 day")

2000-01-03    0.017226
2000-01-04    0.017226
2000-01-05         NaN
2000-01-06   -0.498242
2000-01-07   -0.498242
2000-01-08         NaN
2000-01-09   -0.308428
2000-01-10   -0.308428
Freq: D, dtype: float64

In [240]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [241]:
df.drop(["a", "d"], axis=0)

Unnamed: 0,one,two,three
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028


In [242]:
df.drop(["one"], axis=1)

Unnamed: 0,two,three
a,-0.698858,
b,1.504289,-0.084983
c,0.993717,-0.256028
d,-0.314475,-1.6948


In [243]:
df.reindex(df.index.difference(["a", "d"]))

Unnamed: 0,one,two,three
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028


In [244]:
s

a    0.952527
b    0.035430
c   -0.513971
d    0.116693
e   -1.028005
dtype: float64

In [245]:
s.rename(str.upper)

A    0.952527
B    0.035430
C   -0.513971
D    0.116693
E   -1.028005
dtype: float64

In [246]:
df.rename(

    columns={"one": "foo", "two": "bar"},

    index={"a": "apple", "b": "banana", "d": "durian"},

)

Unnamed: 0,foo,bar,three
apple,0.342308,-0.698858,
banana,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
durian,,-0.314475,-1.6948


In [247]:
df.rename({"one": "foo", "two": "bar"}, axis="columns")

Unnamed: 0,foo,bar,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [248]:
df.rename({"a": "apple", "b": "banana", "d": "durian"}, axis="index")

Unnamed: 0,one,two,three
apple,0.342308,-0.698858,
banana,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
durian,,-0.314475,-1.6948


In [249]:
s.rename("scalar-name")

a    0.952527
b    0.035430
c   -0.513971
d    0.116693
e   -1.028005
Name: scalar-name, dtype: float64

In [250]:
# Small frame indexed by every (letter, number) pair; the index levels are named "let" and "num".
df = pd.DataFrame(
    data={"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]},
    index=pd.MultiIndex.from_product(
        iterables=[["a", "b", "c"], [1, 2]],
        names=["let", "num"],
    ),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
let,num,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [251]:
df.rename_axis(index={"let": "abc"})

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
abc,num,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [253]:
df.rename_axis(index=str.upper)

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
LET,NUM,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [254]:
# Iterating over a DataFrame directly yields its column labels, as the printed output shows.
df = pd.DataFrame(
    data={"col1": np.random.randn(3), "col2": np.random.randn(3)},
    index=list("abc"),
)
for col in df:
    print(col)

col1
col2


In [255]:
df = pd.DataFrame(data={"a": [1, 2, 3], "b": list("abc")})

# Assigning into the row yielded by iterrows() does not write back to df:
# the frame displayed below still holds the original values in column "a".
for index, row in df.iterrows():
    row["a"] = 10

df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [257]:
for label, ser in df.items():

    print(label)

    print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


In [258]:
for row_index, row in df.iterrows():

    print(row_index, row, sep="\n")

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


In [259]:
# Single-row frame holding one integer column and one float column.
df_orig = pd.DataFrame({"int": [1], "float": [1.5]})
df_orig.dtypes

int        int64
float    float64
dtype: object

In [260]:
row = next(df_orig.iterrows())[1]

row

int      1.0
float    1.5
Name: 0, dtype: float64

In [261]:
row["int"].dtype

dtype('float64')

In [262]:
df_orig["int"].dtype

dtype('int64')

In [263]:
df2 = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})

print(df2)

   x  y
0  1  4
1  2  5
2  3  6


In [264]:
print(df2.T)

   0  1  2
x  1  2  3
y  4  5  6


In [265]:
# Rebuild the transpose by hand: each (row label, row Series) pair from iterrows() becomes one column.
df2_t = pd.DataFrame(dict(df2.iterrows()))
print(df2_t)

   0  1  2
x  1  2  3
y  4  5  6


In [274]:
# itertuples() yields one namedtuple per row; field 0 is the index, so position 2 is column "b".
for row in df.itertuples():
    print(row, row[2], sep="\n")

Pandas(Index=0, a=1, b='a')
a
Pandas(Index=1, a=2, b='b')
b
Pandas(Index=2, a=3, b='c')
c


In [276]:
# datetime

s = pd.Series(pd.date_range("20130101 09:10:12", periods=4))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [277]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int32

In [278]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int32

In [279]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int32

In [280]:
s[s.dt.day == 2]

1   2013-01-02 09:10:12
dtype: datetime64[ns]

In [281]:
stz = s.dt.tz_localize("US/Eastern")
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [282]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [283]:
s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern")

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [284]:
# DatetimeIndex

s = pd.Series(pd.date_range("20130101", periods=4))

s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: datetime64[ns]

In [285]:
s.dt.strftime("%Y/%m/%d")

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [286]:
# PeriodIndex

s = pd.Series(pd.period_range("20130101", periods=4))

s

0    2013-01-01
1    2013-01-02
2    2013-01-03
3    2013-01-04
dtype: period[D]

In [287]:
s.dt.strftime("%Y/%m/%d")

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [288]:
# period

s = pd.Series(pd.period_range("20130101", periods=4, freq="D"))

s

0    2013-01-01
1    2013-01-02
2    2013-01-03
3    2013-01-04
dtype: period[D]

In [289]:
s.dt.year

0    2013
1    2013
2    2013
3    2013
dtype: int64

In [290]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [291]:
# timedelta

s = pd.Series(pd.timedelta_range("1 day 00:00:05", periods=4, freq="s"))

s

0   1 days 00:00:05
1   1 days 00:00:06
2   1 days 00:00:07
3   1 days 00:00:08
dtype: timedelta64[ns]

In [292]:
s.dt.days

0    1
1    1
2    1
3    1
dtype: int64

In [293]:
s.dt.seconds

0    5
1    6
2    7
3    8
dtype: int32

In [294]:
s.dt.components

Unnamed: 0,days,hours,minutes,seconds,milliseconds,microseconds,nanoseconds
0,1,0,0,5,0,0,0
1,1,0,0,6,0,0,0
2,1,0,0,7,0,0,0
3,1,0,0,8,0,0,0


In [295]:
# Vectorized string methods
s = pd.Series(
    data=["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [296]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [297]:
# Sorting
df = pd.DataFrame(
    data={
        "one": pd.Series(np.random.randn(3), index=list("abc")),
        "two": pd.Series(np.random.randn(4), index=list("abcd")),
        "three": pd.Series(np.random.randn(3), index=list("bcd")),
    }
)
df

Unnamed: 0,one,two,three
a,-1.380352,0.671819,
b,0.17892,-0.527346,-0.627793
c,-2.039832,-0.377936,1.155266
d,,-0.574601,-1.015822


In [298]:
unsorted_df = df.reindex(

    index=["a", "d", "c", "b"], columns=["three", "two", "one"]

)


unsorted_df

Unnamed: 0,three,two,one
a,,0.671819,-1.380352
d,-1.015822,-0.574601,
c,1.155266,-0.377936,-2.039832
b,-0.627793,-0.527346,0.17892


In [299]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,0.671819,-1.380352
b,-0.627793,-0.527346,0.17892
c,1.155266,-0.377936,-2.039832
d,-1.015822,-0.574601,


In [300]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,-1.015822,-0.574601,
c,1.155266,-0.377936,-2.039832
b,-0.627793,-0.527346,0.17892
a,,0.671819,-1.380352


In [301]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-1.380352,,0.671819
d,,-1.015822,-0.574601
c,-2.039832,1.155266,-0.377936
b,0.17892,-0.627793,-0.527346


In [302]:
# Series

unsorted_df["three"].sort_index()

a         NaN
b   -0.627793
c    1.155266
d   -1.015822
Name: three, dtype: float64

In [303]:
# Frame indexed by columns "a" and "b"; level "a" mixes upper- and lower-case labels.
s1 = pd.DataFrame(
    data={"a": ["B", "a", "C"], "b": [1, 2, 3], "c": [2, 3, 4]}
).set_index(["a", "b"])
s1

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
B,1,2
a,2,3
C,3,4


In [304]:
s1.sort_index(level="a")

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
B,1,2
C,3,4
a,2,3


In [305]:
s1.sort_index(level="a", key=lambda idx: idx.str.lower())

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
a,2,3
B,1,2
C,3,4


In [307]:
df1 = pd.DataFrame(

    {"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}

)
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [308]:
df1.sort_values(by="two")

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [309]:
df1[["one", "two", "three"]].sort_values(by=["one", "two"])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [310]:
df1.sort_values(by=["one", "two"])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [311]:
s

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [313]:
s[2] = np.nan
s

0       A
1       B
2    <NA>
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [314]:
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [315]:
s.sort_values(na_position="first")

2    <NA>
5    <NA>
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: string

In [316]:
s1 = pd.Series(["B", "a", "C"])
s1

0    B
1    a
2    C
dtype: object

In [317]:
s1.sort_values()

0    B
2    C
1    a
dtype: object

In [318]:
s1.sort_values(key=lambda x: x.str.lower())

1    a
0    B
2    C
dtype: object

In [320]:
df = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3]})
df

Unnamed: 0,a,b
0,B,1
1,a,2
2,C,3


In [321]:
df.sort_values(by="a")

Unnamed: 0,a,b
0,B,1
2,C,3
1,a,2


In [322]:
df.sort_values(by="a", key=lambda col: col.str.lower())

Unnamed: 0,a,b
1,a,2
0,B,1
2,C,3


In [324]:
# Two-level index with repeated, unsorted entries; the level names are supplied at construction.
idx = pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("a", 2), ("b", 2), ("b", 1), ("b", 1)],
    names=["first", "second"],
)
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           names=['first', 'second'])

In [325]:
df_multi = pd.DataFrame({"A": np.arange(6, 0, -1)}, index=idx)

df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [326]:
df_multi.sort_values(by=["second", "A"])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5


In [327]:
# For each probe value, searchsorted gives the position where it would be inserted to keep ser ordered.
ser = pd.Series(data=[1, 2, 3])
ser.searchsorted(value=[0, 3])

array([0, 2], dtype=int64)

In [328]:
ser.searchsorted([0, 4])

array([0, 3], dtype=int64)

In [329]:
ser.searchsorted([1, 3], side="right")

array([1, 3], dtype=int64)

In [330]:
ser.searchsorted([1, 3], side="left")

array([0, 2], dtype=int64)

In [332]:
ser = pd.Series([3, 1, 2])
ser

0    3
1    1
2    2
dtype: int64

In [333]:
ser.searchsorted([0, 3], sorter=np.argsort(ser))

array([0, 2], dtype=int64)

In [335]:
s = pd.Series(np.random.permutation(10))
s

0    9
1    6
2    5
3    0
4    8
5    2
6    4
7    3
8    7
9    1
dtype: int32

In [336]:
s.sort_values()

3    0
9    1
5    2
7    3
6    4
2    5
1    6
8    7
4    8
0    9
dtype: int32

In [337]:
s.nsmallest(3)

3    0
9    1
5    2
dtype: int32

In [338]:
s.nlargest(3)

0    9
4    8
8    7
dtype: int32

In [339]:
# Mixed-type frame for nlargest/nsmallest: integer column "a", string column "b",
# and float column "c" containing one missing value.
df = pd.DataFrame(
    data={
        "a": [-2, -1, 1, 10, 8, 11, -1],
        "b": ["a", "b", "d", "c", "e", "f", "f"],
        "c": [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0],
    }
)
df

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
2,1,d,4.0
3,10,c,3.2
4,8,e,
5,11,f,3.0
6,-1,f,4.0


In [340]:
df.nlargest(3, "a")

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,


In [341]:
df.nlargest(5, ["a", "c"])

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,
2,1,d,4.0
6,-1,f,4.0


In [342]:
df.nsmallest(3, "a")

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0


In [343]:
df.nsmallest(5, ["a", "c"])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0
2,1,d,4.0
4,8,e,


In [344]:
# Replace df1's flat column labels with a two-level header, assigned in place on df1:
# "one" and "two" are grouped under "a", "three" under "b".
df1.columns = pd.MultiIndex.from_arrays(
    [["a", "a", "b"], ["one", "two", "three"]]
)
df1

Unnamed: 0_level_0,a,a,b
Unnamed: 0_level_1,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [345]:
df1.sort_values(by=("a", "two"))

Unnamed: 0_level_0,a,a,b
Unnamed: 0_level_1,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2
