In [1]:
import pandas as pd
import numpy as np

In [2]:
# Eight consecutive daily timestamps starting on 1 January 2000.
index = pd.date_range(start="2000-01-01", periods=8)
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Five standard-normal draws labelled "a" through "e".
# NOTE: the values come from NumPy's global RNG, which no visible cell seeds,
# so they will differ on every fresh run.
s = pd.Series(np.random.randn(5), index=list("abcde"))
s

a    0.226653
b    0.536675
c    1.472243
d   -1.098187
e    0.333052
dtype: float64

In [4]:
# 8x3 frame of standard-normal draws: one row per date in `index`, columns A-C.
df = pd.DataFrame(data=np.random.randn(8, 3), index=index, columns=list("ABC"))
df

Unnamed: 0,A,B,C
2000-01-01,0.12108,0.678015,-0.946181
2000-01-02,-0.011672,0.097127,0.771051
2000-01-03,-1.329823,-0.866895,-1.810622
2000-01-04,1.397277,0.048422,1.327762
2000-01-05,1.99328,-1.455339,1.473374
2000-01-06,0.878855,0.5326,-1.753983
2000-01-07,2.483546,-0.568352,-0.304539
2000-01-08,0.691885,0.046702,0.886395


In [5]:
long_series = pd.Series(np.random.randn(1000))

long_series.head()

0    1.289646
1   -1.068033
2   -0.277597
3    0.345562
4    0.223636
dtype: float64

In [6]:
long_series.tail(3)

997   -0.150359
998    1.369789
999    0.257525
dtype: float64

In [7]:
# First two rows, selected explicitly by position.
df.iloc[:2]

Unnamed: 0,A,B,C
2000-01-01,0.12108,0.678015,-0.946181
2000-01-02,-0.011672,0.097127,0.771051


In [8]:
# Lower-case every column label through the vectorized string accessor.
df.columns = df.columns.str.lower()
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [9]:
df

Unnamed: 0,a,b,c
2000-01-01,0.12108,0.678015,-0.946181
2000-01-02,-0.011672,0.097127,0.771051
2000-01-03,-1.329823,-0.866895,-1.810622
2000-01-04,1.397277,0.048422,1.327762
2000-01-05,1.99328,-1.455339,1.473374
2000-01-06,0.878855,0.5326,-1.753983
2000-01-07,2.483546,-0.568352,-0.304539
2000-01-08,0.691885,0.046702,0.886395


In [10]:
s

a    0.226653
b    0.536675
c    1.472243
d   -1.098187
e    0.333052
dtype: float64

In [11]:
s.array

<NumpyExtensionArray>
[0.22665273101563216,  0.5366748785428379,  1.4722429958286873,
 -1.0981873168171383, 0.33305235062824184]
Length: 5, dtype: float64

In [12]:
s.index.array

<NumpyExtensionArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [13]:
s.to_numpy()

array([ 0.22665273,  0.53667488,  1.472243  , -1.09818732,  0.33305235])

In [14]:
np.asarray(s)

array([ 0.22665273,  0.53667488,  1.472243  , -1.09818732,  0.33305235])

In [15]:
# Two consecutive daily timestamps localized to the CET time zone.
ser = pd.Series(pd.date_range(start="2000-01-01", periods=2, tz="CET"))
ser


0   2000-01-01 00:00:00+01:00
1   2000-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [16]:
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object)

In [17]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [18]:
df.to_numpy()

array([[ 0.12108027,  0.67801528, -0.94618061],
       [-0.01167243,  0.09712694,  0.77105124],
       [-1.32982273, -0.86689492, -1.81062194],
       [ 1.39727723,  0.04842226,  1.32776197],
       [ 1.99328019, -1.4553394 ,  1.47337378],
       [ 0.87885505,  0.53260025, -1.75398331],
       [ 2.48354577, -0.56835242, -0.30453878],
       [ 0.69188511,  0.04670164,  0.88639474]])

In [19]:
# Set the compute.use_bottleneck and compute.use_numexpr options to True
# (presumably enabling the accelerated code paths when those libraries are
# installed -- confirm against the pandas options documentation).
pd.set_option("compute.use_bottleneck", True)
pd.set_option("compute.use_numexpr", True)

In [20]:
# Three Series with partly overlapping labels. The constructor aligns them on
# the union of labels, so "one" has no value at "d" and "three" has none at "a".
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=list("abc")),
        "two": pd.Series(np.random.randn(4), index=list("abcd")),
        "three": pd.Series(np.random.randn(3), index=list("bcd")),
    }
)
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [21]:
row = df.iloc[1]
row

one      1.393615
two      1.504289
three   -0.084983
Name: b, dtype: float64

In [22]:
column = df["two"]
column

a   -0.698858
b    1.504289
c    0.993717
d   -0.314475
Name: two, dtype: float64

In [23]:
df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,-1.051306,-2.203147,
b,0.0,0.0,0.0
c,-1.294851,-0.510571,-0.171045
d,,-1.818764,-1.609817


In [24]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-1.051306,-2.203147,
b,0.0,0.0,0.0
c,-1.294851,-0.510571,-0.171045
d,,-1.818764,-1.609817


In [25]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,1.041167,0.0,
b,-0.110674,0.0,-1.589272
c,-0.894953,0.0,-1.249745
d,,0.0,-1.380325


In [26]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,1.041167,0.0,
b,-0.110674,0.0,-1.589272
c,-0.894953,0.0,-1.249745
d,,0.0,-1.380325


In [27]:
# Furthermore, a single level of a MultiIndexed DataFrame can be aligned with
# a Series. Work on a copy of `df` and swap in a two-level (first, second) index.
dfmi = df.copy()
dfmi.index = pd.MultiIndex.from_tuples(
    [(1, "a"), (1, "b"), (1, "c"), (2, "a")],
    names=["first", "second"],
)

In [28]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.342308,-0.698858,
1,b,1.393615,1.504289,-0.084983
1,c,0.098764,0.993717,-0.256028
2,a,,-0.314475,-1.6948


In [29]:
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.041167,0.0,
1,b,-0.110674,0.0,-1.589272
1,c,-0.894953,0.0,-1.249745
2,a,,0.384384,-0.995942


In [30]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [31]:
div, rem = divmod(s, 3)
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [32]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [33]:
idx = pd.Index(np.arange(10))
idx

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')

In [34]:
div, rem = divmod(idx, 3)
div

Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int32')

In [35]:
rem

Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int32')

In [36]:
div, rem = divmod(s, [2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int32

In [37]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int32

In [38]:
# Missing-data operations: work on a copy so the original `df` stays unchanged.
df2 = df.copy()
df2

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [39]:
df2.loc["a", "three"] = 1.0
df2

Unnamed: 0,one,two,three
a,0.342308,-0.698858,1.0
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [40]:
df + df2

Unnamed: 0,one,two,three
a,0.684617,-1.397717,
b,2.78723,3.008578,-0.169966
c,0.197528,1.987435,-0.512056
d,,-0.62895,-3.3896


In [41]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,0.684617,-1.397717,1.0
b,2.78723,3.008578,-0.169966
c,0.197528,1.987435,-0.512056
d,,-0.62895,-3.3896


In [42]:
df2.add(df, fill_value=0)

Unnamed: 0,one,two,three
a,0.684617,-1.397717,1.0
b,2.78723,3.008578,-0.169966
c,0.197528,1.987435,-0.512056
d,,-0.62895,-3.3896


In [43]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [44]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [48]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [45]:
df > 0

Unnamed: 0,one,two,three
a,True,False,False
b,True,True,False
c,True,True,False
d,False,False,False


In [46]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [47]:
(df > 0).any()

one       True
two       True
three    False
dtype: bool

In [49]:
(df > 0).any().any()

True

In [50]:
df.empty

False

In [51]:
pd.DataFrame(columns=list("ABC")).empty

True

In [52]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [53]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [54]:
np.nan == np.nan

False

In [55]:
(df + df).equals(df * 2)

True

In [56]:
# Same three values in both frames, but df2 lists them in reverse order under
# a reversed index, so the rows line up only after sorting df2 by its index.
df1 = pd.DataFrame({"col": ["foo", 0, np.nan]})

df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0])

In [57]:
df1

Unnamed: 0,col
0,foo
1,0
2,


In [58]:
df2

Unnamed: 0,col
2,
1,0
0,foo


In [59]:
df1.equals(df2)

False

In [61]:
df2.sort_index()

Unnamed: 0,col
0,foo
1,0
2,


In [60]:
df1.equals(df2.sort_index())

True

In [62]:
pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [64]:
type(pd.Series(["foo", "bar", "baz"]) == "foo")

pandas.core.series.Series

In [63]:
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [65]:
type(pd.Index(["foo", "bar", "baz"]) == "foo")

numpy.ndarray

In [66]:
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [67]:
pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [68]:
# Two frames with gaps in different places: df1 has five rows, df2 has six,
# and df2 holds values at several positions where df1 is NaN.
df1 = pd.DataFrame(
    {
        "A": [1.0, np.nan, 3.0, 5.0, np.nan],
        "B": [np.nan, 2.0, 3.0, np.nan, 6.0],
    }
)
df2 = pd.DataFrame(
    {
        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)

In [69]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [70]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [71]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [72]:
def combiner(x, y):
    """Pick x element-wise, falling back to y wherever x is missing.

    Returns the result of ``np.where`` over the missing-value mask of ``x``.
    """
    x_missing = pd.isna(x)
    return np.where(x_missing, y, x)

In [73]:
df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [74]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [75]:
# Column means (axis=0 reduces over the rows); NaN entries are skipped.
# `axis` is passed by keyword, matching the `df.sum(axis=1, ...)` cell below.
df.mean(axis=0)

one      0.611562
two      0.371168
three   -0.678604
dtype: float64

In [76]:
# Row means (axis=1 reduces over the columns); NaN entries are skipped.
df.mean(axis=1)

a   -0.178275
b    0.937640
c    0.278818
d   -1.004638
dtype: float64

In [77]:
# Column sums with skipna=False: any column that contains NaN sums to NaN.
df.sum(axis=0, skipna=False)

one           NaN
two      1.484673
three         NaN
dtype: float64

In [78]:
df.sum(axis=1, skipna=True)

a   -0.356550
b    2.812921
c    0.836454
d   -2.009275
dtype: float64

In [81]:
df.mean()

one      0.611562
two      0.371168
three   -0.678604
dtype: float64

In [83]:
# Row means again, for comparison with the column-wise mean and std nearby.
df.mean(axis=1)

a   -0.178275
b    0.937640
c    0.278818
d   -1.004638
dtype: float64

In [82]:
df.std()

one      0.688137
two      1.046676
three    0.884198
dtype: float64

In [79]:
# Standardize each column: subtract its mean, then divide by its standard
# deviation, so every column of the result has unit standard deviation.
ts_stand = df.sub(df.mean()).div(df.std())
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [85]:
ts_stand

Unnamed: 0,one,two,three
a,-0.391279,-1.022309,
b,1.136477,1.082589,0.671366
c,-0.745198,0.594787,0.47792
d,,-0.655067,-1.149287


In [80]:
# Standardize each row: subtract the row mean, then divide by the row standard
# deviation. The row-wise statistics are aligned on the index (axis=0), and
# `axis` is passed by keyword to every reduction for clarity.
xs_stand = df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0)

xs_stand.std(axis=1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [87]:
df

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.393615,1.504289,-0.084983
c,0.098764,0.993717,-0.256028
d,,-0.314475,-1.6948


In [86]:
df.cumsum()

Unnamed: 0,one,two,three
a,0.342308,-0.698858,
b,1.735923,0.80543,-0.084983
c,1.834687,1.799148,-0.341011
d,,1.484673,-2.035811


In [88]:
np.mean(df["one"])

0.6115624781508879

In [89]:
df["one"].mean()

0.6115624781508879

In [91]:
np.mean(df["one"].to_numpy())

nan

In [92]:
# 500 draws: positions 20 onward become NaN and positions 10-19 become the
# constant 5, leaving 10 random values plus one repeated value. nunique()
# does not count NaN.
series = pd.Series(np.random.randn(500))
series.iloc[20:500] = np.nan
series.iloc[10:20] = 5
series.nunique()

11

In [93]:
# Blank out every other value (the even positions); describe() then
# summarizes the 500 values that remain.
series = pd.Series(np.random.randn(1000))
series.iloc[::2] = np.nan
series.describe()

count    500.000000
mean       0.002191
std        1.023972
min       -2.545618
25%       -0.703997
50%       -0.021504
75%        0.663194
max        3.580700
dtype: float64