Worked examples following the official "10 minutes to pandas" tutorial: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
# A Series built from a plain list gets the default 0..n-1 integer index;
# the np.nan entry makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive calendar days starting 2013-01-01; reused below as a row index.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of standard-normal draws, indexed by `dates` with columns A-D.
# No seed is set in the cells shown here, so the values change on every run.
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694


In [5]:
# Build a frame from a dict of columns.  The scalar entries ('A', 'B', 'F')
# are repeated on every row, and each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# Summarise the index type, per-column non-null counts, dtypes and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 260.0+ bytes


In [7]:
# First rows of the frame (head() shows five by default).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798


In [8]:
# Last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694


In [9]:
# Row labels: the DatetimeIndex passed in as `dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# Structure summary: all four columns are float64 with no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [12]:
# Convert to a plain NumPy array; the index and column labels are not carried over.
df_np = df.to_numpy()
df_np

array([[-0.53912717, -0.04200449,  0.19091725,  0.02431112],
       [ 0.24545234, -1.67664365, -0.41279689, -0.89248573],
       [ 0.51488233,  0.23636855, -0.25602073, -0.40655689],
       [-0.42222492, -0.1505691 ,  0.7683918 ,  0.26802066],
       [-0.03151111, -1.23936846, -0.06309471,  0.69579784],
       [-0.09189388, -0.6714483 ,  0.22678087, -0.99269431]])

In [13]:
# (rows, columns) of the converted array.
df_np.shape

(6, 4)

In [14]:
# df2 mixes floats, timestamps, ints and strings, so the resulting array
# falls back to dtype=object.
df2_np = df2.to_numpy()
df2_np

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [15]:
# Per-column summary statistics: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.05407,-0.590611,0.075696,-0.217268
std,0.39653,0.746376,0.420441,0.666422
min,-0.539127,-1.676644,-0.412797,-0.992694
25%,-0.339642,-1.097388,-0.207789,-0.771004
50%,-0.061702,-0.411009,0.063911,-0.191123
75%,0.176211,-0.069146,0.217815,0.207093
max,0.514882,0.236369,0.768392,0.695798


In [16]:
# Transpose: the dates become columns and A-D become the row labels.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.539127,0.245452,0.514882,-0.422225,-0.031511,-0.091894
B,-0.042004,-1.676644,0.236369,-0.150569,-1.239368,-0.671448
C,0.190917,-0.412797,-0.256021,0.768392,-0.063095,0.226781
D,0.024311,-0.892486,-0.406557,0.268021,0.695798,-0.992694


In [17]:
# Sort the columns (axis=1) by label in descending order: D, C, B, A.
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2013-01-01,0.024311,0.190917,-0.042004,-0.539127
2013-01-02,-0.892486,-0.412797,-1.676644,0.245452
2013-01-03,-0.406557,-0.256021,0.236369,0.514882
2013-01-04,0.268021,0.768392,-0.150569,-0.422225
2013-01-05,0.695798,-0.063095,-1.239368,-0.031511
2013-01-06,-0.992694,0.226781,-0.671448,-0.091894


In [18]:
# Sort the columns (axis=1) by label in ascending order: A, B, C, D.
df.sort_index(axis = 1, ascending = True)

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694


In [19]:
# Sort the rows (axis=0) by index label, latest date first.
df.sort_index(axis = 0, ascending = False)

Unnamed: 0,A,B,C,D
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-01,-0.539127,-0.042004,0.190917,0.024311


In [20]:
# Order the rows by the values in column B, smallest first.
df.sort_values(by = 'B')

Unnamed: 0,A,B,C,D
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-03,0.514882,0.236369,-0.256021,-0.406557


In [21]:
# Order the rows by the values in column C, smallest first.
df.sort_values(by = 'C')

Unnamed: 0,A,B,C,D
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694
2013-01-04,-0.422225,-0.150569,0.768392,0.268021


In [22]:
# Cast column C to integers.  The fractional part is truncated toward zero,
# so every value here becomes 0.  A new Series is returned; df is not modified.
df['C'].astype('int')

2013-01-01    0
2013-01-02    0
2013-01-03    0
2013-01-04    0
2013-01-05    0
2013-01-06    0
Freq: D, Name: C, dtype: int64

In [23]:
# df is unchanged by the astype() call: column C still holds the original floats.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694


In [24]:
# Select a single column by label; the result is a Series named 'A'.
df['A']

2013-01-01   -0.539127
2013-01-02    0.245452
2013-01-03    0.514882
2013-01-04   -0.422225
2013-01-05   -0.031511
2013-01-06   -0.091894
Freq: D, Name: A, dtype: float64

In [25]:
# Attribute-style access gives the same Series as df['A'].
df.A

2013-01-01   -0.539127
2013-01-02    0.245452
2013-01-03    0.514882
2013-01-04   -0.422225
2013-01-05   -0.031511
2013-01-06   -0.091894
Freq: D, Name: A, dtype: float64

In [26]:
# An integer slice inside [] selects rows by position: the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557


In [27]:
# A label slice on the DatetimeIndex; both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021


In [28]:
# Row for the first date, returned as a Series indexed by column name.
df.loc[dates[0]]

A   -0.539127
B   -0.042004
C    0.190917
D    0.024311
Name: 2013-01-01 00:00:00, dtype: float64

In [29]:
# All rows, columns A and B selected by label.
df.loc[:, list('AB')]

Unnamed: 0,A,B
2013-01-01,-0.539127,-0.042004
2013-01-02,0.245452,-1.676644
2013-01-03,0.514882,0.236369
2013-01-04,-0.422225,-0.150569
2013-01-05,-0.031511,-1.239368
2013-01-06,-0.091894,-0.671448


In [30]:
# Label slice of rows (both endpoints included), restricted to columns A and B.
df.loc['20130102':'20130104', list('AB')]

Unnamed: 0,A,B
2013-01-02,0.245452,-1.676644
2013-01-03,0.514882,0.236369
2013-01-04,-0.422225,-0.150569


In [31]:
# A single row label reduces the result to a Series holding A and B.
df.loc['20130102', list('AB')]

A    0.245452
B   -1.676644
Name: 2013-01-02 00:00:00, dtype: float64

In [32]:
# Scalar lookup by row label and column label.
df.loc[dates[0], 'A']

-0.5391271717896502

In [33]:
# .at is the scalar-only label accessor; it returns the same value as the .loc lookup.
df.at[dates[0], 'A']

-0.5391271717896502

In [34]:
# Row at integer position 3 (the fourth row, 2013-01-04).
df.iloc[3]

A   -0.422225
B   -0.150569
C    0.768392
D    0.268021
Name: 2013-01-04 00:00:00, dtype: float64

In [35]:
# Positional slices exclude the end: rows 3-4 and columns 0-1 (A and B).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.422225,-0.150569
2013-01-05,-0.031511,-1.239368


In [36]:
# Explicit position lists: rows 1, 2, 4 and columns 0, 2 (A and C).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.245452,-0.412797
2013-01-03,0.514882,-0.256021
2013-01-05,-0.031511,-0.063095


In [37]:
# All rows, columns at positions 1-2 (B and C).
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.042004,0.190917
2013-01-02,-1.676644,-0.412797
2013-01-03,0.236369,-0.256021
2013-01-04,-0.150569,0.768392
2013-01-05,-1.239368,-0.063095
2013-01-06,-0.671448,0.226781


In [38]:
# Scalar lookup by position: row 1, column 1.
df.iloc[1, 1]

-1.6766436470844208

In [39]:
# .iat is the scalar-only positional accessor; it returns the same value as the .iloc lookup.
df.iat[1, 1]

-1.6766436470844208

In [40]:
# Boolean Series: True on the rows where column A is positive.
df['A'] > 0

2013-01-01    False
2013-01-02     True
2013-01-03     True
2013-01-04    False
2013-01-05    False
2013-01-06    False
Freq: D, Name: A, dtype: bool

In [41]:
# Use the boolean Series as a row mask: keep only rows where A > 0.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557


In [42]:
# Element-wise comparison produces a boolean frame of the same shape.
df > 0

Unnamed: 0,A,B,C,D
2013-01-01,False,False,True,True
2013-01-02,True,False,False,False
2013-01-03,True,True,False,False
2013-01-04,False,False,True,True
2013-01-05,False,False,False,True
2013-01-06,False,False,True,False


In [43]:
# Masking with a boolean frame keeps the shape; entries that fail the
# condition are replaced by NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,0.190917,0.024311
2013-01-02,0.245452,,,
2013-01-03,0.514882,0.236369,,
2013-01-04,,,0.768392,0.268021
2013-01-05,,,,0.695798
2013-01-06,,,0.226781,


In [44]:
# Work on a copy so the column added to df2 in the following cells does not appear in df.
df2 = df.copy()

In [45]:
# The copy holds the same values as df at this point.
df2

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694


In [46]:
# Add a new string column E; the list supplies one value per row.
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.539127,-0.042004,0.190917,0.024311,one
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486,one
2013-01-03,0.514882,0.236369,-0.256021,-0.406557,two
2013-01-04,-0.422225,-0.150569,0.768392,0.268021,three
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798,four
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694,three


In [47]:
# The original frame has no column E: only the copy was modified.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.539127,-0.042004,0.190917,0.024311
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486
2013-01-03,0.514882,0.236369,-0.256021,-0.406557
2013-01-04,-0.422225,-0.150569,0.768392,0.268021
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694


In [48]:
# Membership test: True on the rows whose E value is 'two' or 'four'.
df2['E'].isin(['two', 'four'])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [49]:
# Filter rows with the isin() mask.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.514882,0.236369,-0.256021,-0.406557,two
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798,four


In [50]:
# Integers 1-6 indexed by six days starting 2013-01-02, i.e. offset by one
# day from `dates`.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [51]:
# Column assignment aligns s1 on df's index: 2013-01-01 has no match and
# becomes NaN, while s1's 2013-01-07 entry is not in df and is dropped.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.539127,-0.042004,0.190917,0.024311,
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486,1.0
2013-01-03,0.514882,0.236369,-0.256021,-0.406557,2.0
2013-01-04,-0.422225,-0.150569,0.768392,0.268021,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798,4.0
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694,5.0


In [52]:
# F reports 5 non-null values because the aligned assignment left one NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
 4   F       5 non-null      float64
dtypes: float64(5)
memory usage: 448.0 bytes


In [53]:
# Set a single cell by label: first date, column A.
df.loc[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.042004,0.190917,0.024311,
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486,1.0
2013-01-03,0.514882,0.236369,-0.256021,-0.406557,2.0
2013-01-04,-0.422225,-0.150569,0.768392,0.268021,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798,4.0
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694,5.0


In [54]:
# Set a single cell by position: row 0, column 1 (B).
df.iloc[0, 1] = 0

In [55]:
# The first row now has both A and B equal to 0.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.190917,0.024311,
2013-01-02,0.245452,-1.676644,-0.412797,-0.892486,1.0
2013-01-03,0.514882,0.236369,-0.256021,-0.406557,2.0
2013-01-04,-0.422225,-0.150569,0.768392,0.268021,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,0.695798,4.0
2013-01-06,-0.091894,-0.671448,0.226781,-0.992694,5.0


In [56]:
# Replace column D wholesale with an integer array of 5s.  Plain column
# assignment is used instead of `df.loc[:, 'D'] = ...` because pandas >= 2.0
# makes the .loc form write into the existing float64 column in place, which
# would leave D as 5.0 rather than the integer column shown in this
# notebook's outputs.
df['D'] = np.array([5] * len(df))

In [57]:
# Column D now holds 5 in every row.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.190917,5,
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0
2013-01-03,0.514882,0.236369,-0.256021,5,2.0
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,5,4.0
2013-01-06,-0.091894,-0.671448,0.226781,5,5.0


In [58]:
# Fresh copy of df, so the sign flip in the following cells leaves df unchanged.
df2 = df.copy()

In [59]:
# Boolean mask of the strictly positive entries; zeros and NaN come out False.
df2 > 0

Unnamed: 0,A,B,C,D,F
2013-01-01,False,False,True,True,False
2013-01-02,True,False,False,True,True
2013-01-03,True,True,False,True,True
2013-01-04,False,False,True,True,True
2013-01-05,False,False,False,True,True
2013-01-06,False,False,True,True,True


In [60]:
# Where df2 is positive, overwrite the entry with its negation.  Afterwards
# no entry is positive; the NaN in F stays NaN.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.190917,-5,
2013-01-02,-0.245452,-1.676644,-0.412797,-5,-1.0
2013-01-03,-0.514882,-0.236369,-0.256021,-5,-2.0
2013-01-04,-0.422225,-0.150569,-0.768392,-5,-3.0
2013-01-05,-0.031511,-1.239368,-0.063095,-5,-4.0
2013-01-06,-0.091894,-0.671448,-0.226781,-5,-5.0


In [61]:
# df is unaffected by the sign flip applied to the copy.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.190917,5,
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0
2013-01-03,0.514882,0.236369,-0.256021,5,2.0
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,5,4.0
2013-01-06,-0.091894,-0.671448,0.226781,5,5.0


In [62]:
# Reindex to the first four dates and append a new column E.  E has no
# source data, so it is filled with NaN.
df1 = df.reindex(index = dates[0:4], columns = list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.190917,5,,
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0,
2013-01-03,0.514882,0.236369,-0.256021,5,2.0,
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0,


In [63]:
# The label slice includes both endpoints, so E is set to 1 on the first two rows.
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.190917,5,,1.0
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0,1.0
2013-01-03,0.514882,0.236369,-0.256021,5,2.0,
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0,


In [64]:
# Drop every row that contains at least one NaN; only 2013-01-02 is complete.
# A new frame is returned; df1 itself still holds its NaNs.
df1.dropna(how = 'any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0,1.0


In [65]:
# Replace every NaN with 5 in the returned frame.
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.190917,5,5.0,1.0
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0,1.0
2013-01-03,0.514882,0.236369,-0.256021,5,2.0,5.0
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0,5.0


In [66]:
# Boolean frame marking which entries of df1 are missing.
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


In [67]:
# Current state of df, for reference alongside the statistics that follow.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.190917,5,
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0
2013-01-03,0.514882,0.236369,-0.256021,5,2.0
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,5,4.0
2013-01-06,-0.091894,-0.671448,0.226781,5,5.0


In [68]:
# Summary statistics; F's count is 5 because its NaN is excluded.
df.describe()

Unnamed: 0,A,B,C,D,F
count,6.0,6.0,6.0,6.0,5.0
mean,0.035784,-0.58361,0.075696,5.0,3.0
std,0.317924,0.752721,0.420441,0.0,1.581139
min,-0.422225,-1.676644,-0.412797,5.0,1.0
25%,-0.076798,-1.097388,-0.207789,5.0,2.0
50%,-0.015756,-0.411009,0.063911,5.0,3.0
75%,0.184089,-0.037642,0.217815,5.0,4.0
max,0.514882,0.236369,0.768392,5.0,5.0


In [69]:
# Mean of each column; the NaN in F is skipped (3.0 is the mean of 1..5).
df.mean()

A    0.035784
B   -0.583610
C    0.075696
D    5.000000
F    3.000000
dtype: float64

In [70]:
# Mean across the columns of each row (one value per date); NaN entries are skipped.
df.mean(axis=1)

2013-01-01    1.297729
2013-01-02    0.831202
2013-01-03    1.499046
2013-01-04    1.639120
2013-01-05    1.533205
2013-01-06    1.892688
Freq: D, dtype: float64

In [71]:
# Same values as the first Series, now indexed by `dates` so it can be aligned with df.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index = dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [72]:
# Shift the values two positions down the index: the first two slots become
# NaN and the last two values (6 and 8) drop off the end.
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [73]:
# Keep the shifted Series; it is the operand subtracted from df further down.
s = s.shift(2)

In [74]:
# s now holds the shifted values.
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [75]:
# Current df, for comparison with the subtraction result that follows.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.190917,5,
2013-01-02,0.245452,-1.676644,-0.412797,5,1.0
2013-01-03,0.514882,0.236369,-0.256021,5,2.0
2013-01-04,-0.422225,-0.150569,0.768392,5,3.0
2013-01-05,-0.031511,-1.239368,-0.063095,5,4.0
2013-01-06,-0.091894,-0.671448,0.226781,5,5.0


In [76]:
# Subtract s from every column, matching on the row index.  Rows where s is
# NaN come out entirely NaN.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.485118,-0.763631,-1.256021,4.0,1.0
2013-01-04,-3.422225,-3.150569,-2.231608,2.0,0.0
2013-01-05,-5.031511,-6.239368,-5.063095,0.0,-1.0
2013-01-06,,,,,


In [77]:
# Apply np.cumsum to each column, giving running totals down the rows.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.190917,5,
2013-01-02,0.245452,-1.676644,-0.22188,10,1.0
2013-01-03,0.760335,-1.440275,-0.4779,15,3.0
2013-01-04,0.33811,-1.590844,0.290491,20,6.0
2013-01-05,0.306599,-2.830213,0.227397,25,10.0
2013-01-06,0.214705,-3.501661,0.454178,30,15.0


In [78]:
# Summary statistics again; the min and max rows feed the range computed next.
df.describe()

Unnamed: 0,A,B,C,D,F
count,6.0,6.0,6.0,6.0,5.0
mean,0.035784,-0.58361,0.075696,5.0,3.0
std,0.317924,0.752721,0.420441,0.0,1.581139
min,-0.422225,-1.676644,-0.412797,5.0,1.0
25%,-0.076798,-1.097388,-0.207789,5.0,2.0
50%,-0.015756,-0.411009,0.063911,5.0,3.0
75%,0.184089,-0.037642,0.217815,5.0,4.0
max,0.514882,0.236369,0.768392,5.0,5.0


In [79]:
# Range (max - min) of each column; the lambda receives one column at a time.
df.apply(lambda x: x.max() - x.min())

A    0.937107
B    1.913012
C    1.181189
D    0.000000
F    4.000000
dtype: float64

In [80]:
# With axis=1 the lambda receives one row at a time: range across the columns for each date.
df.apply(lambda x: x.max() - x.min(), axis = 1)

2013-01-01    5.000000
2013-01-02    6.676644
2013-01-03    5.256021
2013-01-04    5.422225
2013-01-05    6.239368
2013-01-06    5.671448
Freq: D, dtype: float64

In [81]:
# The same lambda called directly on a NumPy array: 4 - 1 = 3.
(lambda x: x.max() - x.min())(np.array([1, 2, 3, 4]))

3

In [82]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    5
1    1
2    2
3    4
4    1
5    1
6    1
7    5
8    3
9    3
dtype: int64

In [83]:
# Frequency of each distinct value, most frequent first.
s.value_counts()

1    4
5    2
3    2
4    1
2    1
dtype: int64

In [84]:
# Mixed-case strings with one missing entry, used to demonstrate the .str accessor.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [85]:
# Element-wise lower-casing through the .str accessor; the NaN entry stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [86]:
# New 10x4 random frame with default integer row and column labels.
# Note that this rebinds `df`.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,2.502579,0.822139,-0.66855,-0.636759
1,0.268828,0.861344,-0.265055,3.008537
2,-0.77114,-0.838931,1.26739,-0.259614
3,1.377327,1.392848,-0.358377,-0.984756
4,-0.146133,-0.529402,1.368449,0.737435
5,-1.656238,0.753981,-0.089309,0.400222
6,-0.637497,-0.777066,-0.450825,0.231343
7,-0.087206,-0.538882,0.223912,1.448582
8,0.566359,1.729839,1.555875,0.403395
9,0.109868,-0.392983,-0.185124,0.421231


In [87]:
# Split the frame into three consecutive row blocks: rows 0-2, 3-6 and 7-9.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0  2.502579  0.822139 -0.668550 -0.636759
 1  0.268828  0.861344 -0.265055  3.008537
 2 -0.771140 -0.838931  1.267390 -0.259614,
           0         1         2         3
 3  1.377327  1.392848 -0.358377 -0.984756
 4 -0.146133 -0.529402  1.368449  0.737435
 5 -1.656238  0.753981 -0.089309  0.400222
 6 -0.637497 -0.777066 -0.450825  0.231343,
           0         1         2         3
 7 -0.087206 -0.538882  0.223912  1.448582
 8  0.566359  1.729839  1.555875  0.403395
 9  0.109868 -0.392983 -0.185124  0.421231]

In [88]:
# Concatenate the pieces row-wise, which reproduces the original frame.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,2.502579,0.822139,-0.66855,-0.636759
1,0.268828,0.861344,-0.265055,3.008537
2,-0.77114,-0.838931,1.26739,-0.259614
3,1.377327,1.392848,-0.358377,-0.984756
4,-0.146133,-0.529402,1.368449,0.737435
5,-1.656238,0.753981,-0.089309,0.400222
6,-0.637497,-0.777066,-0.450825,0.231343
7,-0.087206,-0.538882,0.223912,1.448582
8,0.566359,1.729839,1.555875,0.403395
9,0.109868,-0.392983,-0.185124,0.421231


In [96]:
# Two small frames sharing a 'key' column; the key 'foo' is duplicated on both sides.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})

In [97]:
# Inspect the left-hand frame.
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [98]:
# Inspect the right-hand frame.
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [99]:
# Join on 'key'.  Because 'foo' occurs twice on each side, the result holds
# every left/right pairing (2 x 2 = 4 rows).
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [102]:
# Same layout as the previous pair of frames, but now each key value is unique.
left = pd.DataFrame(data={'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame(data={'key': ['foo', 'bar'], 'rval': [4, 5]})

In [103]:
# Inspect the left-hand frame.
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [104]:
# Inspect the right-hand frame.
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [105]:
# With unique keys, the join produces exactly one row per key.
pd.merge(left, right, on = 'key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [106]:
# Frame for the groupby examples: two string key columns (A, B) and two
# random numeric columns (C, D).  Note that this rebinds `df`.
df = pd.DataFrame(
    {
        'A': ['foo', 'bar'] * 4,
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.986671,-0.232842
1,bar,one,0.128862,2.775429
2,foo,two,0.818932,0.680948
3,bar,three,-0.047913,-1.064371
4,foo,two,-0.873056,0.841249
5,bar,two,-1.198771,-1.576698
6,foo,one,-1.872267,0.654406
7,bar,three,0.521806,-1.057257


In [107]:
# groupby() on its own returns a DataFrameGroupBy object rather than a
# result; the aggregations are applied in the following cells.
df.groupby('A')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb341fd7b70>

In [112]:
# Sum the numeric columns within each group of A.  C and D are selected
# explicitly because B holds strings: pandas >= 2.0 no longer drops
# non-numeric columns from sum() and would concatenate B's strings instead.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.596018,-0.922898
foo,-2.913063,1.94376


In [113]:
# Grouping by two columns gives a hierarchical (A, B) row index; the
# remaining columns C and D are summed per group.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.128862,2.775429
bar,three,0.473892,-2.121628
bar,two,-1.198771,-1.576698
foo,one,-2.858939,0.421563
foo,two,-0.054124,1.522197


In [114]:
# (first, second) label pairs for a two-level index: each outer label is
# paired with 'one' and then 'two'.
tuples = [
    (outer, inner)
    for outer in ['bar', 'baz', 'foo', 'qux']
    for inner in ['one', 'two']
]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [115]:
# Build a two-level MultiIndex from the label pairs, naming the levels 'first' and 'second'.
index = pd.MultiIndex.from_tuples(tuples, names = ['first', 'second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [116]:
# 8x2 random frame whose rows are labelled by the two-level `index`.
# Note that this rebinds `df`.
df = pd.DataFrame(np.random.randn(8, 2), index = index, columns = ['A', 'B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.708979,0.935331
bar,two,-0.417548,0.180623
baz,one,0.072346,1.221638
baz,two,2.889435,0.33096
foo,one,1.681061,-2.465583
foo,two,0.420037,-0.49245
qux,one,-0.331621,2.185218
qux,two,1.940583,0.03086


In [118]:
# First four rows by position: the 'bar' and 'baz' groups.
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.708979,0.935331
bar,two,-0.417548,0.180623
baz,one,0.072346,1.221638
baz,two,2.889435,0.33096


In [119]:
# stack() moves the column labels (A, B) into a new innermost index level,
# turning the frame into a Series with a three-level index.
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.708979
               B    0.935331
       two     A   -0.417548
               B    0.180623
baz    one     A    0.072346
               B    1.221638
       two     A    2.889435
               B    0.330960
dtype: float64