# 10 Minutes to Pandas

*Working through some of the 10 minutes to pandas examples at:*
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [36]:
import pandas as pd
import numpy as np
from inspect import signature

# [Object Creation](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dsintro)

`pandas.DataFrame(data, index, columns, dtype, copy)`

In [3]:
# Build a small frame from a dict of equal-length lists; each key becomes a column.
pd.DataFrame(
    {
        "Col1": ["A", "B", "C"],
        "Col2": [1, 2, 3],
        "Col3": [2, 4, 6],
        "Col4": [True, False, True],
    }
)


Unnamed: 0,Col1,Col2,Col3,Col4
0,A,1,2,True
1,B,2,4,False
2,C,3,6,True


In [4]:
# Six consecutive daily timestamps starting 2013-01-01; used below as a row index.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# NOTE(review): no seed is set in the visible cells, so a fresh run will
# print different numbers than the outputs saved in this notebook.
df = pd.DataFrame(
    data=np.random.randn(len(dates), 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.089477,-1.560332,-0.611101,-0.058308
2013-01-02,0.816867,-0.312017,1.092017,0.122517
2013-01-03,0.014576,1.437115,-0.153668,-0.875574
2013-01-04,-0.750896,1.233726,-0.590551,-0.084992
2013-01-05,0.87132,-0.756557,0.823448,0.201562
2013-01-06,1.532544,-1.254144,-0.992413,-0.066592


In [6]:
# Frame built from a dict of mixed inputs: the scalar entries (A, B, F) are
# repeated for every row, and each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
 df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# [Viewing Data](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics)

In [8]:
# Row labels first, then column labels, one per line.
print(df.index, df.columns, sep="\n")

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [9]:
# Convert the frame to a plain NumPy array; index and column labels are dropped.
df.to_numpy()

array([[-0.08947747, -1.56033199, -0.61110124, -0.05830802],
       [ 0.81686728, -0.3120166 ,  1.09201705,  0.12251738],
       [ 0.01457551,  1.43711484, -0.15366842, -0.87557427],
       [-0.75089562,  1.23372566, -0.59055138, -0.08499168],
       [ 0.8713196 , -0.75655716,  0.82344756,  0.2015623 ],
       [ 1.5325444 , -1.25414388, -0.99241307, -0.06659157]])

In [10]:
# df2 mixes several column dtypes, so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [11]:
# Quick summary per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.399156,-0.202035,-0.072045,-0.126898
std,0.823562,1.266469,0.844999,0.384892
min,-0.750896,-1.560332,-0.992413,-0.875574
25%,-0.063464,-1.129747,-0.605964,-0.080392
50%,0.415721,-0.534287,-0.37211,-0.06245
75%,0.857707,0.84729,0.579169,0.077311
max,1.532544,1.437115,1.092017,0.201562


In [12]:
# Transpose: the date index becomes the columns and A-D become the rows.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.089477,0.816867,0.014576,-0.750896,0.87132,1.532544
B,-1.560332,-0.312017,1.437115,1.233726,-0.756557,-1.254144
C,-0.611101,1.092017,-0.153668,-0.590551,0.823448,-0.992413
D,-0.058308,0.122517,-0.875574,-0.084992,0.201562,-0.066592


In [13]:
# Sort along axis 1, i.e. order the column labels, descending (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.058308,-0.611101,-1.560332,-0.089477
2013-01-02,0.122517,1.092017,-0.312017,0.816867
2013-01-03,-0.875574,-0.153668,1.437115,0.014576
2013-01-04,-0.084992,-0.590551,1.233726,-0.750896
2013-01-05,0.201562,0.823448,-0.756557,0.87132
2013-01-06,-0.066592,-0.992413,-1.254144,1.532544


In [14]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-01,-0.089477,-1.560332,-0.611101,-0.058308
2013-01-06,1.532544,-1.254144,-0.992413,-0.066592
2013-01-05,0.87132,-0.756557,0.823448,0.201562
2013-01-02,0.816867,-0.312017,1.092017,0.122517
2013-01-04,-0.750896,1.233726,-0.590551,-0.084992
2013-01-03,0.014576,1.437115,-0.153668,-0.875574


# Selection

> While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods: `.at`, `.iat`, `.loc` and `.iloc`.


See the indexing documentation [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing) and [MultiIndex / Advanced Indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced).

## Getting

In [15]:
# Selecting a single column by its label yields a Series.
df["A"]

2013-01-01   -0.089477
2013-01-02    0.816867
2013-01-03    0.014576
2013-01-04   -0.750896
2013-01-05    0.871320
2013-01-06    1.532544
Freq: D, Name: A, dtype: float64

In [16]:
# An integer slice inside [] selects rows by position (here the first three).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.089477,-1.560332,-0.611101,-0.058308
2013-01-02,0.816867,-0.312017,1.092017,0.122517
2013-01-03,0.014576,1.437115,-0.153668,-0.875574


In [17]:
# A slice of date strings selects rows by label; both endpoints are included.
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.816867,-0.312017,1.092017,0.122517
2013-01-03,0.014576,1.437115,-0.153668,-0.875574
2013-01-04,-0.750896,1.233726,-0.590551,-0.084992


## [Selection by label](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-label)

*df.loc[row_indexer,column_indexer]*

For getting a cross section using a label:

In [18]:
# Show the label being looked up, then fetch that row as a Series.
print(dates[0])

df.loc[dates[0]]

2013-01-01 00:00:00


A   -0.089477
B   -1.560332
C   -0.611101
D   -0.058308
Name: 2013-01-01 00:00:00, dtype: float64

Selecting on multiple axes by label:

In [19]:
# All rows (:), restricted to the columns labelled A and B.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.089477,-1.560332
2013-01-02,0.816867,-0.312017
2013-01-03,0.014576,1.437115
2013-01-04,-0.750896,1.233726
2013-01-05,0.87132,-0.756557
2013-01-06,1.532544,-1.254144


## [Selection by Position](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-integer)

*df.iloc[row_indexer, column_indexer]*

In [20]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A   -0.750896
B    1.233726
C   -0.590551
D   -0.084992
Name: 2013-01-04 00:00:00, dtype: float64

Selecting by integer slices, which behave like NumPy/Python slices (lower bound included, upper bound excluded):

In [21]:
# Rows at positions 3-4 and columns at positions 0-1 (upper bounds excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.750896,1.233726
2013-01-05,0.87132,-0.756557


## Boolean indexing

In [22]:
# Keep only the rows where column A is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.816867,-0.312017,1.092017,0.122517
2013-01-03,0.014576,1.437115,-0.153668,-0.875574
2013-01-05,0.87132,-0.756557,0.823448,0.201562
2013-01-06,1.532544,-1.254144,-0.992413,-0.066592


Selecting values from a DataFrame where a boolean condition is met.

In [23]:
# Element-wise mask: the shape is kept and entries failing the condition become missing.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,0.816867,,1.092017,0.122517
2013-01-03,0.014576,1.437115,,
2013-01-04,,1.233726,,
2013-01-05,0.87132,,0.823448,0.201562
2013-01-06,1.532544,,,


Using the isin() method for filtering:

In [24]:
# NOTE: this rebinds df2; the mixed-dtype frame built earlier under the same
# name is replaced by a copy of df.
df2 = df.copy()
# Add a string column E to filter on in the following cells.
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.089477,-1.560332,-0.611101,-0.058308,one
2013-01-02,0.816867,-0.312017,1.092017,0.122517,one
2013-01-03,0.014576,1.437115,-0.153668,-0.875574,two
2013-01-04,-0.750896,1.233726,-0.590551,-0.084992,three
2013-01-05,0.87132,-0.756557,0.823448,0.201562,four
2013-01-06,1.532544,-1.254144,-0.992413,-0.066592,three


In a list:

In [25]:
# Rows whose E value is one of the listed strings.
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.014576,1.437115,-0.153668,-0.875574,two
2013-01-05,0.87132,-0.756557,0.823448,0.201562,four


Or a series:

In [26]:
# Same filter as the list version, with the allowed values held in a Series.
s1 = pd.Series(["two", "four"])
df2[df2["E"].isin(s1)]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.014576,1.437115,-0.153668,-0.875574,two
2013-01-05,0.87132,-0.756557,0.823448,0.201562,four


# Setting

Setting a new column automatically aligns the data by the indexes.

In [27]:
# s1 is indexed 2013-01-02 .. 2013-01-07, i.e. one day later than df's index.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)
print(s1)
# Assignment aligns on the index: 2013-01-01 has no match and becomes NaN,
# while the 2013-01-07 value has no row in df and is left out.
# NOTE: this adds column F to df in place, so earlier displays of df are stale.
df["F"] = s1
df

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64


Unnamed: 0,A,B,C,D,F
2013-01-01,-0.089477,-1.560332,-0.611101,-0.058308,
2013-01-02,0.816867,-0.312017,1.092017,0.122517,1.0
2013-01-03,0.014576,1.437115,-0.153668,-0.875574,2.0
2013-01-04,-0.750896,1.233726,-0.590551,-0.084992,3.0
2013-01-05,0.87132,-0.756557,0.823448,0.201562,4.0
2013-01-06,1.532544,-1.254144,-0.992413,-0.066592,5.0


Setting values by label:

In [28]:
# Scalar assignment addressed by row label and column label.
df.at[dates[0],"A"] = 0

Setting values by position:

In [29]:
# Scalar assignment addressed by integer position: row 0, column 1 (column B).
df.iat[0, 1] = 0

Setting by assigning with a NumPy array:

In [30]:
# Overwrite the whole D column with an array of 5s sized to match df.
# NOTE(review): the saved result shows D as integer 5; whether a full-column
# .loc assignment replaces the float column or writes into it may depend on
# the pandas version in use -- confirm when re-running.
df.loc[:, "D"] = np.full(len(df), 5)

Results:

In [31]:
# Display df after the three assignments made in the preceding cells.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.611101,5,
2013-01-02,0.816867,-0.312017,1.092017,5,1.0
2013-01-03,0.014576,1.437115,-0.153668,5,2.0
2013-01-04,-0.750896,1.233726,-0.590551,5,3.0
2013-01-05,0.87132,-0.756557,0.823448,5,4.0
2013-01-06,1.532544,-1.254144,-0.992413,5,5.0


A `where` operation with setting.

In [32]:
# Work on a copy so df itself is left untouched (this rebinds df2 again).
df2 = df.copy()
# Wherever a value is positive, replace it with its negation; zero, negative
# and missing entries are not selected by the mask and stay as they are.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.611101,-5,
2013-01-02,-0.816867,-0.312017,-1.092017,-5,-1.0
2013-01-03,-0.014576,-1.437115,-0.153668,-5,-2.0
2013-01-04,-0.750896,-1.233726,-0.590551,-5,-3.0
2013-01-05,-0.87132,-0.756557,-0.823448,-5,-4.0
2013-01-06,-1.532544,-1.254144,-0.992413,-5,-5.0


# [Missing data](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#missing-data)

pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations.

**Reindexing** allows you to change/add/delete the index on a specified axis. This returns a copy of the data.

In [39]:
# Reindex to the first four dates and append a new, empty column E.
# reindex returns a new frame, so df is not modified.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
# Label-based slicing includes both endpoints, so the first two rows are set.
df1.loc[dates[0]:dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.611101,5,,1.0
2013-01-02,0.816867,-0.312017,1.092017,5,1.0,1.0
2013-01-03,0.014576,1.437115,-0.153668,5,2.0,
2013-01-04,-0.750896,1.233726,-0.590551,5,3.0,


In [40]:
# Drop every row that contains at least one missing value.
# The result is a new frame; df1 itself keeps all four rows.
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.816867,-0.312017,1.092017,5,1.0,1.0


In [41]:
# Fill every missing value with 5 (returns a new frame; df1 is unchanged).
df1.fillna(value=5)


Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.611101,5,5.0,1.0
2013-01-02,0.816867,-0.312017,1.092017,5,1.0,1.0
2013-01-03,0.014576,1.437115,-0.153668,5,2.0,5.0
2013-01-04,-0.750896,1.233726,-0.590551,5,3.0,5.0


In [45]:
# Boolean mask that is True wherever df1 holds a missing value.
df1.isna() # equivalently: pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# [Operations](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-binop)

## Stats

In [46]:
# Descriptive statistic: the mean of each column.
# The missing entry in F is skipped, so F averages its five present values.
df.mean()

A    0.414069
B    0.058020
C   -0.072045
D    5.000000
F    3.000000
dtype: float64

In [47]:
# Same statistic along the other axis: one mean per row.
df.mean(axis=1)

2013-01-01    1.097225
2013-01-02    1.519374
2013-01-03    1.659604
2013-01-04    1.578456
2013-01-05    1.987642
2013-01-06    1.857197
Freq: D, dtype: float64

In [48]:
# Re-display the date index; the next cell uses it as the index of a Series.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [53]:
# Operating on objects of different dimensionality requires alignment;
# pandas also broadcasts automatically along the specified dimension.
# shift(2) moves the values down two rows: the first two slots become NaN
# and the last two values (6 and 8) fall off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [54]:
# Subtract the Series s from every column of df, matching values on the row
# index (loosely comparable to joining two SQL tables on a key and subtracting
# one from the other). Rows where s is NaN come out entirely NaN.
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.985424,0.437115,-1.153668,4.0,1.0
2013-01-04,-3.750896,-1.766274,-3.590551,2.0,0.0
2013-01-05,-4.12868,-5.756557,-4.176552,0.0,-1.0
2013-01-06,,,,,


## Apply