# Pandas

* [Pandas Docs](https://pandas.pydata.org)
* [Getting Started Tutorials](https://pandas.pydata.org/docs/getting_started/intro_tutorials)
* The community-agreed alias for ```pandas``` is ```pd```
* Import the Pandas package: ```import pandas as pd```
* The ```DataFrame``` represents a 2D table data structure
* Each column in a ```DataFrame``` is a ```Series```
* The ```Series``` represents a 1D ```ndarray``` with axis labels 

## DataFrames: Getting Started

In [1]:
import pandas as pd

In [2]:
# Three sample passengers; every dictionary key below becomes one column.
passenger_names = [
    "Braund, Mr. Owen Harris",
    "Allen, Mr. William Henry",
    "Bonnell, Miss. Elizabeth",
]
passenger_ages = [22, 35, 58]
passenger_sexes = ["male", "male", "female"]

df = pd.DataFrame(
    {"Name": passenger_names, "Age": passenger_ages, "Sex": passenger_sexes}
)
df

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22,male
1,"Allen, Mr. William Henry",35,male
2,"Bonnell, Miss. Elizabeth",58,female


In [3]:
# Selecting a single column by its label returns that column as a Series.
df["Age"]

0    22
1    35
2    58
Name: Age, dtype: int64

In [4]:
# Largest value found in the Age column.
df["Age"].max()

58

In [5]:
# Product catalogue: one row per product, identified by 'Product ID'.
products_df = pd.DataFrame({
    'Product ID': [4109, 1412, 8931],
    'Price': [5.0, 0.5, 1.5],
    'Product': ['Sushi Roll', 'Egg', 'Bagel'],
})
# Invoice lines: one row per purchase; the same product may appear several times.
invoices_df = pd.DataFrame({
    'Customer': ['Ali', 'Eric', 'Ande', 'Sam'],
    'Product ID': [4109, 1412, 8931, 4109],
    'Quantity': [1, 12, 6, 2],
})
print(products_df)
print(invoices_df)

# Name the join key explicitly. Without `on`, merge joins on every column name
# the two frames happen to share, so adding another shared column later would
# silently change the result. `validate` makes pandas raise if a product ID is
# ever duplicated in the catalogue (which would multiply invoice rows).
merged_df = pd.merge(
    products_df,
    invoices_df,
    how='inner',
    on='Product ID',
    validate='one_to_many',
)
print(merged_df)

   Product ID  Price     Product
0        4109    5.0  Sushi Roll
1        1412    0.5         Egg
2        8931    1.5       Bagel
  Customer  Product ID  Quantity
0      Ali        4109         1
1     Eric        1412        12
2     Ande        8931         6
3      Sam        4109         2
   Product ID  Price     Product Customer  Quantity
0        4109    5.0  Sushi Roll      Ali         1
1        4109    5.0  Sushi Roll      Sam         2
2        1412    0.5         Egg     Eric        12
3        8931    1.5       Bagel     Ande         6


In [6]:
# Wide layout: one row per division and one sales column per year.
year_columns = ['2017', '2018', '2019', '2020']
salesDF = pd.DataFrame(
    {
        "division": ["Consulting", "Hardware", "Software"],
        "2017": [57, 68, 43],
        "2018": [78, 90, 14],
        "2019": [52, 63, 44],
        "2020": [32, 99, 65],
    }
)
print(salesDF)
print()

# Long layout: every (division, year) pair becomes its own row, with the
# former column header stored under 'year' and the cell value under 'sales'.
salesDF = salesDF.melt(
    id_vars='division',
    value_vars=year_columns,
    var_name='year',
    value_name='sales',
)
print(salesDF)

     division  2017  2018  2019  2020
0  Consulting    57    78    52    32
1    Hardware    68    90    63    99
2    Software    43    14    44    65

      division  year  sales
0   Consulting  2017     57
1     Hardware  2017     68
2     Software  2017     43
3   Consulting  2018     78
4     Hardware  2018     90
5     Software  2018     14
6   Consulting  2019     52
7     Hardware  2019     63
8     Software  2019     44
9   Consulting  2020     32
10    Hardware  2020     99
11    Software  2020     65


In [7]:
# Lookup table from department name to numeric code. It carries an entry for
# 'MATH' even though no row of the frame below uses that department.
department_mapping = {'BIO': 10, 'CHEM': 20, 'PHYS': 30, 'MATH': 40}

df = pd.DataFrame({'department': ['BIO', 'CHEM', 'PHYS']})
print(df)
print()

# Translate each department name through the dictionary and store the codes
# next to the names.
department_codes = df['department'].map(department_mapping)
df['department_code'] = department_codes
print(df)

  department
0        BIO
1       CHEM
2       PHYS

  department  department_code
0        BIO               10
1       CHEM               20
2       PHYS               30


## DataFrames and the NumPy Library

In [8]:
import numpy as np

In [9]:
# Four groups of four random numbers. Seeding without a value (None) gives the
# first two groups different numbers, while re-seeding with 0 restarts the same
# sequence, so the last two groups are identical.
for group_index, seed_value in enumerate([None, None, 0, 0]):
    if group_index > 0:
        print()  # blank line between consecutive groups
    np.random.seed(seed_value)
    for _ in range(4):
        print(np.random.rand())

0.5417788758509325
0.30534642476967844
0.9202000683091264
0.7742293613276636

0.07858643507372243
0.16718287357363026
0.728712547709525
0.6868621543289885

0.5488135039273248
0.7151893663724195
0.6027633760716439
0.5448831829968969

0.5488135039273248
0.7151893663724195
0.6027633760716439
0.5448831829968969


In [10]:
# Ten pseudo-random sales figures in the range 0..99; the fixed seed makes the
# values identical on every run.
np.random.seed(10)
uniform_samples = np.random.rand(10)
sales = (100 * uniform_samples).astype(int)
sales

array([77,  2, 63, 74, 49, 22, 19, 76, 16,  8])

In [11]:
# One location label per sales figure (ten of each).
location = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# An ndarray has a single dtype, so stacking integers and strings together
# turns the sales figures into text (note the '<U11' dtype in the output).
# .T flips the 2 x 10 array into ten (sales, location) rows.
data = np.array([sales, location]).T
data

array([['77', 'A'],
       ['2', 'B'],
       ['63', 'C'],
       ['74', 'D'],
       ['49', 'E'],
       ['22', 'F'],
       ['19', 'G'],
       ['76', 'H'],
       ['16', 'I'],
       ['8', 'J']], dtype='<U11')

In [12]:
# Wrap the ten-row, two-column array in a DataFrame and name its columns.
df = pd.DataFrame(data, columns = ["Sales", "Location"])
df

Unnamed: 0,Sales,Location
0,77,A
1,2,B
2,63,C
3,74,D
4,49,E
5,22,F
6,19,G
7,76,H
8,16,I
9,8,J


In [13]:
# Both columns report 'object' because the source array held only strings.
df.dtypes

Sales       object
Location    object
dtype: object

In [14]:
# Turn the text sales figures back into integers so that numeric methods
# (describe, min, sorting by value) work on the column.
df["Sales"] = df["Sales"].astype(int)
df.dtypes

Sales        int32
Location    object
dtype: object

In [15]:
# Summary statistics; only the numeric Sales column shows up in the result.
df.describe()

Unnamed: 0,Sales
count,10.0
mean,40.6
std,30.251538
min,2.0
25%,16.75
50%,35.5
75%,71.25
max,77.0


In [16]:
# A column can also be read with attribute (dot) access.
df.Location

0    A
1    B
2    C
3    D
4    E
5    F
6    G
7    H
8    I
9    J
Name: Location, dtype: object

In [17]:
# Dot access again; Sales now reports an integer dtype after the conversion.
df.Sales

0    77
1     2
2    63
3    74
4    49
5    22
6    19
7    76
8    16
9     8
Name: Sales, dtype: int32

In [18]:
# .loc selects by label: ':' keeps every row, 'Sales' picks the column.
df.loc[:, 'Sales']

0    77
1     2
2    63
3    74
4    49
5    22
6    19
7    76
8    16
9     8
Name: Sales, dtype: int32

In [19]:
# Label slices include the end label, so rows 0, 1 and 2 are all returned.
df.loc[:2, 'Sales']

0    77
1     2
2    63
Name: Sales, dtype: int32

In [20]:
# Rows labelled 1 through 2, both ends included.
df.loc[1:2, 'Sales']

1     2
2    63
Name: Sales, dtype: int32

In [21]:
# .iloc selects by position: -1 is the last row, ':' keeps all columns.
df.iloc[-1, :]

Sales       8
Location    J
Name: 9, dtype: object

In [22]:
# Smallest sales figure.
df.Sales.min()

2

In [23]:
df

Unnamed: 0,Sales,Location
0,77,A
1,2,B
2,63,C
3,74,D
4,49,E
5,22,F
6,19,G
7,76,H
8,16,I
9,8,J


In [24]:
# sort_values returns a new, sorted object and leaves df as it was.
# Despite its name, df_sorted holds the Sales column (a Series), not a DataFrame.
df_sorted = df.Sales.sort_values(ascending=False)

In [25]:
df_sorted

0    77
7    76
3    74
2    63
4    49
5    22
6    19
8    16
9     8
1     2
Name: Sales, dtype: int32

In [26]:
df

Unnamed: 0,Sales,Location
0,77,A
1,2,B
2,63,C
3,74,D
4,49,E
5,22,F
6,19,G
7,76,H
8,16,I
9,8,J


In [27]:
# Add a Month column; the ten Series values line up with df's rows 0-9.
df['Month'] = pd.Series(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct'])
df

Unnamed: 0,Sales,Location,Month
0,77,A,Jan
1,2,B,Feb
2,63,C,Mar
3,74,D,Apr
4,49,E,May
5,22,F,Jun
6,19,G,Jul
7,76,H,Aug
8,16,I,Sep
9,8,J,Oct


In [28]:
# del removes the column from df itself (in place), undoing the previous cell.
del df['Month']
df

Unnamed: 0,Sales,Location
0,77,A
1,2,B
2,63,C
3,74,D
4,49,E
5,22,F
6,19,G
7,76,H
8,16,I
9,8,J


## DataFrame: Missing Values 

In [29]:
from numpy import nan

# Four rows by three columns; nan marks a value that is missing.
dataSet = pd.DataFrame(
    [
        [10, 20, nan],
        [nan, 30, nan],
        [40, nan, 50],
        [60, 70, 80],
    ]
)
dataSet

Unnamed: 0,0,1,2
0,10.0,20.0,
1,,30.0,
2,40.0,,50.0
3,60.0,70.0,80.0


In [30]:
dataSet.shape

(4, 3)

In [31]:
# Boolean mask of the same shape: True wherever a value is missing.
dataSet.isnull()

Unnamed: 0,0,1,2
0,False,False,True
1,True,False,True
2,False,True,False
3,False,False,False


In [32]:
# Summing the mask down each column (axis=0) counts that column's missing values.
dataSet.isnull().sum(axis=0)

0    1
1    1
2    2
dtype: int64

In [33]:
# Share of missing values per column: the mean of a boolean mask is the
# fraction of its entries that are True.
dataSet.isnull().mean(axis=0)

0    0.25
1    0.25
2    0.50
dtype: float64

In [34]:
# Leave out column 2, where half of the values are missing. The result is a
# new frame with the two remaining columns.
dataSet2 = dataSet.drop(columns=[2])
dataSet2.shape

(4, 2)

In [35]:
dataSet2

Unnamed: 0,0,1
0,10.0,20.0
1,,30.0
2,40.0,
3,60.0,70.0


In [36]:
# Fill the gap in column 0 with the value of the nearest valid neighbour.
# The result is assigned back to the column rather than using inplace=True on
# dataSet2[0]: that expression is a chained selection, so an in-place fill may
# land on a temporary object. pandas 2.x warns about this pattern, and with
# Copy-on-Write enabled dataSet2 would be left unchanged.
# NOTE: method='nearest' is handled by SciPy, which therefore must be installed.
dataSet2[0] = dataSet2[0].interpolate(method='nearest')
dataSet2

Unnamed: 0,0,1
0,10.0,20.0
1,10.0,30.0
2,40.0,
3,60.0,70.0


In [37]:
# Replace the remaining gap in column 1 with the mean of the values that are
# present. As with the interpolation cell, the filled column is assigned back
# explicitly: calling fillna(..., inplace=True) on dataSet2[1] acts on a
# chained selection and does not reliably update dataSet2 (pandas 2.x warns,
# and Copy-on-Write leaves the frame unchanged).
dataSet2[1] = dataSet2[1].fillna(dataSet2[1].mean())
dataSet2

Unnamed: 0,0,1
0,10.0,20.0
1,10.0,30.0
2,40.0,40.0
3,60.0,70.0


In [38]:
# Check: no True should be left once both columns have been filled.
dataSet2.isnull()

Unnamed: 0,0,1
0,False,False
1,False,False
2,False,False
3,False,False


## DataFrame: Normalizing Values

In [39]:
# A 10 x 3 matrix of whole numbers (stored as floats) in 1..100; the fixed
# seed keeps the matrix identical across runs.
np.random.seed(42)
uniform_draws = np.random.rand(10, 3)
unscaled_ndarray = np.ceil(100 * uniform_draws)
unscaled_ndarray

array([[38., 96., 74.],
       [60., 16., 16.],
       [ 6., 87., 61.],
       [71.,  3., 97.],
       [84., 22., 19.],
       [19., 31., 53.],
       [44., 30., 62.],
       [14., 30., 37.],
       [46., 79., 20.],
       [52., 60.,  5.]])

In [40]:
# Give the three columns very different magnitudes (x1, x10, x100) so that
# their means and standard deviations differ widely (see the next two cells).
df = pd.DataFrame(unscaled_ndarray)
df[1] = df[1] * 10
df[2] = df[2] * 100
df

Unnamed: 0,0,1,2
0,38.0,960.0,7400.0
1,60.0,160.0,1600.0
2,6.0,870.0,6100.0
3,71.0,30.0,9700.0
4,84.0,220.0,1900.0
5,19.0,310.0,5300.0
6,44.0,300.0,6200.0
7,14.0,300.0,3700.0
8,46.0,790.0,2000.0
9,52.0,600.0,500.0


In [41]:
# Mean of each column.
df.mean()

0      43.4
1     454.0
2    4440.0
dtype: float64

In [42]:
# Standard deviation of each column.
df.std()

0      25.065248
1     325.310108
2    2973.288490
dtype: float64

In [43]:
from sklearn import preprocessing

In [44]:
# Standardise every column; the next two cells check the resulting column
# means and standard deviations.
# NOTE(review): the input here is unscaled_ndarray, not the rescaled df built
# above - confirm that this is intended.
scaled_ndarray = preprocessing.scale(unscaled_ndarray)
scaled_ndarray

array([[-0.2270913 ,  1.63957657,  1.04938151],
       [ 0.69809548, -0.95263935, -1.00683901],
       [-1.57281752,  1.34795228,  0.58850449],
       [ 1.16068887, -1.37387444,  1.8647793 ],
       [ 1.70739014, -0.75822316, -0.90048278],
       [-1.02611624, -0.46659887,  0.30488787],
       [ 0.02523237, -0.49900156,  0.62395657],
       [-1.23638597, -0.49900156, -0.26234538],
       [ 0.10934026,  1.08873069, -0.8650307 ],
       [ 0.36166392,  0.47307941, -1.39681187]])

In [45]:
# Column means after scaling: effectively zero (only floating-point noise is left).
np.mean(scaled_ndarray, axis=0)

array([ 3.88578059e-17, -3.88578059e-17, -2.22044605e-17])

In [46]:
# Column standard deviations after scaling: each one comes out as 1.
np.std(scaled_ndarray, axis=0)

array([1., 1., 1.])

In [47]:
# Min-max scaling: in the result every column's smallest value is 0 and its
# largest value is 1.
preprocessing.MinMaxScaler().fit_transform(unscaled_ndarray)

array([[0.41025641, 1.        , 0.75      ],
       [0.69230769, 0.13978495, 0.11956522],
       [0.        , 0.90322581, 0.60869565],
       [0.83333333, 0.        , 1.        ],
       [1.        , 0.20430108, 0.15217391],
       [0.16666667, 0.30107527, 0.52173913],
       [0.48717949, 0.29032258, 0.61956522],
       [0.1025641 , 0.29032258, 0.34782609],
       [0.51282051, 0.8172043 , 0.16304348],
       [0.58974359, 0.61290323, 0.        ]])

## Data Grouping and Aggregation

* Pivot tables and cross-tabulation
* Split, apply, combine

In [48]:
# Seven rows: keys A, B and C occur twice each, D once; data counts from 0 to 6.
group_keys = list('ABCABCD')
df = pd.DataFrame(
    {'key': group_keys, 'data': range(7)},
    columns=['key', 'data'],
)
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5
6,D,6


In [49]:
# groupby alone only returns a GroupBy object; nothing is aggregated until a
# method such as count() or sum() is called on it.
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000255744CD880>

In [50]:
# Number of rows in each key group.
df.groupby('key').count()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,2
B,2
C,2
D,1


In [51]:
# Sum of data within each key group.
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7
D,6


In [52]:
# Mean of data within each key group.
df.groupby('key').mean()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5
D,6.0


In [53]:
# Smallest data value within each key group.
df.groupby('key').min()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,0
B,1
C,2
D,6


In [54]:
# Largest data value within each key group.
df.groupby('key').max()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,4
C,5
D,6


In [55]:
# Median of data within each key group.
df.groupby('key').median()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5
D,6.0


In [56]:
import pandas as pd
import datetime

# One (name, date of birth) pair per person; the dates are kept as
# day-month-year text.
people = [
    ("Sally", "03-09-1955"),
    ("John", "12-11-1996"),
    ("Amanda", "07-07-1977"),
]
df = pd.DataFrame(people, columns=["name", "dob"])

def calculate_age(born, today=None):
    """Return the age in completed years for a date of birth given as text.

    Parameters
    ----------
    born : str
        Date of birth written as ``DD-MM-YYYY``, e.g. ``"03-09-1955"``.
    today : datetime.date or datetime.datetime, optional
        Date the age is measured against. Defaults to the current local date.
        Passing a fixed date makes the result reproducible, which is useful
        when re-running the notebook or testing the function.

    Returns
    -------
    int
        Number of birthdays that have occurred up to and including ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the ``%d-%m-%Y`` format.
    """
    birth_date = datetime.datetime.strptime(born, "%d-%m-%Y").date()
    if today is None:
        today = datetime.date.today()
    # Comparing (month, day) tuples tells whether this year's birthday has
    # already happened; if it has not, one year is taken off the difference.
    birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
    return today.year - birth_date.year - birthday_pending

# Run calculate_age on every dob string and keep the results in a new age column.
df['age'] = df['dob'].apply(calculate_age)
print(df)

     name         dob  age
0   Sally  03-09-1955   65
1    John  12-11-1996   24
2  Amanda  07-07-1977   43
