# Part 3 - Numpy and Pandas


## Numpy 
Numpy is Numerical Python and the most basic and powerful computational package in Python. <br>
In order to use numpy functionality, one needs to import Numpy module.

In [1]:
## import numpy 
import numpy as np

### How to create Numpy Array
Array is a fast, flexible container for large data sets in Python.  <br>
A NumPy array is also referred to as an ndarray (N-dimensional array). 

In [2]:
## One-dimensional array built from a flat Python list
numpy1 = np.array([1, 2, 3, 4])
## Show the values, the Python type and the shape, one per line
for detail in (numpy1, type(numpy1), numpy1.shape):
    print(detail)

[1 2 3 4]
<class 'numpy.ndarray'>
(4,)


In [3]:
## Two-dimensional array: each inner list becomes one row
upper_values, lower_values = [1, 2, 3, 4], [5, 6, 7, 8]
numpy2 = np.array([upper_values, lower_values])
## Values, Python type and shape, each on its own line
print(numpy2, type(numpy2), numpy2.shape, sep="\n")

[[1 2 3 4]
 [5 6 7 8]]
<class 'numpy.ndarray'>
(2, 4)


In [4]:
## Three-dimensional array: a 2 by 2 by 2 block with every element set to zero
cube_shape = (2, 2, 2)
numpy3 = np.zeros(cube_shape)
print(numpy3)
print(type(numpy3))
print(numpy3.shape)

[[[0. 0.]
  [0. 0.]]

 [[0. 0.]
  [0. 0.]]]
<class 'numpy.ndarray'>
(2, 2, 2)


In [5]:
## The dtype argument fixes the element type when the array is created
numpy4 = np.array([1, 2, 3, 4], dtype=np.float64)  ## floating type
numpy5 = np.array([1, 2, 3, 4], dtype=np.int16)    ## integer type

## Print the same four facts for each array, the float array first
for typed_array in (numpy4, numpy5):
    print(typed_array)
    print(type(typed_array))
    print(typed_array.shape)
    print(typed_array.dtype)

[1. 2. 3. 4.]
<class 'numpy.ndarray'>
(4,)
float64
[1 2 3 4]
<class 'numpy.ndarray'>
(4,)
int16


### Numpy Array Indexing and Slicing
![image.png](attachment:image.png)

In [6]:
## Indexing and slicing of a one-dimensional array
numpy6 = np.arange(10, 20)  ## the integers 10, 11, ..., 19
print(numpy6)

## A single index returns one element
first_item, seventh_item = numpy6[0], numpy6[6]
print("index 0 and 6", first_item, seventh_item)

## A slice start:stop returns the elements from start up to, but not including, stop
leading_part = numpy6[:4]
middle_part = numpy6[4:8]
print("slicing up to index 4: ", leading_part)
print("slicing from index 4 to 8: ", middle_part)

[10 11 12 13 14 15 16 17 18 19]
index 0 and 6 10 16
slicing up to index 4:  [10 11 12 13]
slicing from index 4 to 8:  [14 15 16 17]


In [7]:
## Two-dimensional array: the first index picks a row, the second an element of that row
numpy7 = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])
print(numpy7)
print("Shape of array: ", numpy7.shape)
middle_row = numpy7[1]
print("index 1: ", middle_row)
print("index 1 2: ", middle_row[2])

[[1 2 3]
 [4 5 6]
 [7 8 9]]
Shape of array:  (3, 3)
index 1:  [4 5 6]
index 1 2:  6


### Reshape Numpy Array

In [8]:
## Reshaping an array using the reshape method
numpy6 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
grid_shape = (3, 3)
numpy8 = numpy6.reshape(grid_shape)  ## 9 elements laid out as 3 rows of 3
print("Original array: ", numpy6)
print("Reshaped array: \n", numpy8)
print("Shape of array: ", numpy8.shape)
print("index 1: ", numpy8[1])
last_row = numpy8[2]
print("index 2 & 1: ", last_row[1])


Original array:  [1 2 3 4 5 6 7 8 9]
Reshaped array: 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
Shape of array:  (3, 3)
index 1:  [4 5 6]
index 2 & 1:  8


In [9]:
## Transposing an array: rows become columns and columns become rows
numpy6 = np.arange(10)               ## the integers 0, 1, ..., 9
numpy9 = numpy6.reshape((5, 2))      ## reshape method: 5 rows of 2
numpy10 = numpy9.transpose()         ## same result as numpy9.T
print("Original array: ", numpy6)
for title, shown in (("Reshaped array: \n", numpy9), ("Transposed array: \n", numpy10)):
    print(title, shown)
    print("Shape of array: ", shown.shape)

Original array:  [0 1 2 3 4 5 6 7 8 9]
Reshaped array: 
 [[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
Shape of array:  (5, 2)
Transposed array: 
 [[0 2 4 6 8]
 [1 3 5 7 9]]
Shape of array:  (2, 5)


### Numpy Array Calculation

In [10]:
## Array calculation: addition
## "+" between two arrays of the same shape adds them element by element
numpy11 = np.array([1, 2, 3, 4])
numpy12 = numpy11 + numpy11
print(numpy12)


[2 4 6 8]


In [11]:
## Arrays of different shapes can't be added: numpy11 + numpy13 raises ValueError.
## The error is caught and printed, so the demonstration still shows the message
## but no longer stops a "Restart & Run All" of the notebook at this cell.
numpy13 = np.array([2, 2])
try:
    numpy14 = numpy11 + numpy13
except ValueError as err:
    print("ValueError:", err)

ValueError: operands could not be broadcast together with shapes (4,) (2,) 

In [12]:
## Array calculation: multiplication
numpy11 = np.array([1, 2, 3, 4])

## array times a scalar: every element is multiplied by the scalar
numpy15 = 2 * numpy11
print(numpy11, " is multiplied by 2, then the result: ", numpy15)

## array times an array of the same shape: element-by-element product
numpy16 = np.array([2, 2, 1, 2])
numpy17 = numpy11 * numpy16
print(numpy11, " times ", numpy16, " is ", numpy17)

## a two-element array, defined here without being used in this cell
numpy18 = np.array([1, 2])

[1 2 3 4]  is multiplied by 2, then the result:  [2 4 6 8]
[1 2 3 4]  times  [2 2 1 2]  is  [2 4 3 8]


In [13]:
## Arrays of different shapes can't be multiplied: numpy11 * numpy18 raises ValueError.
## The error is caught and printed, so the demonstration still shows the message
## but no longer stops a "Restart & Run All" of the notebook at this cell.
try:
    numpy19 = numpy11 * numpy18
except ValueError as err:
    print("ValueError:", err)

ValueError: operands could not be broadcast together with shapes (4,) (2,) 

In [14]:
## Simple mathematical functions and methods in numpy
## A seeded generator makes the random values, and therefore every statistic
## printed from numpy20, identical on each run of the notebook.
rng = np.random.default_rng(42)
numpy20 = rng.standard_normal((4, 5))  ## 4 by 5 array of draws from the standard normal distribution
print(numpy20)

print("mean values of array : ", numpy20.mean())  # mean values using mean method of numpy20 instance
print("mean values of array : ", np.mean(numpy20))  # mean values using function of np.mean
print("max & min values of array : ", numpy20.max(), numpy20.min())  # max and min values using method of numpy20 instance


[[ 1.56012456  0.66445594 -0.0248449   0.05855892  0.10616939]
 [-0.81181184  1.364618   -0.8523478   1.4549566  -0.38706748]
 [-0.09975191 -1.53585631  0.26648895  1.33806259  0.26422476]
 [-1.6611288   0.27482255 -0.74033504  0.37922043 -0.04186887]]
mean values of array :  0.0788344870412592
mean values of array :  0.0788344870412592
max & min values of array :  1.5601245571085922 -1.6611288016133627


Mathematical calculation depending on Axis. <br>
Axis = 0 runs the calculation down each column (across the rows), so a 3 by 4 array produces 4 results, one per column. <br>
Axis = 1 runs the calculation along each row (across the columns), so a 3 by 4 array produces 3 results, one per row.  <br>
![image.png](attachment:image.png)


In [None]:
## Mathematical calculation using axis
## Without an axis argument the whole array is reduced to a single value;
## axis=0 gives one result per column and axis=1 one result per row.
print(numpy20)
print("\nmax values of array on axis 0 : ", numpy20.max(axis=0))
print("max values of array on axis 1 : ", numpy20.max(axis=1))
print("\nsum values of array : ", numpy20.sum())
print("sum values of array on axis 0 : ", numpy20.sum(axis=0))
print("sum values of array on axis 1 : ", numpy20.sum(axis=1))
## mean(axis=1) is the row sum divided by the number of columns; using the method
## avoids hard-coding that count, so the line stays correct if the shape changes
print("mean values of array on axis 1 : ", numpy20.mean(axis=1))

### Recap : Numpy Array basic structure
![image.png](attachment:image.png)

## Pandas
Pandas provides the primary data structures for many analyses, together with data manipulation tools designed to make data analysis fast and easy in Python.
<br>Its structure is row and column based table structure.
<br>It provides merge and other relational operations found in relational database. 
<br>It has two data structures
- Series
- DataFrame


In [15]:
## import pandas module
from pandas import Series, DataFrame
import pandas as pd

### Series
A Series is a one-dimensional array-like object. 
<br> A series is considered as individual column of dataframe. <br>
In SAS, a series is similar to one row or one column. 

In [16]:
## A Series built from a plain list
series1 = Series([1, 2, 3, 4])
## Contents, Python type and shape, each on its own line
print(series1, type(series1), series1.shape, sep="\n")

0    1
1    2
2    3
3    4
dtype: int64
<class 'pandas.core.series.Series'>
(4,)


In [17]:
## Creating a Series from a dictionary: the keys become the index labels
raw1 = dict(subjid='01-001', siteid='01', sex='M', age=29)
series2 = Series(raw1)
print(series2)
print("Type of series : ", type(series2))
print("Shape of Series : ", series2.shape)
## One element can be read as an attribute or with a dictionary-style key
for age_value in (series2.age, series2['age']):
    print("value of age : ", age_value)

subjid    01-001
siteid        01
sex            M
age           29
dtype: object
Type of series :  <class 'pandas.core.series.Series'>
Shape of Series :  (4,)
value of age :  29
value of age :  29


A Series has two components 
- index
- values

Series is very similar to dictionary and its main difference is Series uses 'index' while Dictionary uses 'key'.

In [18]:
## Show both components, index and values, of each Series in turn
for series_name, series_obj in (("Series1", series1), ("Series2", series2)):
    print(series_name + " index: ", series_obj.index)
    print(series_name + " value: ", series_obj.values)

Series1 index:  RangeIndex(start=0, stop=4, step=1)
Series1 value:  [1 2 3 4]
Series2 index:  Index(['subjid', 'siteid', 'sex', 'age'], dtype='object')
Series2 value:  ['01-001' '01' 'M' 29]


## DataFrame
A DataFrame is a tabular, spreadsheet-like data structure whose columns can hold different value types (numeric, string, boolean, etc.).
<br>A DataFrame has row and column index.

![image.png](attachment:image.png)

### How to create DataFrame

In [19]:
## Creating a dataframe from a dictionary: each key is a column name and
## each list holds the values of that column, one entry per row
raw = {
    'subjid': ['01-001', '01-002', '01-003', '02-001'],
    'siteid': ['01', '01', '01', '02'],
    'sex': ['M', 'F', 'F', 'M'],
    'age': [29, 40, 35, 56],
}
df1 = DataFrame(data=raw)
print("DataFrame :\n", df1)
print("Data Type: ", type(df1))

DataFrame :
    subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56
Data Type:  <class 'pandas.core.frame.DataFrame'>


### Extracting column and row from DataFrame

In [20]:
## Extracting a column: attribute access and dictionary-style access both
## return the sex variable as a Series
series11, series12 = df1.sex, df1['sex']
print("Sex column from dataframe:\n", series11)
print(type(series11))
print(series12)  ## same result

Sex column from dataframe:
 0    M
1    F
2    F
3    M
Name: sex, dtype: object
<class 'pandas.core.series.Series'>
0    M
1    F
2    F
3    M
Name: sex, dtype: object


In [21]:
## Extracting a row: .loc[0] returns the row whose index label is 0
## (the first row of this dataframe) as a Series
series13 = df1.loc[0]
## The label now says "row": the original text said "column" although a row is printed
print("First row from dataframe:\n", series13)
print(type(series13))

First column from dataframe:
 subjid    01-001
siteid        01
sex            M
age           29
Name: 0, dtype: object
<class 'pandas.core.series.Series'>


### Extract the value from DataFrame

In [22]:
## Extracting a single value: .loc takes the row label first, the column label second
print(df1)
row_label, column_label = 0, 'sex'
val1 = df1.loc[row_label, column_label]
print("value of index 0 and column 'sex': ", val1)
print(type(val1))

   subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56
value of index 0 and column 'sex':  M
<class 'str'>


### Metadata for DataFrame

In [23]:
## Understanding the data: collect each description with its label, then print them in order
metadata = [
    ("Property of dataframe: \n", df1.dtypes),             ## dtype of every column
    ("Columns of dataframe: ", df1.columns),               ## column labels
    ("Index of dataframe: ", df1.index.values.tolist()),   ## row labels as a plain list
    ("Shaep of dataframe: ", df1.shape),                   ## (number of rows, number of columns)
    ("Dimension of dataframe: ", df1.ndim),                ## number of dimensions
]
for label, value in metadata:
    print(label, value)

Property of dataframe: 
 subjid    object
siteid    object
sex       object
age        int64
dtype: object
Columns of dataframe:  Index(['subjid', 'siteid', 'sex', 'age'], dtype='object')
Index of dataframe:  [0, 1, 2, 3]
Shaep of dataframe:  (4, 4)
Dimension of dataframe:  2


In [24]:
## Summary statistics of the numeric data; the last expression is displayed by the notebook
numeric_summary = df1.describe()
numeric_summary

Unnamed: 0,age
count,4.0
mean,40.0
std,11.575837
min,29.0
25%,33.5
50%,37.5
75%,44.0
max,56.0


### Data Manipulation in DataFrame

In [25]:
## Adding a variable: assigning a single value to a new column name fills every row with it
new_column, fill_value = 'country', 'USA'
df1[new_column] = fill_value
print(df1)

   subjid siteid sex  age country
0  01-001     01   M   29     USA
1  01-002     01   F   40     USA
2  01-003     01   F   35     USA
3  02-001     02   M   56     USA


In [26]:
## Sorting data using the sort_values method; df1 is replaced by each sorted result
sort_steps = [
    dict(by=['age', 'sex']),                            ## by age and sex, ascending
    dict(by=['sex', 'age'], ascending=False),           ## both keys in descending order
    dict(by=['sex', 'age'], ascending=[False, True]),   ## sex descending, age ascending
]
for sort_options in sort_steps:
    df1 = df1.sort_values(**sort_options)
    print(df1)

   subjid siteid sex  age country
0  01-001     01   M   29     USA
2  01-003     01   F   35     USA
1  01-002     01   F   40     USA
3  02-001     02   M   56     USA
   subjid siteid sex  age country
3  02-001     02   M   56     USA
0  01-001     01   M   29     USA
1  01-002     01   F   40     USA
2  01-003     01   F   35     USA
   subjid siteid sex  age country
0  01-001     01   M   29     USA
3  02-001     02   M   56     USA
2  01-003     01   F   35     USA
1  01-002     01   F   40     USA


In [27]:
## A dataframe can hold elements of different types, even within one column
df2 = DataFrame({'col1': ['a', 'b', 3, 4],
                 'col2': ['c', 'd', np.nan, True]})
print(df2)

print("\nProperty of dataframe: \n", df2.dtypes)  ## property of columns
## Cells inspected, in order: a string, an integer, np.nan (a float) and a boolean
for row_label, column_label in [(0, 'col1'), (2, 'col1'), (2, 'col2'), (3, 'col2')]:
    element = df2.loc[row_label, column_label]
    print("The type of ", element, " is " , type(element))


  col1  col2
0    a     c
1    b     d
2    3   NaN
3    4  True

Property of dataframe: 
 col1    object
col2    object
dtype: object
The type of  a  is  <class 'str'>
The type of  3  is  <class 'int'>
The type of  nan  is  <class 'float'>
The type of  True  is  <class 'bool'>


### Remove a column or row of DataFrame
![image.png](attachment:image.png)

In [28]:
## Removing columns: axis='columns' is the named form of axis=1
df3 = df2.drop(labels=['col2'], axis='columns')
print("new dataframe -col2 deleted : \n", df3)

## Removing rows (index): axis='index' is the named form of axis=0
df4 = df2.drop(labels=[2], axis='index')
print("new dataframe -index 2 deleted : \n", df4)

new dataframe -col2 deleted : 
   col1
0    a
1    b
2    3
3    4
new dataframe -index 2 deleted : 
   col1  col2
0    a     c
1    b     d
3    4  True


### Filter data 
The basic structure of dataframe by index and columns
![image.png](attachment:image.png)

Any data manipulation such as selecting and filtering will be based on index and columns. <br>
For example, filtering data from a dataframe by labels uses df.loc[index, columns].  The index (rows) goes first and the columns next, just like axis=0 and axis=1.   

In [29]:
## Filtering with .loc: the first position selects rows by index label,
## the second selects columns by name
df5 = DataFrame(raw)
print("Original dataframe: \n", df5)

## filtering by index - row
df6 = df5.loc[[1,3]] ## index 1 and 3
print("\nNew dataframe of index 1 and 3: \n",df6)

## filtering by columns
## (the printed labels now name siteid, the column actually selected, instead of diabp)
df7 = df5.loc[:,['subjid','siteid']] ## columns subjid and siteid
print("\nNew dataframe of columns of subjid and siteid: \n",df7)
df7 = df5[['subjid','siteid']] ## same selection with the shorter bracket form
print("\nNew dataframe of columns of subjid and siteid: \n",df7)

## filtering by columns and rows
df8 = df5.loc[[1,3],['subjid','siteid']] ## index 1 and 3 & columns subjid and siteid
print("\nNew dataframe of index 1 and 3 and columns, subjid and siteid: \n",df8)


Original dataframe: 
    subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56

New dataframe of index 1 and 3: 
    subjid siteid sex  age
1  01-002     01   F   40
3  02-001     02   M   56

New dataframe of columns of subjid and diabp: 
    subjid siteid
0  01-001     01
1  01-002     01
2  01-003     01
3  02-001     02

New dataframe of columns of subjid and diabp: 
    subjid siteid
0  01-001     01
1  01-002     01
2  01-003     01
3  02-001     02

New dataframe of index 1 and 3 and columns, subjid and diabp: 
    subjid siteid
1  01-002     01
3  02-001     02


In [31]:
## Using booleans to filter data
## In-place sort, df1 itself is re-ordered:      proc sort data=df1 ; by subjid; run;
df1.sort_values(by=['subjid'], inplace=True)
## Sort into a new dataframe, df1 is unchanged:  proc sort data=df1 out=df2; by subjid; run;
df2 = df1.sort_values(by=['subjid'])
print("Original                   : \n", df1, "\n")

## A list of booleans, one per row in row order 0, 1, 2, 3, keeps the rows marked True
keep_row = [False, True, False, True]
df9 = df1[keep_row]
print("New dataframe using boolean: \n", df9)  ## example of boolean filtering

## A comparison on a column gives a boolean Series that can be used as the filter
age_over_35 = df1.age > 35
print("\nBoolean result of condition : \n", age_over_35)
print(type(age_over_35))
df10 = df1[age_over_35]
print("\nNew dataframe using boolean: \n", df10)  ## only keep the index of age > 35

Original                   : 
    subjid siteid sex  age country
0  01-001     01   M   29     USA
1  01-002     01   F   40     USA
2  01-003     01   F   35     USA
3  02-001     02   M   56     USA 

New dataframe using boolean: 
    subjid siteid sex  age country
1  01-002     01   F   40     USA
3  02-001     02   M   56     USA

Boolean result of condition : 
 0    False
1     True
2    False
3     True
Name: age, dtype: bool
<class 'pandas.core.series.Series'>

New dataframe using boolean: 
    subjid siteid sex  age country
1  01-002     01   F   40     USA
3  02-001     02   M   56     USA


In [32]:
## Null value in dataframe
raw2 = {'subjid': ['01-001', '01-002', '01-003', '02-001'],
      'siteid': ['01','01','01',np.NaN],
      'sex': ['M','F','F','M'],
      'age': [29, 40, 35, 56]} ## dicionary
df12 = DataFrame(raw2)
print(df12)
df12.siteid.isnull()
print("\nnull value : ", df12.siteid[3], ". Its type : ", type(df12.siteid[3]))

   subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001    NaN   M   56

null value :  nan . Its type :  <class 'float'>


### Traceability of DataFrame 
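Assigning a dataframe to another name (for example df20 = df1) does not copy it: both names refer to the same object, so any change made through the new name also appears in the original dataframe. <br>
To avoid changing the original data, programmers need to break this link by creating a new, independent dataframe with the copy() method.  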

In [33]:
## creating dataframe from dictionaries
raw = {'subjid': ['01-001', '01-002', '01-003', '02-001'],
      'siteid': ['01','01','01','02'],
      'sex': ['M','F','F','M'],
      'age': [29, 40, 35, 56]} ## dictionary
df1 = DataFrame(raw)

## Referencing: df20 is a second name for the same object as df1,
## so a change made through df20 is also seen through df1
df20 = df1
print("Original df1: \n", df1)
df20.loc[0,'age'] = 40
print("\n change in df20:\n ", df20)
print("\n Also change in df1: \n", df1)

## Start again from a fresh df1 built from the untouched raw dictionary
df1 = DataFrame(raw)
## Copying: df21 is a new, independent dataframe, so a change in df21 does not reach df1.
## (A leftover "df20.loc[0,'age'] = 40" was removed from this half: df20 no longer
## refers to df1 here, so that line had no part in the demonstration.)
df21 = df1.copy()
df21.loc[0,'age'] = 40
print("Original df1: \n", df1)
print("\n change in df21: \n", df21)
print("\n No change in df1: \n", df1)

Original df1: 
    subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56

 change in df20:
     subjid siteid sex  age
0  01-001     01   M   40
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56

 Also change in df1: 
    subjid siteid sex  age
0  01-001     01   M   40
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56
Original df1: 
    subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56

 change in df21: 
    subjid siteid sex  age
0  01-001     01   M   40
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56

 No change in df1: 
    subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56


### Data Manipulation using for loop & iterrows function
The iterrows() function is similar to the SAS DATA step: the data is processed one row at a time.

In [34]:
## Work on a copy so that df1 itself is left untouched
df22 = df1.copy()
print("Original dataframe is \n", df1)

## Visit the rows one at a time and write the age group of each row into a new column
for row_label, row in df22.iterrows():
    df22.loc[row_label, 'age_group'] = '< 35' if row['age'] < 35 else '>= 35'

print("\n New dataframe is \n", df22)

Original dataframe is 
    subjid siteid sex  age
0  01-001     01   M   29
1  01-002     01   F   40
2  01-003     01   F   35
3  02-001     02   M   56

 New dataframe is 
    subjid siteid sex  age age_group
0  01-001     01   M   29      < 35
1  01-002     01   F   40     >= 35
2  01-003     01   F   35     >= 35
3  02-001     02   M   56     >= 35


In [35]:
def _age_group_label(age):
    """Return '< 35' for ages below 35 and '>= 35' otherwise."""
    if age < 35:
        return '< 35'
    return '>= 35'


## Same age grouping without an explicit loop: apply calls the function once per age value
df22_2 = df1.copy()
df22_2['age_group2'] = df22_2['age'].apply(_age_group_label)
print("New dataframe is \n", df22_2)

New dataframe is 
    subjid siteid sex  age age_group2
0  01-001     01   M   29       < 35
1  01-002     01   F   40      >= 35
2  01-003     01   F   35      >= 35
3  02-001     02   M   56      >= 35


### Transpose data in DataFrame

In [36]:
## Transposing data
## Goal stated by the author: create ADaM structure from SDTM structure.
## The code turns one column per measurement into one row per subject and measurement.
raw = {
    'subjid': ['01-001', '01-002', '01-003', '02-001'],
    'sysbp': [154, 152, 148, 149],
    'diabp': [44, 48, 47, 50],
    'weight': [90.5, 83, 78, 77],
}
df23 = DataFrame(raw)
print("Original data frame : \n", df23)

## Put the columns in the order wanted for the transposed output
measurements = ['diabp', 'sysbp', 'weight']
df23 = df23.loc[:, ['subjid'] + measurements]
print("\nReorderded column data frame: \n", df23)

## melt keeps subjid on every row and stacks the measurement columns
## into a "variable" / "value" pair of columns
df24 = df23.melt(id_vars='subjid', value_vars=measurements)
print("\nTranspose data frame : \n", df24)

new_names = {'variable': 'parmacd', 'value': 'aval'}
df25 = df24.rename(columns=new_names)
print("\nRenamed columns : \n", df25)

Original data frame : 
    subjid  sysbp  diabp  weight
0  01-001    154     44    90.5
1  01-002    152     48    83.0
2  01-003    148     47    78.0
3  02-001    149     50    77.0

Reorderded column data frame: 
    subjid  diabp  sysbp  weight
0  01-001     44    154    90.5
1  01-002     48    152    83.0
2  01-003     47    148    78.0
3  02-001     50    149    77.0

Transpose data frame : 
     subjid variable  value
0   01-001    diabp   44.0
1   01-002    diabp   48.0
2   01-003    diabp   47.0
3   02-001    diabp   50.0
4   01-001    sysbp  154.0
5   01-002    sysbp  152.0
6   01-003    sysbp  148.0
7   02-001    sysbp  149.0
8   01-001   weight   90.5
9   01-002   weight   83.0
10  01-003   weight   78.0
11  02-001   weight   77.0

Renamed columns : 
     subjid parmacd   aval
0   01-001   diabp   44.0
1   01-002   diabp   48.0
2   01-003   diabp   47.0
3   02-001   diabp   50.0
4   01-001   sysbp  154.0
5   01-002   sysbp  152.0
6   01-003   sysbp  148.0
7   02-001   sysb