#  Essentials of NumPy, Pandas, and Matplotlib

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
lst = [1,2,3,4,5,6]

# Create array from list
arr = np.array(lst)
arr

array([1, 2, 3, 4, 5, 6])

In [3]:
# Create array of floats from list

arr = np.array(lst, dtype=float)
arr

array([1., 2., 3., 4., 5., 6.])

In [4]:
names = [['Jon', 'Mary', 'Paul'],['Peter','Ben','Saul']]
arr1 = np.array(names)
arr1


array([['Jon', 'Mary', 'Paul'],
       ['Peter', 'Ben', 'Saul']], dtype='<U5')

In [5]:
# Dimension of array
arr1.ndim

2

In [6]:
# Array of arange
arr2 = np.arange(0,50,5)
arr2

array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45])

In [7]:
# Array of zeros
arr3 = np.zeros((2,3), dtype=float)
arr3

array([[0., 0., 0.],
       [0., 0., 0.]])

In [8]:
# Array of ones
arr4 = np.ones((2,3), dtype=float)
arr4

array([[1., 1., 1.],
       [1., 1., 1.]])

In [9]:
# Create a random number generator
rng = np.random.default_rng()

# Array of random integers from 0 to 9
array = rng.integers(low=0,high=10,size=(2,4))
array

array([[0, 8, 2, 0],
       [2, 0, 1, 8]], dtype=int64)

In [10]:
# Create a random number generator
rng = np.random.default_rng(seed=42)

# Array of random floats between 0 and 1
array = rng.random((2,4))
array

array([[0.77395605, 0.43887844, 0.85859792, 0.69736803],
       [0.09417735, 0.97562235, 0.7611397 , 0.78606431]])

##  Accessing Array Elements

In [11]:
names = [['Jon', 'Mary', 'Paul'],['Peter','Ben','Saul']]
arr1 = np.array(names)
arr1

array([['Jon', 'Mary', 'Paul'],
       ['Peter', 'Ben', 'Saul']], dtype='<U5')

In [12]:
# Select Mary and Paul
mary_paul = arr1[0,1:]
mary_paul

array(['Mary', 'Paul'], dtype='<U5')

In [13]:
# Creating an element indices
select_peter = np.array([1,0])
select_paul = np.array([0,-1])

select_peter_paul = arr1[select_paul,select_peter]
select_peter_paul

array(['Mary', 'Peter'], dtype='<U5')

In [14]:
arr = np.array([[1,2,3],[4,5,6],[7,8,9]])

# Create a boolean mask
filter_array = arr % 2 != 0

# Select from array
arr[filter_array]

array([1, 3, 5, 7, 9])

In [15]:
# Create an array
arr = np.array([[1,2,3],[4,5,6],[7,8,9 ]])

# Flatten the array using reshape function
np.reshape(arr, 9)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [16]:
# Create array 
names = [['Jon','Mary','Paul'],['Peter','Ben','Saul']]

arr1 = np.array(names)
arr1

array([['Jon', 'Mary', 'Paul'],
       ['Peter', 'Ben', 'Saul']], dtype='<U5')

In [17]:
# Reshape the array to (3,2)
new_arr = np.reshape(arr1,(3,2))
new_arr

array([['Jon', 'Mary'],
       ['Paul', 'Peter'],
       ['Ben', 'Saul']], dtype='<U5')

In [18]:
# Create two arrays
arr1 = np.array([[10,20,30],[40,50,60],[70,80,90]])
arr2 = np.array([[100,110,120],[130,140,150]])

arr3 = np.concatenate((arr1, arr2),axis=0)
arr3

array([[ 10,  20,  30],
       [ 40,  50,  60],
       [ 70,  80,  90],
       [100, 110, 120],
       [130, 140, 150]])

In [19]:
# Create array 
names = [['Jon','Mary','Paul'],['Peter','Ben','Saul']]
arr1 = np.array(names)

# Split arrays into two arrays
split_1,split_2 = np.split(arr1,2)

In [20]:
split_1

array([['Jon', 'Mary', 'Paul']], dtype='<U5')

In [21]:
split_2

array([['Peter', 'Ben', 'Saul']], dtype='<U5')

In [22]:
# Create two arrays
arr1 = np.array([[10,20,30],[40,50,60],[70,80,90]])
arr2 = np.array([[100,110,120],[130,140,150]])

# Dot product of a
# np.dot(arr1, arr2)

In [23]:
# Dot product of two arrays with one transposed
np.dot(arr1, arr2.transpose())

array([[ 6800,  8600],
       [16700, 21200],
       [26600, 33800]])

##  Mathematical Functions 

In [24]:
# Create two arrays
arr1 = np.array([[1,2,4], [6,7,8]])
arr2 = np.array([[3,3,6],[4,5,7]])

# Add two arrays
np.add(arr1, arr2)

array([[ 4,  5, 10],
       [10, 12, 15]])

In [25]:
# Substract the arrays

np.subtract(arr1, arr2)

array([[-2, -1, -2],
       [ 2,  2,  1]])

In [26]:
# Multiply the arrays
np.multiply(arr1, arr2)

array([[ 3,  6, 24],
       [24, 35, 56]])

In [27]:
# Divide the arrays
np.divide(arr1, arr2)

array([[0.33333333, 0.66666667, 0.66666667],
       [1.5       , 1.4       , 1.14285714]])

In [28]:
# Raise array to the power of 2
np.power(arr2, 2)

array([[ 9,  9, 36],
       [16, 25, 49]], dtype=int32)

In [29]:
# Square root the array
np.sqrt(arr2)

array([[1.73205081, 1.73205081, 2.44948974],
       [2.        , 2.23606798, 2.64575131]])

##  Statistical Functions

In [30]:
# Create array
arr = np.array([[3,3,6],[4,5,7]])

arr

array([[3, 3, 6],
       [4, 5, 7]])

In [31]:
# Compute mean each row
np.mean(arr,axis=1)

array([4.        , 5.33333333])

In [32]:
# Compute mean each column
np.mean(arr,axis=0)

array([3.5, 4. , 6.5])

In [33]:
# Compute median
np.median(arr,axis=1)

array([3., 5.])

In [34]:
# Compute standard deviation
np.std(arr,axis=0)

array([0.5, 1. , 0.5])

In [35]:
# Compute variance
np.var(arr,axis=0)

array([0.25, 1.  , 0.25])

In [36]:
# Compute minimum value 0f each column
np.min(arr,axis=0)

array([3, 3, 6])

In [37]:
# Compute maximum value whole array
np.max(arr,axis=None)

7

## Pandas Series

In [38]:
# Create a Pandas Series
fruits = pd.Series(['Orange', 'Apple', 'Mango'], name='Friuts')
fruits


0    Orange
1     Apple
2     Mango
Name: Friuts, dtype: object

In [39]:
# Pandas Series with index

fruits = pd.Series(['Orange', 'Apple', 'Mango'],
                   name='Friuts',
                   index=['a', 'b', 'c'])
fruits

a    Orange
b     Apple
c     Mango
Name: Friuts, dtype: object

In [40]:
# Pandas Series of integers
int_num = pd.Series([10,20,30],name='numbers')
int_num

0    10
1    20
2    30
Name: numbers, dtype: int64

In [41]:
# Pandas Series with mixed dtype
mixed_dtype = pd.Series([10,20,30,'Orange'],name='mixed')
mixed_dtype

0        10
1        20
2        30
3    Orange
Name: mixed, dtype: object

In [42]:
# OR
mixed_dtype = pd.Series([10,20,30,14.5],name='mixed')
mixed_dtype

0    10.0
1    20.0
2    30.0
3    14.5
Name: mixed, dtype: float64

In [43]:
# Pandas Series with specified dtype
int_num = pd.Series([10,20,30],
                    name='numbers',
                    dtype='int8'
                    )
int_num

0    10
1    20
2    30
Name: numbers, dtype: int8

In [44]:
# Create Pandas Series

num_series = pd.Series([10,20,30,40],name='numbers')
num_series

0    10
1    20
2    30
3    40
Name: numbers, dtype: int64

In [45]:
# Change dtype of Pandas Series
num_series = num_series.astype('float')
num_series

0    10.0
1    20.0
2    30.0
3    40.0
Name: numbers, dtype: float64

##  Creating a Pandas DataFrame

In [46]:
# Create a Pandas DataFrame
data = {'Name':['Alice', 'Bob', 'Charlie', 'David'],
        'Age':[25,32,18,47],
        'Salary':[50000,80000,20000,120000]}
df = pd.DataFrame(data, columns=['Name', 'Age', 'Salary'])
df

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,32,80000
2,Charlie,18,20000
3,David,47,120000


In [47]:
# Create pandas data frame from lists
names = ['Alice', 'Bob', 'Charlie', 'David']
age = [25,32,18,47]
salary = [50000,80000,20000,120000]

df = pd.DataFrame({'Names': names, 'Age': age, 'Salary': salary})
df

Unnamed: 0,Names,Age,Salary
0,Alice,25,50000
1,Bob,32,80000
2,Charlie,18,20000
3,David,47,120000


##  Data Loading Functions

In [48]:
# Read in the data from csv file
df = pd.read_csv("../DATA/medical_data.csv")
df.head()

Unnamed: 0,First Name,First Letter,Last Name,Condition,Gender,BMI
0,John,J,Smith,Healthy,Female 27,27.35
1,Mary,M,Jones,Cancer,Female 39,24.29
2,Amy,A,Candy,Heart Disease,Male 51,28.72
3,David,D,Tuss,Diabetes,Female 73,24.66
4,Michael,M,Brookes,Asthma,Male 71,28.21


In [49]:
# Read in the data from excel file
# df = pd.read_excel("../DATA/Asset_sales_data.xlsx")
# df.head()

##  Data Cleaning

In [50]:
# Create a new data frame
names = ['Alice', 'Bob','Charlie','David']
age = [25,None,18,47]
salary = [50000,80000,None,1200000]

df = pd.DataFrame({'Names': names, 'Age': age, 'Salary': salary})
df

Unnamed: 0,Names,Age,Salary
0,Alice,25.0,50000.0
1,Bob,,80000.0
2,Charlie,18.0,
3,David,47.0,1200000.0


In [51]:
# Drop NaN values
df.dropna()

Unnamed: 0,Names,Age,Salary
0,Alice,25.0,50000.0
3,David,47.0,1200000.0


In [52]:
# Drop columns with NA values
df.dropna(axis=1)

Unnamed: 0,Names
0,Alice
1,Bob
2,Charlie
3,David


In [53]:
# Fill in missing values 
df['Age'] = df['Age'].fillna(value=df['Age'].mean())
df['Salary'] = df['Salary'].fillna(value=df['Salary'].mean())

df

Unnamed: 0,Names,Age,Salary
0,Alice,25.0,50000.0
1,Bob,30.0,80000.0
2,Charlie,18.0,443333.3
3,David,47.0,1200000.0


##  Data Manipulation in Pandas

In [54]:
names = ['Alice', 'Bob', 'Charlie','David','John','Mpho','Steve','Ben']
age = [25,29,33,21,57,66,50,30]

# Create a data frame
df = pd.DataFrame({'Names':names, 'Age':age})

# View first five rows
df.head()

Unnamed: 0,Names,Age
0,Alice,25
1,Bob,29
2,Charlie,33
3,David,21
4,John,57


In [55]:
# View first two rows

df.head(2)

Unnamed: 0,Names,Age
0,Alice,25
1,Bob,29


In [56]:
# View last five rows

df.tail()

Unnamed: 0,Names,Age
3,David,21
4,John,57
5,Mpho,66
6,Steve,50
7,Ben,30


In [57]:
# Detail of the data frame

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Names   8 non-null      object
 1   Age     8 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 260.0+ bytes


In [58]:
# Description of the data frame

df.describe()

Unnamed: 0,Age
count,8.0
mean,38.875
std,16.522171
min,21.0
25%,28.0
50%,31.5
75%,51.75
max,66.0


In [59]:
products = ['Computer','Phones','Shoes','Computer','Phones']
sales = [2500,3000,1400,2100,2800]

# Create a data frame
df = pd.DataFrame({'Products': products, 'Sales': sales})
df.head(2)

Unnamed: 0,Products,Sales
0,Computer,2500
1,Phones,3000


In [60]:
# Group by product
df_grouped = df.groupby(by='Products')['Sales'].sum()
df_grouped

Products
Computer    4600
Phones      5800
Shoes       1400
Name: Sales, dtype: int64

In [61]:
products = ['Computer','Phones','Shoes']
sales = [2500,3000,1400]

# Create a data frame
df_sales = pd.DataFrame({'Products':products,'Sales':sales})
df_sales

Unnamed: 0,Products,Sales
0,Computer,2500
1,Phones,3000
2,Shoes,1400


In [62]:
products = ['Computer','Phones','Shoes']
costs = [1800,2300,1000]

# Create a data frame
df_costs = pd.DataFrame({'Products':products,'Costs':costs})
df_costs

Unnamed: 0,Products,Costs
0,Computer,1800
1,Phones,2300
2,Shoes,1000


In [63]:
# Merge the data frames

df_merged = df_sales.merge(df_costs,on='Products')
df_merged

Unnamed: 0,Products,Sales,Costs
0,Computer,2500,1800
1,Phones,3000,2300
2,Shoes,1400,1000
