# Python: Numpy and Pandas

## Numpy

In [1]:
import numpy as np

arr  = np.array([1, 2, 3, 4, 5])

print(arr)

[1 2 3 4 5]


check the number of dimensions


In [2]:
a = np.array(42)
b = np.array([1, 2, 3, 4, 5])
c = np.array([[1, 2, 3], [4, 5, 6]])
d = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])


print(a.ndim)
print(b.ndim)
print(c.ndim)
print(d.ndim)

0
1
2
3


In [3]:
c = np.array([[1, 2, 3], [4, 5, 6]])
print(c.shape)

(2, 3)


### Adding arrays (vectorisation)

N.B. this doesn't concatenate the arrays

In [4]:
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([1, 2, 3, 4])

arr1 + arr2

array([2, 4, 6, 8])

### Slicing



In [5]:
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

print(arr[0:3])
print(arr[3:]) 
print(arr[0:8:2]) # with a step


[1 2 3]
[ 4  5  6  7  8  9 10]
[1 3 5 7]


2-D arrays

In [6]:
arr = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])

print(arr[1, 1:4]) # row index 1 (the second row), columns 1 up to but not including 4

[7 8 9]


### Numpy Data Types
check with `.dtype`


In [7]:
import numpy as np

arrString = np.array(['apple', 'banana', 'cherry'])
arrInt = np.array([1, 2, 3, 4])

print(arrString.dtype) 
print(arrInt.dtype)

<U6
int64


create an array with a defined data type

In [8]:
# 'S' requests a byte-string dtype, so each integer is stored as bytes (b'1', b'2', ...)
arrString = np.array([1, 2, 3, 4], dtype='S') 
# 'i' is the type code for a signed integer (reported as int32 in this notebook's recorded output)
arrInt = np.array([1, 2, 3, 4], dtype='i')

print(arrString)
print(arrInt)

[b'1' b'2' b'3' b'4']
[1 2 3 4]


and we can convert the array

In [9]:
arr = np.array([1, 2, 3, 4], dtype="i")
print(arr.dtype)

newarr = arr.astype(float)
print(newarr.dtype)


int32
float64


### Copy and view

copy creates its own copy

view is linked to the original

`base` checks if the array owns its own data (`None` if true)

In [10]:
# Two identical source arrays: one to demonstrate copy(), one to demonstrate view()
arrC = np.array([1, 2, 3, 4, 5])
arrV = np.array([1, 2, 3, 4, 5])

arrCopy = arrC.copy()  # independent data
arrView = arrV.view()  # shares data with arrV

# Modify the originals AFTER taking the copy / view
arrC[0] = 9
arrV[0] = 9


print(arrC)     # changed
print(arrCopy)  # unchanged: the copy does not follow the original
print(arrV)     # changed
print(arrView)  # changed too: the view reflects the original's data

# .base is None when the array owns its data, otherwise it is the array the data belongs to
print(arrCopy.base)
print(arrView.base)

[9 2 3 4 5]
[1 2 3 4 5]
[9 2 3 4 5]
[9 2 3 4 5]
None
[9 2 3 4 5]


### Reshape


In [11]:
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

print(arr.reshape(4, 3))
print("\n")
print(arr.reshape(2, 3, 2))
print("\n")
print(arr.reshape(2, -1)) # unknown dimension with -1

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


[[[ 1  2]
  [ 3  4]
  [ 5  6]]

 [[ 7  8]
  [ 9 10]
  [11 12]]]


[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]]


In [12]:

# flatten array
newArr = np.array([[1, 2, 3], [4, 5, 6]])
print(newArr.reshape(-1))


[1 2 3 4 5 6]


### Iterating

Arrays are iterable, if nested we can use `nditer`

`ndenumerate` can be used to note where in the array structure the element is located

In [13]:
arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

for x in np.nditer(arr):
    print(x)
    
print("\n")    
# compared to
for x in arr:
    print(x)

1
2
3
4
5
6
7
8


[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


In [14]:
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# ndenumerate pairs every element with its index tuple
for position, element in np.ndenumerate(arr):
    print(position, element)

(0, 0) 1
(0, 1) 2
(0, 2) 3
(0, 3) 4
(1, 0) 5
(1, 1) 6
(1, 2) 7
(1, 3) 8


### Joining arrays

we can also specify joining on **rows** (`axis=0`) or **columns** (`axis=1`).




In [15]:
arr = np.array([1, 2, 3])

combo = np.concatenate((arr, arr))

print(combo)

[1 2 3 1 2 3]


In [16]:
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])

combo0 = np.concatenate((arr1, arr2), axis=0)
combo1 = np.concatenate((arr1, arr2), axis=1)

print(combo0)
print("\n")
print(combo1)

[[1 2]
 [3 4]
 [5 6]
 [7 8]]


[[1 2 5 6]
 [3 4 7 8]]


Similarly we can use `stack`, `hstack`, `vstack`, and `dstack`

In [17]:
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

arrStack = np.stack((arr1, arr2), axis = 1)
arrHstack = np.hstack((arr1, arr2))
arrVstack = np.vstack((arr1, arr2))
arrDstack = np.dstack((arr1, arr2))

print("stack")
print(arrStack)
print("\nhstack")
print(arrHstack)
print("\nvstack")
print(arrVstack)
print("\ndstack")
print(arrDstack)


stack
[[1 4]
 [2 5]
 [3 6]]

hstack
[1 2 3 4 5 6]

vstack
[[1 2 3]
 [4 5 6]]

dstack
[[[1 4]
  [2 5]
  [3 6]]]


### Splitting arrays

`np.array_split` allows for non-exact splits, whereas `np.split` requires the array to divide exactly

splits can be carried out on multi-dimensional arrays with the axis specified.


In [18]:
arr = np.array([1, 2, 3, 4, 5, 6])

# split into 3 

arrSplit = np.array_split(arr, 3)
print(arrSplit)

# split into a non-multiple
arrSplitAlt = np.array_split(arr, 4)
print(arrSplitAlt)

[array([1, 2]), array([3, 4]), array([5, 6])]
[array([1, 2]), array([3, 4]), array([5]), array([6])]


as above we also have `hsplit`, `vsplit`, and `dsplit`

In [19]:
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18]])

np.hsplit(arr, 3)

[array([[ 1],
        [ 4],
        [ 7],
        [10],
        [13],
        [16]]),
 array([[ 2],
        [ 5],
        [ 8],
        [11],
        [14],
        [17]]),
 array([[ 3],
        [ 6],
        [ 9],
        [12],
        [15],
        [18]])]

### Searching Arrays

In [20]:
arr = np.array(["red", "blue", "blue", "blue", "green", "green"])
print(np.where(arr == "green"))

(array([4, 5]),)


**Binary Search**

`searchsorted` searches where the value should be inserted

arrays can also be used in this function

In [21]:
# binary search: the array being searched must already be sorted
arr = np.array([1, 5, 6, 8, 10])

# index at which 2 would have to be inserted to keep arr sorted
print(np.searchsorted(arr, 2)) 


# 5 is already present: the side argument picks which edge of the match is returned
print(np.searchsorted(arr, 5)) # left most index given as default
print(np.searchsorted(arr, 5, side="right")) # right most index given


# several values can be looked up at once; one insertion index is returned per value
print(np.searchsorted(arr, [2, 3, 4]))


1
1
2
[1 1 1]


### Sorting


In [22]:
arrInt = np.array([3, 2, 0, 1])
print(np.sort(arrInt))

arrCol = np.array(["red", "blue", "blue", "blue", "green", "green"])
print(np.sort(arrCol))

[0 1 2 3]
['blue' 'blue' 'blue' 'green' 'green' 'red']


In [23]:
# 2D array
# np.sort works along the last axis by default, so each row is sorted independently
arr = np.array([[3, 2, 4], [5, 0, 1]])
print(np.sort(arr)) 

[[2 3 4]
 [0 1 5]]


### Filtering

In [24]:
arr = np.array([1, 2, 3, 4, 5, 6])

# general principle: index with a boolean mask; only positions marked True are kept
newArr = arr[[True, True, False, True, False, False]]

print(newArr)

# better method: build the mask from a condition instead of writing it by hand
conditionArr = arr > 3
print(arr[conditionArr])

# the equivalent for plain Python lists is a list comprehension
[i for i in arr if i > 3]

[1 2 4]
[4 5 6]


[4, 5, 6]

*Compare the speed of the two filtering methods*

**numpy** is far quicker


In [25]:
import time
import numpy as np


np.random.seed(0)
arr = np.random.randint(100, size=(1000))
repeats = 1000

def npFilter(x, n_repeats=None):
    """Run the NumPy boolean-mask filter ``x[x >= 50]`` repeatedly for timing.

    Parameters
    ----------
    x : array supporting element-wise ``>=`` and boolean-mask indexing
        (a NumPy array in this notebook).
    n_repeats : int, optional
        Number of times to run the filter. When omitted, the notebook-level
        ``repeats`` value is used, so existing ``npFilter(arr)`` calls behave
        exactly as before.

    Returns
    -------
    int
        Always 0; the filtered arrays are discarded because only the elapsed
        time is of interest.
    """
    count = repeats if n_repeats is None else n_repeats
    for _ in range(count):
        x[x >= 50]
    return 0

def baseFilter(x, n_repeats=None):
    """Run the pure-Python filter (a list comprehension) repeatedly for timing.

    Parameters
    ----------
    x : iterable of values comparable with ``>= 50``.
    n_repeats : int, optional
        Number of times to run the filter. When omitted, the notebook-level
        ``repeats`` value is used, so existing ``baseFilter(arr)`` calls behave
        exactly as before.

    Returns
    -------
    int
        Always 0; the filtered lists are discarded because only the elapsed
        time is of interest.
    """
    count = repeats if n_repeats is None else n_repeats
    for _ in range(count):
        # Distinct name for the element so it is not confused with the repeat counter
        [value for value in x if value >= 50]
    return 0



# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock, which can be adjusted
# while the measurement is running.
start = time.perf_counter()
npFilter(arr)
print("numpy takes %f s to run." % (time.perf_counter() - start))

start = time.perf_counter()
baseFilter(arr)
print("base takes %f s to run." % (time.perf_counter() - start))


numpy takes 0.021225 s to run.
base takes 0.314613 s to run.


### Random numbers


In [26]:
from numpy import random

print(random.rand()) # float in the half-open range [0, 1)
print(random.randint(100)) # integer from 0 to 99 (the upper bound 100 is excluded)
print(random.randint(100, size=(5))) # array of 5 such integers

# choices: pick values at random from the given list
print("\nchoices")
print(random.choice([3, 5, 7, 9])) # a single value
print(random.choice([3, 5, 7, 9], size=2)) # 1-D array of 2 values
print(random.choice([3, 5, 7, 9], size=(2,5))) # 2 x 5 array of values


0.8918009653672702
87
[21  2  8 10  5]

choices
7
[7 3]
[[5 5 9 3 5]
 [5 7 9 3 5]]


## ufuncs

universal functions operate on `ndarray` objects

Good for:
* vectorisation
* broadcasting


In [27]:
x = [1, 2, 3, 4, 5]
y = [6, 7, 8, 9, 10]

# np.add accepts plain lists and adds them element-wise
a = np.add(x, y)
print(a)


# is equivalent to
aa = np.array(x) + np.array(y) 
print(aa)

[ 7  9 11 13 15]
[ 7  9 11 13 15]


**Simple arithmetic**

`add()`

`subtract()`

`multiply()`

`divide()`

`power()`

`mod()` or `remainder()` - returns the remainder

`divmod()` - returns the quotient and the mod in 2 arrays

`absolute()` or `abs()` - absolute values



**Rounding**

`fix()` and `trunc()` - Remove the decimals, and return the float number closest to zero

`around(in, no. of decimals)` - define the number of decimal places

`floor()` - nearest lowest integer

`ceil()` - nearest highest integer


**Summation**

N.B. products (multiplication) can be done in the same way with `prod()` and `cumprod()`

In [28]:
arr = np.array([1, 2, 3])

# no axis given: sum of every element across both arrays
sum1 = np.sum([arr, arr])
print(sum1)

# sum of each array (axis=1 sums along each row)
sum2 = np.sum([arr, arr], axis = 1)
print(sum2)

# vertical sum (axis=0 sums down each column)
sum3 = np.sum([arr, arr], axis = 0)
print(sum3)

# cumulative sum [1, 2, 3] = [1, 1+2, 1+2+3]
sum4 = np.cumsum(arr)
print(sum4)


12
[6 6]
[2 4 6]
[1 3 6]


**Differences**


In [29]:
arr = np.array([10, 15, 25, 5])

diff1 = np.diff(arr) # each element minus the one before it: [15-10, 25-15, 5-25]
print(diff1)

# n is the number of times the differencing is repeated
    # first pass:  [15-10, 25-15, 5-25] = [5, 10, -20]
    # second pass: [10-5, -20-10] = [5, -30]
diff2 = np.diff(arr, n=2) # the differences of the differences
print(diff2)

[  5  10 -20]
[  5 -30]


**Lowest Common Multiple** and **Greatest Common Divisor**

LCM - the least number that is common multiple of both of the numbers

GCD - the biggest number that is a common factor of both of the numbers


In [30]:
## LCM
# single elements
x = np.lcm(4, 6)
print(x)

# arrays
arr = np.array([3, 6, 9])
xx = np.lcm.reduce(arr)
print(xx)


## GCD
y = np.gcd(4, 6)
print(y)

arr2 = np.array([20, 8, 32, 36, 16])
yy = np.gcd.reduce(arr2)
print(yy)


12
18
2
4


**Unique elements**

In [31]:
arr = np.array([1, 1, 1, 2, 3, 4, 5, 5, 6, 7])
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([3, 4, 5, 6])


a = np.unique(arr) # sorted unique values of a single array
# union of 2 arrays; the same array is passed twice here, so the result matches np.unique(arr)
b = np.union1d(arr, arr)
c = np.intersect1d(arr1, arr2) # values present in both arrays
d = np.setdiff1d(arr1, arr2) # values in arr1 not in arr2
e = np.setxor1d(arr1, arr2) # values found in exactly one of the two arrays

# A plain loop is used because print() is called only for its side effect;
# a list comprehension would build and keep a throwaway list of None values.
for result in [a, b, c, d, e]:
    print(result)


[1 2 3 4 5 6 7]
[1 2 3 4 5 6 7]
[3 4]
[1 2]
[1 2 5 6]


## Vectorisation


**`np.frompyfunc`**
* Takes an arbitrary Python function and returns a ufunc
* `frompyfunc(function, no. inputs, no. outputs)`

**`np.vectorize`**
* Evaluates pyfunc over input arrays using broadcasting rules of numpy, *convenience over performance*.
* wrapper for `frompyfunc`
* features: carries across docstring, defines broadcasting rules, and define dtype



In [32]:
def myMultiplier(x, y):
    """Return the product of x and y."""
    return x * y

# Wrap the Python function as a ufunc taking 2 inputs and producing 1 output.
# The name is rebound, so from here on myMultiplier refers to the ufunc.
myMultiplier = np.frompyfunc(myMultiplier, 2, 1)

# Applied element-wise to the two lists; the recorded result has dtype=object
myMultiplier([1, 2, 3, 4, 5], [6, 7, 8, 9, 10])


array([6, 14, 24, 36, 50], dtype=object)

In [33]:
def myAdder(x, y):
    """Return the sum of x and y."""
    return x + y

# Wrap the Python function so it is applied element-wise over array-like inputs.
# The name is rebound, so from here on myAdder refers to the vectorized wrapper.
myAdder = np.vectorize(myAdder)

# Applied element-wise to the two lists
myAdder([1, 2, 3, 4, 5], [6, 7, 8, 9, 10])


array([ 7,  9, 11, 13, 15])

## Pandas
works with **dataframes**

slow but sometimes convenient 

In [39]:
import pandas as pd

bearData  = {
    "species": ["grizzly", "panda", "black"],
    "weight": [800, 400, 100] 
}

pd.DataFrame(bearData)

Unnamed: 0,species,weight
0,grizzly,800
1,panda,400
2,black,100


**series** are 1D pandas arrays (like a column in the df)

In [44]:
country = pd.Series(["North America", "China", "North America"])

print(country)

0    North America
1            China
2    North America
dtype: object


you can create own index labels

In [45]:
country = pd.Series(["North America", "China", "North America"],
                   index = ["a", "b", "c"])


country

a    North America
b            China
c    North America
dtype: object

### Dataframe Retrieval

In [60]:
# Column-oriented input: each key becomes a column, each list holds that column's values
bearData  = {
    "species": ["grizzly", "panda", "black"],
    "weight": [800, 400, 100],
    "country": ["North America", "China", "North America"]
}

# NOTE: the name bearData is rebound from the dict to the DataFrame built from it
bearData = pd.DataFrame(bearData)

# specify the row by its index label; a single label returns that row as a Series
print(bearData.loc[0])
print("\n")
print(bearData.loc[[0, 2]]) # a list of labels returns a DataFrame with those rows

# specific column, selected by name (returned as a Series)
print("\n")
print(bearData["species"])


species          grizzly
weight               800
country    North America
Name: 0, dtype: object


   species  weight        country
0  grizzly     800  North America
2    black     100  North America


0    grizzly
1      panda
2      black
Name: species, dtype: object


### Read and Write

In [64]:
# write 
bearData.to_csv("data/bearSpecies.csv", index=False)

In [73]:
# read

treeData = pd.read_csv("data/trees.csv")

treeData

Unnamed: 0,Species,Distance.m,Angle.degrees
0,Populus tremula,31.665834,41.282636
1,Quercus robur,45.984993,44.535917
2,Ginkgo biloba,31.241767,25.146259
3,Fraxinus excelsior,34.616669,23.336127
4,Betula pendula,45.466165,38.349130
...,...,...,...
115,Pinus sylvestris,33.405883,35.996387
116,Carpinus betulus,39.501562,30.298136
117,Fagus sylvatica,43.988915,27.079336
118,Carpinus betulus,37.849727,38.573435


### View the data


In [74]:
treeData.head()

Unnamed: 0,Species,Distance.m,Angle.degrees
0,Populus tremula,31.665834,41.282636
1,Quercus robur,45.984993,44.535917
2,Ginkgo biloba,31.241767,25.146259
3,Fraxinus excelsior,34.616669,23.336127
4,Betula pendula,45.466165,38.34913


In [75]:
treeData.tail()

Unnamed: 0,Species,Distance.m,Angle.degrees
115,Pinus sylvestris,33.405883,35.996387
116,Carpinus betulus,39.501562,30.298136
117,Fagus sylvatica,43.988915,27.079336
118,Carpinus betulus,37.849727,38.573435
119,Alnus glutinosa,43.528852,36.761984


In [86]:
treeData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Species        120 non-null    object 
 1   Distance.m     120 non-null    float64
 2   Angle.degrees  120 non-null    float64
dtypes: float64(2), object(1)
memory usage: 2.9+ KB


In [88]:
print(treeData.shape)
print(treeData.size)

(120, 3)
360


### Cleaning

In [105]:
df = pd.read_csv("data/dirtydata.csv")
df.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


In [116]:
df = pd.read_csv("data/dirtydata.csv")
# .size is the total number of cells (rows x columns), not the number of rows
print(df.size)

# drop every row that contains at least one missing value
df.dropna(inplace = True) # inplace=True modifies df itself instead of returning a new frame

print(df.size)

160
145


In [118]:
df = pd.read_csv("data/dirtydata.csv")

# replace the na values
# NOTE(review): a scalar fill value is applied to missing cells in EVERY column,
# so a missing Date would also become 130 - consider a per-column fill,
# e.g. passing a dict that maps column names to fill values.
df.fillna(130, inplace = True) 

### Convert datatype

In [123]:
df = pd.read_csv("data/dirtydata.csv")
# remove rows with missing values before converting
df.dropna(inplace = True)
# parse the Date column into datetime values (shown as datetime64[ns] by info())
df["Date"] = pd.to_datetime(df["Date"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Duration  29 non-null     int64         
 1   Date      29 non-null     datetime64[ns]
 2   Pulse     29 non-null     int64         
 3   Maxpulse  29 non-null     int64         
 4   Calories  29 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 KB


### Correcting data

In [132]:
df = pd.read_csv("data/dirtydata.csv")


# replace value: set Duration to 45 in the row labelled 7
df.loc[7, "Duration"] = 45 

# drop row: show the first rows before and after removing the row labelled 0
print(df.head())
df.drop(0, inplace = True) # index 0
print(df.head())

   Duration          Date  Pulse  Maxpulse  Calories
0        60  '2020/12/01'    110       130     409.1
1        60  '2020/12/02'    117       145     479.0
2        60  '2020/12/03'    103       135     340.0
3        45  '2020/12/04'    109       175     282.4
4        45  '2020/12/05'    117       148     406.0
   Duration          Date  Pulse  Maxpulse  Calories
1        60  '2020/12/02'    117       145     479.0
2        60  '2020/12/03'    103       135     340.0
3        45  '2020/12/04'    109       175     282.4
4        45  '2020/12/05'    117       148     406.0
5        60  '2020/12/06'    102       127     300.0


In [138]:
df = pd.read_csv("data/dirtydata.csv")
# duplicated() gives one boolean per row: True where the row repeats an earlier row
print(df.duplicated())
# remove duplicate rows, keeping the first occurrence of each
df.drop_duplicates(inplace = True) 

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool
