# Arrays

Arrays are contiguous sequences of values of a homogeneous type.  The simplest way to emulate this in Python is to use the `list` datatype.  However, lists do not enforce homogeneity: a single list can hold values of different types.

In [4]:
# Define a function over a list
def mymean(x):
    """Return the arithmetic mean of the values in x.

    The values are accumulated one at a time in a plain Python loop and the
    total is divided by len(x), so x must support both iteration and len().
    An empty x raises ZeroDivisionError.
    """
    total = 0
    for value in x:
        total += value
    count = len(x)
    return total / count

In [5]:
# Build a large list of floats and confirm that mymean gives the expected value
# (swap in a smaller input, e.g. range(10000), for a quicker run)
x = [float(i) for i in range(10_000_000)]
print(mymean(x))

4999999.5


# Benchmarking

> Do not try to optimize what you have not measured.

Before we try to improve the speed of code, we first need to know:

- whether it can in fact be improved, or whether it is already as fast as possible given the hardware
- which parts of the code we should target for optimization

In [6]:
# timeit is the standard-library benchmarking module; the %timeit magic used below is built into IPython and does not need this import
import timeit

In [7]:
# Actually measure the time for the function defined earlier
%timeit mymean(x)

193 ms ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Another library - this provides other ways of handling arrays
import numpy as np

In [9]:
print(np.mean(x))

4999999.5


In [10]:
%timeit np.mean(x)

272 ms ± 7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
xn = np.array(x)
%timeit np.mean(xn)

9.75 ms ± 208 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Memory Usage

The notebook interface (or `ipython` in general) allows us to easily see the various elements that exist in the namespace, along with some more information about them.  Use the `who` or `whos` commands for this.

In [12]:
whos

Variable   Type        Data/Info
--------------------------------
mymean     function    <function mymean at 0x0000023AAA128900>
np         module      <module 'numpy' from 'c:\<...>ges\\numpy\\__init__.py'>
timeit     module      <module 'timeit' from 'c:<...>thon312\\Lib\\timeit.py'>
x          list        n=10000000
xn         ndarray     10000000: 10000000 elems, type `float64`, 80000000 bytes (76.2939453125 Mb)


# 2-D arrays

How do we represent a 2-D array in Python?  The most obvious way is a list of lists.

In [13]:
# 100 rows of 100 consecutive integers; row number `start` begins at `start`
xm = [list(range(start, start + 100)) for start in range(100)]
print(xm)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 

In [14]:
print(xm[3][4])

7


In [15]:
def rowmean(xm):
    """Return a list with the mean of each row of xm.

    xm is iterated once; every item is passed to mymean and the results are
    collected in the same order.
    """
    return [mymean(row) for row in xm]

In [16]:
print(rowmean(xm))

[49.5, 50.5, 51.5, 52.5, 53.5, 54.5, 55.5, 56.5, 57.5, 58.5, 59.5, 60.5, 61.5, 62.5, 63.5, 64.5, 65.5, 66.5, 67.5, 68.5, 69.5, 70.5, 71.5, 72.5, 73.5, 74.5, 75.5, 76.5, 77.5, 78.5, 79.5, 80.5, 81.5, 82.5, 83.5, 84.5, 85.5, 86.5, 87.5, 88.5, 89.5, 90.5, 91.5, 92.5, 93.5, 94.5, 95.5, 96.5, 97.5, 98.5, 99.5, 100.5, 101.5, 102.5, 103.5, 104.5, 105.5, 106.5, 107.5, 108.5, 109.5, 110.5, 111.5, 112.5, 113.5, 114.5, 115.5, 116.5, 117.5, 118.5, 119.5, 120.5, 121.5, 122.5, 123.5, 124.5, 125.5, 126.5, 127.5, 128.5, 129.5, 130.5, 131.5, 132.5, 133.5, 134.5, 135.5, 136.5, 137.5, 138.5, 139.5, 140.5, 141.5, 142.5, 143.5, 144.5, 145.5, 146.5, 147.5, 148.5]


In [17]:
xmn = np.array(xm)
print(type(xmn[0]))

<class 'numpy.ndarray'>


In [18]:
np.mean(xmn)

99.0

In [19]:
# axis=1 averages along each row, which is what rowmean(xm) computes;
# axis=0 would give the column means instead
print(np.mean(xmn, axis=1))

[ 49.5  50.5  51.5  52.5  53.5  54.5  55.5  56.5  57.5  58.5  59.5  60.5
  61.5  62.5  63.5  64.5  65.5  66.5  67.5  68.5  69.5  70.5  71.5  72.5
  73.5  74.5  75.5  76.5  77.5  78.5  79.5  80.5  81.5  82.5  83.5  84.5
  85.5  86.5  87.5  88.5  89.5  90.5  91.5  92.5  93.5  94.5  95.5  96.5
  97.5  98.5  99.5 100.5 101.5 102.5 103.5 104.5 105.5 106.5 107.5 108.5
 109.5 110.5 111.5 112.5 113.5 114.5 115.5 116.5 117.5 118.5 119.5 120.5
 121.5 122.5 123.5 124.5 125.5 126.5 127.5 128.5 129.5 130.5 131.5 132.5
 133.5 134.5 135.5 136.5 137.5 138.5 139.5 140.5 141.5 142.5 143.5 144.5
 145.5 146.5 147.5 148.5]


In [20]:
%timeit rowmean(xm)

205 µs ± 3.96 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [21]:
%timeit np.mean(xmn, axis=0)

11.5 µs ± 338 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [22]:
print(xmn)

[[  0   1   2 ...  97  98  99]
 [  1   2   3 ...  98  99 100]
 [  2   3   4 ...  99 100 101]
 ...
 [ 97  98  99 ... 194 195 196]
 [ 98  99 100 ... 195 196 197]
 [ 99 100 101 ... 196 197 198]]


# File I/O

Rather than just creating dummy data, we should be able to read in data from other sources.  Most often, these sources are files containing data, so one simple way to do this is to read the data from a file.  For now, we will assume that the data is stored as text, one row per line.  This is NOT the most efficient way to store data, but it is the easiest to work with.

In [25]:
# Fill a 100x100 array with random floats from NumPy and save it as a text file
import random
r = np.random.random(size=(100, 100))
np.savetxt(fname='random.txt', X=r)

In [26]:
# Now read in to a regular list of lists: one inner list of floats per line
# The with-block closes the file as soon as the data has been read
with open('random.txt', 'r') as f:
    rm = [[float(field) for field in line.split()] for line in f]
print(rm)

[[0.1598803203045669, 0.7274917005429364, 0.6941308001772654, 0.8337858693517982, 0.6807788262945837, 0.3134505217329747, 0.6417236655791736, 0.33937563378246793, 0.3653062453777356, 0.2539386472158385, 0.2575657252729312, 0.33213876586131275, 0.685012041669872, 0.2191487362131398, 0.9911141336387979, 0.2358672408190905, 0.508185977431454, 0.43994828516576745, 0.4133767261345659, 0.2260746069209335, 0.05040651997662682, 0.42537414393465944, 0.9715065597180507, 0.5937357121440465, 0.5460739085374235, 0.1389894256637012, 0.6296242728088245, 0.3939729358035994, 0.8949545298668977, 0.7832860428517275, 0.6366572477936295, 0.7672818636637825, 0.28508033743564476, 0.7450093544631271, 0.03486847721266839, 0.379652758993973, 0.8325978983695296, 0.3655593671607553, 0.21781226648356722, 0.3997303849773197, 0.6263226092882596, 0.9331151699426145, 0.7616090364541366, 0.8798148014856836, 0.5975064666246765, 0.1476567989237313, 0.8916282196734852, 0.9238399333195937, 0.8067882892176467, 0.89941292930

In [None]:
print(rowmean(rm))

In [None]:
# Alternative using numpy methods - only for appropriately formatted data of course
rmn = np.loadtxt('random.txt')

In [None]:
print(np.mean(rmn, axis=1))

In [None]:
print(rmn[0,0:10])

# Generalize File I/O and Strings

What if we want to read something like strings that cannot be handled by numpy?

In [None]:
# Count the whitespace-separated words that are exactly "the"
# (the comparison is case-sensitive, so "The" or "the," do not match)
# The with-block closes the file once every line has been read
with open('randomtext.txt', 'r') as fr:
    count = 0
    for line in fr:
        count += line.split().count("the")
print(count)

## Fake files using StringIO

For our testing purposes, we may not always be able to read/write files.  Instead we will create *fake* files where we emulate the behaviour of a file using data from a string.

In [None]:
randstr = """There are many variations of passages of Lorem Ipsum available, 
but the majority have suffered alteration in some form, by injected humour, 
or randomised words which don't look even slightly believable. 
If you are going to use a passage of Lorem Ipsum, you need to be sure 
there isn't anything embarrassing hidden in the middle of text. 
All the Lorem Ipsum generators on the Internet tend to repeat predefined 
chunks as necessary, making this the first true generator on the Internet. 
It uses a dictionary of over 200 Latin words, combined with a handful of 
model sentence structures, to generate Lorem Ipsum which looks reasonable. 
The generated Lorem Ipsum is therefore always free from repetition, 
injected humour, or non-characteristic words etc.
"""

In [None]:
import io
# Wrap the string in a file-like object so it can be read line by line
sfr = io.StringIO(randstr)
count = 0
for line in sfr:
    # Tally the words on this line that are exactly "the"
    count += line.split().count("the")
print(count)

In [None]:
from functools import cache
import timeit
# Memoize the results: without the cache the two recursive calls repeat the
# same sub-problems over and over, so the number of calls grows exponentially
# with n.  Once a value is cached, calling fib again with the same n is
# answered from the cache without recursing.
@cache
def fib(n):
    """Return the n-th Fibonacci number, with fib(1) == fib(2) == 1.

    Any n <= 2 (including 0 and negative values) returns 1.
    """
    if n <= 2:
        return 1
    return fib(n - 1) + fib(n - 2)

print(fib(46))

In [None]:
%timeit fib(46)