# 2. Arrays - Part 2

In [6]:
import numpy as np
import numpy

## Structured arrays

A structured array consists of a number of columns, where each column can be a different datatype. 

Full information about structured arrays: 
http://docs.scipy.org/doc/numpy-1.10.1/user/basics.rec.html#structured-arrays

One of the possible ways to specify a structured array is to use a list of tuples as `dtype`:
For every column in the array a tuple is specified with the name of the column and the type of data in it. For example: 

In [1]:
# Field layout of the structured array: one (column name, type) tuple per column.
dtype = [
    ('Name', 'U10'),      # unicode string, at most 10 characters
    ('Country', 'U10'),   # unicode string, at most 10 characters
    ('Area', 'float64'),  # 64-bit floating point number
]

In [2]:
# Column layout for the city table: two 10-character unicode fields
# followed by one float64 field.
dtype = [
    ('Name', 'U10'),
    ('Country', 'U10'),
    ('Area', 'float64'),
]

In [5]:
# Show the field specification: a list of (column name, type) tuples
print(dtype)

[('Name', 'U10'), ('Country', 'U10'), ('Area', 'float64')]


The content of the array can then be given as a list of tuples, like so:

In [7]:
# Build the structured array: one tuple per row, with the fields laid out as
# described by `dtype` (Name, Country, Area).
# NOTE(review): 'Netherlands' has 11 characters, but the Country field is
# declared as 'U10', so the stored value is cut to 'Netherland' (see the
# printed result). Widen the field (e.g. 'U11' or larger) if the full name
# must be kept.
city = numpy.array([('Amsterdam', 'Netherlands', 219.3),
                    ('Paris',     'France',      105.4 ),
                    ('Barcelona', 'Spain',       101.9 )],
                     dtype=dtype)
print(city)

[('Amsterdam', 'Netherland', 219.3) ('Paris', 'France', 105.4)
 ('Barcelona', 'Spain', 101.9)]


In [8]:
# Number of dimensions: the structured array counts as 1-D (a single axis of rows)
city.ndim

1

In [9]:
# Select three columns by field name; the cell result is a tuple of three arrays
city['Country'], city['Area'], city['Name']

(array(['Netherland', 'France', 'Spain'], dtype='<U10'),
 array([219.3, 105.4, 101.9]),
 array(['Amsterdam', 'Paris', 'Barcelona'], dtype='<U10'))

In [11]:
# Each field name returns the whole column as its own array;
# the three selections are displayed together as a tuple
city['Country'], city['Area'], city['Name']

(array(['Netherland', 'France', 'Spain'], dtype='<U10'),
 array([219.3, 105.4, 101.9]),
 array(['Amsterdam', 'Paris', 'Barcelona'], dtype='<U10'))

### Indexing structured arrays
The rows in a structured array can be accessed by regular indexing. The columns of the array can be accessed by using the column names that were specified when the array was created.

In [8]:
# Rows are addressed with an ordinary integer index; position 0 is the first record
first_row = city[0]
print(first_row)

('Amsterdam', 'Netherland', 219.3)


In [9]:
# Display the first row as the cell result (no print): a single record of the array
city[0]

('Amsterdam', 'Netherland', 219.3)

In [12]:
# Slicing works as for any 1-D array: rows 0 and 1 (the stop index 2 is excluded)
print(city[:2])

[('Amsterdam', 'Netherland', 219.3) ('Paris', 'France', 105.4)]


In [13]:
# A single field name selects that whole column
area_column = city['Area']
print(area_column)

[219.3 105.4 101.9]


In [14]:
# A list of field names selects several columns at once
wanted_fields = ['Name', 'Area']
print(city[wanted_fields])

[('Amsterdam', 219.3) ('Paris', 105.4) ('Barcelona', 101.9)]


In [15]:
# Select the Name and Country columns together using a list of field names
print(city[['Name', 'Country']])

[('Amsterdam', 'Netherland') ('Paris', 'France') ('Barcelona', 'Spain')]


In [14]:
# Summarise the array: its shape (number of rows) and its field layout
array_summary = (city.shape, city.dtype)
print(*array_summary)

(3,) [('Name', '<U10'), ('Country', '<U10'), ('Area', '<f8')]


In [17]:
# Print the shape and the field layout on separate lines.
# Two statements are used instead of the tuple `print(...), print(...)`:
# print() returns None, so the tuple form makes the cell evaluate to
# (None, None), which the notebook then displays as an extra, meaningless result.
print(city.shape)
print(city.dtype)

(3,)
[('Name', '<U10'), ('Country', '<U10'), ('Area', '<f8')]


(None, None)

Note that structured arrays like this one, even though they have rows and columns, 
are treated as one-dimensional.

### Accessing and modifying column names

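The column names of a structured array are available through `dtype.names`, and assigning a new tuple of names to this attribute renames the columns. For example: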


In [15]:
# The field (column) names, in order, as a tuple of strings
city.dtype.names

('Name', 'Country', 'Area')

In [18]:
# Read the current column names from the array's dtype
city.dtype.names

('Name', 'Country', 'Area')

In [16]:
# Rename every field at once by assigning a new tuple of names to dtype.names;
# the column is then reachable under its new name
city.dtype.names = 'name', 'country', 'area'
print(city['area'])

[219.3 105.4 101.9]


In [20]:
# Assign lowercase field names, then display the updated names tuple as the cell result
city.dtype.names = ('name','country', 'area')
city.dtype.names

('name', 'country', 'area')

In [21]:
# Full dtype of the array: each field name paired with its storage type
city.dtype

dtype([('name', '<U10'), ('country', '<U10'), ('area', '<f8')])

### Loading data into structured arrays

Structured arrays are useful for loading and working with tabular data with heterogeneous column types. 

#### Exercise 2b.1

Complete the following code loading the data from file [populations.txt](populations.txt). Load the year column as an `int`, and the other columns as `float`.

In [22]:
# Column layout: an 8-byte integer 'year' column followed by three 8-byte float columns
dtype = [('year', 'i8')] + [(name, 'f8') for name in ('hare', 'lynx', 'carrot')]
# Read the file into a structured array using that layout
population = numpy.loadtxt("populations.txt", dtype=dtype)
print(population.dtype)


[('year', '<i8'), ('hare', '<f8'), ('lynx', '<f8'), ('carrot', '<f8')]


An alternative way of loading tabular data using `genfromtxt`:

In [23]:
# names=True makes genfromtxt take the field names from the file itself;
# the dtype list gives one type per column (int, then three floats)
population = numpy.genfromtxt(
    "populations.txt",
    names=True,
    dtype=['int'] + ['float'] * 3,
)

# Access lynx column
print(population['lynx'])

# Resulting field layout
print(population.dtype)

[ 4000.  6100.  9800. 35200. 59400. 41700. 19000. 13000.  8300.  9100.
  7400.  8000. 12300. 19500. 45700. 51100. 29700. 15800.  9700. 10100.
  8600.]
[('year', '<i8'), ('hare', '<f8'), ('lynx', '<f8'), ('carrot', '<f8')]


In [29]:
# Load the table with genfromtxt: field names come from the file (names=True),
# column types from the dtype list
column_types = ['int', 'float', 'float', 'float']
population = np.genfromtxt("populations.txt", names=True, dtype=column_types)

# One column, selected by field name
print(population['lynx'])

# Field layout of the loaded array
print(population.dtype)

[ 4000.  6100.  9800. 35200. 59400. 41700. 19000. 13000.  8300.  9100.
  7400.  8000. 12300. 19500. 45700. 51100. 29700. 15800.  9700. 10100.
  8600.]
[('year', '<i8'), ('hare', '<f8'), ('lynx', '<f8'), ('carrot', '<f8')]


### Record arrays
There is a special interface to structured arrays called **record arrays**. For details, see https://docs.scipy.org/doc/numpy-1.10.1/user/basics.rec.html#record-arrays

In [38]:
# Wrap the structured array in a record array so that fields can be read as attributes
pop2 = np.rec.array(population)
print(type(pop2))

# A whole column through attribute access
print(pop2.year)

# Attribute access also works on a slice: first three rows only
first_three = pop2[0:3]
print(first_three.carrot)
print(first_three.hare)

<class 'numpy.recarray'>
[1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913
 1914 1915 1916 1917 1918 1919 1920]
[48300. 48200. 41500.]
[30000. 47200. 70200.]


## Array Indexing

For complete information  about indexing see
http://docs.scipy.org/doc/numpy/user/basics.indexing.html

You have already seen how to access the content of an array by using an index for each dimension. This method is known as matrix indexing. In addition to matrix indexing, there are other ways to address content in an array:

- Linear indexing transforms an n-dimensional array to a 1-dimensional list. This linear index is returned when the `argmin` and `argmax` functions are applied to an n-dimensional array.

In [20]:
# 5x5 array of random numbers drawn uniformly between -1 and 1
a = numpy.random.uniform(low=-1, high=1, size=(5, 5))
print(a)
# Return the (linear) index of the maximum value
a.argmax()

[[ 0.5742096   0.75687853  0.35885174  0.16858814 -0.26401042]
 [ 0.9158619   0.74355223 -0.6763735  -0.69540853  0.36883388]
 [-0.1085984   0.96037587  0.07502181 -0.37324387  0.44493281]
 [ 0.65608343  0.15671954 -0.1385521  -0.40119154 -0.14156916]
 [ 0.24731078  0.00809253 -0.87772442  0.59791545 -0.83205954]]


11

In [41]:
# Another 5x5 array of random numbers between -1 and 1
b = np.random.uniform(low=-1, high=1, size=(5, 5))
print(b)

# Linear indices of the largest and the smallest value
print(b.argmax(), b.argmin())

[[ 0.98343966  0.91467061  0.23534256  0.56683186  0.55814336]
 [-0.10145228  0.59607992  0.62880735  0.54511689  0.37513249]
 [ 0.16355048  0.53314915  0.88002621  0.64323189  0.98143013]
 [ 0.03717994 -0.70257969  0.76595041 -0.96604096  0.31457879]
 [ 0.50485088 -0.5593839  -0.62020545 -0.5856025  -0.74651862]]
0 18


- Boolean indexing, which returns all values in the array for which the index is True.

In [21]:
# 5x5 array of random numbers between -1 and 1
a = numpy.random.uniform(-1, 1, (5,5))
# Create a boolean index for positive numbers in array a
print(a)
index = a > 0.
print(index)
# Return the positive numbers of the first column only:
# column 0 of a is filtered with column 0 of the mask
# (a[index] would return the positive numbers of the whole array)
print(a[:,0][index[:,0]])

[[-0.98377591  0.06072144  0.99218221 -0.83158922 -0.41705789]
 [-0.83507594 -0.05563875 -0.15104522 -0.43933757 -0.31865632]
 [ 0.83525921  0.77563887  0.66405576 -0.92629227 -0.72016054]
 [-0.45154832 -0.83404489 -0.61577725  0.99245354  0.25021523]
 [ 0.81220112 -0.05254329 -0.30535853  0.25473508  0.31116835]]
[[False  True  True False False]
 [False False False False False]
 [ True  True  True False False]
 [False False False  True  True]
 [ True False False  True  True]]
[0.83525921 0.81220112]


In [49]:
# Boolean mask marking the positive entries of b
index2 = b > 0
print(b)
print(index2)

# Positive values in the first column: rows picked by the mask, column fixed at 0
print(b[index2[:, 0], 0])

# Positive values in the last column (index 4)
print(b[index2[:, 4], 4])

[[ 0.98343966  0.91467061  0.23534256  0.56683186  0.55814336]
 [-0.10145228  0.59607992  0.62880735  0.54511689  0.37513249]
 [ 0.16355048  0.53314915  0.88002621  0.64323189  0.98143013]
 [ 0.03717994 -0.70257969  0.76595041 -0.96604096  0.31457879]
 [ 0.50485088 -0.5593839  -0.62020545 -0.5856025  -0.74651862]]
[[ True  True  True  True  True]
 [False  True  True  True  True]
 [ True  True  True  True  True]
 [ True False  True False  True]
 [ True False False False False]]
[0.98343966 0.16355048 0.03717994 0.50485088]
[0.55814336 0.37513249 0.98143013 0.31457879]


- Indexing with an array of indices. In this case you specify a separate array in which you store the indices as integers and you will return exactly the elements of the array with these indices. 

In [23]:
# Ten evenly spaced values from 0 to 1 (both ends included)
b = numpy.linspace(0, 1, num=10)
print(b)

# Integer positions to pick out: the prime indices below 10
index = numpy.array([2, 3, 5, 7])

# Indexing with the integer array returns exactly the elements at those positions
selected = b[index]
print(selected)

[0.         0.11111111 0.22222222 0.33333333 0.44444444 0.55555556
 0.66666667 0.77777778 0.88888889 1.        ]
[0.22222222 0.33333333 0.55555556 0.77777778]


### Linear and matrix indexing

Indexing in a 1-dimensional matrix is similar to indexing in a Python list.

Indexing in an n-dimensional matrix uses one index for every dimension. To access one element of the array, the index of every dimension should be given. When accessing more than one element, the slice syntax `m:n` can be used; it works the same way as with lists, except that `m:n` can be used for every dimension.

If the index is `[m:n]`, then the indices that are used are `m` up to but not including `n`.

If you have the linear index and you want to convert it to a matrix index, you can use the function `numpy.unravel_index()`.

The first argument is the linear index and the second argument is the shape of the array for which you want to transform the index. For example: `numpy.unravel_index(linear_index, (2,3))`. 

In [50]:
# Example 3-dimensional array: the integers 0..23 arranged as
# 2 blocks of 3 rows by 4 columns
z = numpy.arange(2 * 3 * 4).reshape(2, 3, 4)
print(z)

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]


In [54]:
# slices (all indices are 0-based)
print(z[0:2, 1:3, 3]) # both blocks (0 and 1), rows 1 and 2 (2nd and 3rd), column index 3 (4th column)
print()
print(z[:, 2, :]) # all blocks, row index 2 (3rd row), all columns

[[ 7 11]
 [19 23]]

[[ 8  9 10 11]
 [20 21 22 23]]


In [33]:
# linear indexing: convert a position in the flattened array back to one
# index per dimension of z
linear_index = 10
matrix_index = numpy.unravel_index(linear_index, z.shape)
print("\n For a array with dimensions (2, 3, 4), the linear index: ", linear_index,
      " is equal to multidimensional index: ", matrix_index)


 For a array with dimensions (2, 3, 4), the linear index:  10  is equal to multidimensional index:  (0, 2, 2)


#### Exercise 2b.2

Create a $4\times3$ matrix of random numbers between $0$ and $1$. 
Find the row and column position of the minimum and the maximum value.

In [51]:
# 4x3 matrix of random numbers between 0 and 1
cc = np.random.uniform(low=0, high=1, size=(4, 3))
print(cc)

[[0.5972184  0.26834783 0.44216787]
 [0.55833044 0.58313137 0.43535966]
 [0.44781663 0.85979294 0.28031132]
 [0.12637611 0.49124254 0.43658165]]


In [62]:
# Linear (flattened) positions of the smallest and the largest value
flat_min = cc.argmin()
flat_max = cc.argmax()
print("min index: ", flat_min, "\n", "max index: ", flat_max)
print()

# Convert each linear position into a (row, column) pair. The shape is read
# from the array itself instead of being hard-coded as (4,3), so the reported
# positions stay correct if cc is created with a different 2-D shape.
n_rows, n_cols = cc.shape
report = "for an array with dimensions ({}X{}), the multidimensional index of {}()={}, is {}"
print(report.format(n_rows, n_cols, "argmin", flat_min, np.unravel_index(flat_min, cc.shape)))

print()
print(report.format(n_rows, n_cols, "argmax", flat_max, np.unravel_index(flat_max, cc.shape)))



min index:  9 
 max index:  7

for an array with dimensions (4X3), the multidimensional index of argmin()=9, is (3, 0)

for an array with dimensions (4X3), the multidimensional index of argmax()=7, is (2, 1)


#### Exercise 2b.3 

Complete the following code to print years with the smallest number of hares, lynxes and carrots in the 
populations dataset.

In [66]:
# For each column, find the row holding its smallest value and report that row's year
for species in ('hare', 'lynx', 'carrot'):
    row_of_minimum = population[species].argmin()
    year = population['year'][row_of_minimum]
    print("Least # of {} in year {}".format(species, year))

Least # of hare in year 1917
Least # of lynx in year 1900
Least # of carrot in year 1916


In [68]:
# Same lookup for the maxima: row of the largest value per column, then its year
for i in ('hare', 'lynx', 'carrot'):
    row_of_maximum = population[i].argmax()
    year = population['year'][row_of_maximum]
    print("Most # of {} in year {}".format(i, year))

Most # of hare in year 1903
Most # of lynx in year 1904
Most # of carrot in year 1900


### Boolean indexing

A boolean index can be created directly, but most often it is built by specifying a certain condition.

The condition will return a `True` or `False` for every position in the array and when the condition is True then the corresponding element will be retrieved.

In [73]:
# Boolean indexing with an explicitly written mask:
# the mask has one True/False entry per element of x
x = numpy.arange(1, 6)
y = numpy.array([True, False, True, False, True])
print("Only elements of x for which the value in y is True: ", x[y])

# Boolean indexing with a mask computed from a condition on x itself
above_three = x > 3
print("Only elements of x for which the condition is True: ", x[above_three])

Only elements of x for which the value in y is True:  [1 3 5]
Only elements of x for which the condition is True:  [4 5]


In [74]:
# Boolean-mask selection: keep the elements of x at the positions where y is True.
# (The two bare type(...) calls that preceded this line were removed: their
# results were neither printed nor the cell's last expression, so they had no
# visible effect.)
print(x[y])

[1 3 5]


#### Exercise 2b.4
Use the population data to

1. Select all the years in which there are more than 50000 lynxes;
2. Select all the years in which there are more lynxes than hares.

In [77]:
# 1. Years in which there are more than 50000 lynxes
many_lynxes = population['lynx'] > 50000
pop_sub = population['year'][many_lynxes]
print(pop_sub)
print()

# 2. Years in which there are more lynxes than hares
lynx_over_hare = population['lynx'] > population['hare']
pop_sub2 = population['year'][lynx_over_hare]
print(pop_sub2)

[1904 1915]

[1904 1905 1906 1915 1916 1917]


In [78]:
# Select all the years in which there are more than 50000 lynxes:
# filter the rows with the mask first, then take the year column
print('years with more than 50000 lynxes :', population[population['lynx'] > 50000]['year'])

# Select all the years in which there are more lynxes than hares
print('years with more lynxes than hares:', population[population['lynx'] > population['hare']]['year'])

years with more than 50000 lynxes : [1904 1915]
years with more lynxes than hares: [1904 1905 1906 1915 1916 1917]


In [79]:
# Select all the years in which there are more than 50000 lynxes.
# The boolean mask is used directly as the index. Wrapping it in a list
# (`[mask]`) is the deprecated "non-tuple sequence" indexing form; recent NumPy
# releases treat such a list as a 2-D boolean index and raise an IndexError
# for this 1-D array.
criteria = population['lynx'] > 50000
print('years with more than 50000 lynxes :', population['year'][criteria])

# Select all the years in which there are more lynxes than hares.
criteria = population['lynx'] > population['hare']
print('years with more lynxes than hares:', population['year'][criteria])

years with more than 50000 lynxes : [1904 1915]
years with more lynxes than hares: [1904 1905 1906 1915 1916 1917]


  
  


### Indexing with an array of indices

In this case you specify a separate array in which you store the indices as integers and you will return exactly the elements of the array with these indices.

One advantage of this is that you can explicitly specify the order in which you want to have the values, and you can return the value at a certain position multiple times.

In [80]:
# Values 100..110 and a list of positions to read from them;
# positions may repeat and may appear in any order
x = numpy.arange(100, 111)
y = numpy.array([8, 3, 8, 4, 9, 3])
print("Array x: ", x)
print("Array with indices: ", y)

# The result follows the order (and repetitions) given in y
picked = x[y]
print("Indexing with an array of indices will give:", picked)

Array x:  [100 101 102 103 104 105 106 107 108 109 110]
Array with indices:  [8 3 8 4 9 3]
Indexing with an array of indices will give: [108 103 108 104 109 103]


#### Exercise 2b.5

Indexing with an array is often useful when we want to randomize the order of items in some data. Complete the following code, which creates a scrambled version of the population data:

In [84]:
# Row positions 0 .. n-1 of the population table
index = np.arange(len(population))
# Shuffle the positions in place
numpy.random.shuffle(index)
# Indexing with the shuffled positions yields the rows in scrambled order
population_rand = population[index]
print(population_rand)
print()
print(population_rand['year'])

[(1901, 47200.,  6100., 48200.) (1916, 11200., 29700., 36700.)
 (1906, 18100., 19000., 38600.) (1919, 16200., 10100., 41300.)
 (1913, 76600., 19500., 40900.) (1912, 57000., 12300., 43800.)
 (1915, 19500., 51100., 39000.) (1903, 77400., 35200., 38200.)
 (1909, 25400.,  9100., 42100.) (1918, 14600.,  9700., 43300.)
 (1911, 40300.,  8000., 46800.) (1917,  7600., 15800., 41800.)
 (1905, 20600., 41700., 39800.) (1908, 22000.,  8300., 44500.)
 (1900, 30000.,  4000., 48300.) (1902, 70200.,  9800., 41500.)
 (1904, 36300., 59400., 40600.) (1920, 24700.,  8600., 47300.)
 (1914, 52300., 45700., 39400.) (1907, 21400., 13000., 42300.)
 (1910, 27100.,  7400., 46000.)]

[1901 1916 1906 1919 1913 1912 1915 1903 1909 1918 1911 1917 1905 1908
 1900 1902 1904 1920 1914 1907 1910]


## Vector stacking

Sometimes you want to combine two or more vectors to create an array. This is called vector stacking. Vector stacking can be done in two different ways: horizontally and vertically.
- horizontal stack: `numpy.hstack([x, y, z])`
- vertical stack: `numpy.vstack([x, y, z])`

In [85]:
# Three consecutive runs of five integers: 0-4, 5-9 and 10-14
x, y, z = (numpy.arange(start, start + 5) for start in (0, 5, 10))
vectors = (x, y, z)
# hstack joins the vectors end to end; vstack places them as rows of a 2-D array
print("Horizontal stack: ", numpy.hstack(vectors))
print("Vertical stack: ")
print(numpy.vstack(vectors))

Horizontal stack:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Vertical stack: 
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]


### Save data set to file

To save an array from numpy as a separate file you specify the filename and the array you want to save. Use the following functions:
- `numpy.savetxt(filename, array)` : save an array to a text file. Some optional arguments are: delimiter=' ', newline = '\n', header = ' '. http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.savetxt.html#numpy.savetxt
- `numpy.save(filename, array)` : save an array to a binary file in numpy `.npy` format. http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.save.html#numpy.save


In [90]:
# Stack the three vectors as rows of one array, then write it out in both formats.
# (A bare `mm` expression that sat between the statements was removed: it was
# not the cell's last expression, so it displayed nothing.)
mm = np.vstack([x, y, z])
np.savetxt("mm.txt", mm)   # text file
np.save("mm.npy", mm)      # binary file in NumPy .npy format

#### Exercise 2b.6 

Save the population data to a `.npy` file. Figure out how to load it back into a numpy array.

#### Exercise 2b.7
The files

- [irisa.txt](irisa.txt)
- [irisb.txt](irisb.txt)
- [irisc.txt](irisc.txt)

contain the data for the iris dataset. Each file has these columns:

- `SepalLength` 
- `SepalWidth`
- `PetalLength` 
- `PetalWidth` 
- `Species`

Load this data, and create a single array with all the species.

In [92]:
# Exercise 2b.6: write the population array to a .npy file ...
np.save("populations.npy",population)
# ... and read it back; the loaded array is displayed as the cell result
np.load("populations.npy")

array([(1900, 30000.,  4000., 48300.), (1901, 47200.,  6100., 48200.),
       (1902, 70200.,  9800., 41500.), (1903, 77400., 35200., 38200.),
       (1904, 36300., 59400., 40600.), (1905, 20600., 41700., 39800.),
       (1906, 18100., 19000., 38600.), (1907, 21400., 13000., 42300.),
       (1908, 22000.,  8300., 44500.), (1909, 25400.,  9100., 42100.),
       (1910, 27100.,  7400., 46000.), (1911, 40300.,  8000., 46800.),
       (1912, 57000., 12300., 43800.), (1913, 76600., 19500., 40900.),
       (1914, 52300., 45700., 39400.), (1915, 19500., 51100., 39000.),
       (1916, 11200., 29700., 36700.), (1917,  7600., 15800., 41800.),
       (1918, 14600.,  9700., 43300.), (1919, 16200., 10100., 41300.),
       (1920, 24700.,  8600., 47300.)],
      dtype=[('year', '<i8'), ('hare', '<f8'), ('lynx', '<f8'), ('carrot', '<f8')])

In [101]:
# Field layout of the iris files: four float64 measurement columns
# followed by the species name as a unicode string of at most 10 characters
dtype = [
    ("SepalLength", "float64"),
    ("SepalWidth", "float64"),
    ("PetalLength", "float64"),
    ("PetalWidth", "float64"),
    ("Species", "U10"),
]


In [102]:
# Load the three iris files, in order, with the same field layout
irisa, irisb, irisc = (
    np.loadtxt(filename, dtype=dtype)
    for filename in ("irisa.txt", "irisb.txt", "irisc.txt")
)

In [104]:
# Join the three per-file arrays end to end into a single array
iris_parts = (irisa, irisb, irisc)
iris = np.hstack(iris_parts)
print(iris)

[(5.1, 3.5, 1.4, 0.2, 'setosa') (4.9, 3. , 1.4, 0.2, 'setosa')
 (4.7, 3.2, 1.3, 0.2, 'setosa') (4.6, 3.1, 1.5, 0.2, 'setosa')
 (5. , 3.6, 1.4, 0.2, 'setosa') (5.4, 3.9, 1.7, 0.4, 'setosa')
 (4.6, 3.4, 1.4, 0.3, 'setosa') (5. , 3.4, 1.5, 0.2, 'setosa')
 (4.4, 2.9, 1.4, 0.2, 'setosa') (4.9, 3.1, 1.5, 0.1, 'setosa')
 (5.4, 3.7, 1.5, 0.2, 'setosa') (4.8, 3.4, 1.6, 0.2, 'setosa')
 (4.8, 3. , 1.4, 0.1, 'setosa') (4.3, 3. , 1.1, 0.1, 'setosa')
 (5.8, 4. , 1.2, 0.2, 'setosa') (5.7, 4.4, 1.5, 0.4, 'setosa')
 (5.4, 3.9, 1.3, 0.4, 'setosa') (5.1, 3.5, 1.4, 0.3, 'setosa')
 (5.7, 3.8, 1.7, 0.3, 'setosa') (5.1, 3.8, 1.5, 0.3, 'setosa')
 (5.4, 3.4, 1.7, 0.2, 'setosa') (5.1, 3.7, 1.5, 0.4, 'setosa')
 (4.6, 3.6, 1. , 0.2, 'setosa') (5.1, 3.3, 1.7, 0.5, 'setosa')
 (4.8, 3.4, 1.9, 0.2, 'setosa') (5. , 3. , 1.6, 0.2, 'setosa')
 (5. , 3.4, 1.6, 0.4, 'setosa') (5.2, 3.5, 1.5, 0.2, 'setosa')
 (5.2, 3.4, 1.4, 0.2, 'setosa') (4.7, 3.2, 1.6, 0.2, 'setosa')
 (4.8, 3.1, 1.6, 0.2, 'setosa') (5.4, 3.4, 1.5, 0.4, 's

In [106]:
# Number of dimensions of the combined array (structured arrays of rows are 1-D)
iris.ndim

1

In [109]:
# Select the flower with the highest sepal length:
# argmax gives the row position of the largest SepalLength value.
# (A bare `index` expression that followed the assignment was removed: it was
# not the cell's last expression, so it displayed nothing.)
index = iris['SepalLength'].argmax()
print(iris[index])

(7.9, 3.8, 6.4, 2., 'virginica')


In [110]:
# Select the species of the flower with the highest sepal length:
# take the Species column, then the row where SepalLength is largest
iris['Species'][iris['SepalLength'].argmax()]

'virginica'

In [111]:
# Select the species of the flower with the smallest sepal length:
# take the Species column, then the row where SepalLength is smallest
iris['Species'][iris['SepalLength'].argmin()]

'setosa'