# NumPy
NumPy is a library that allows us to work on tabular, spreadsheet-like (matrix) data structures. It is a significant improvement over using lists of lists to simulate tabular data 

In [2]:
#first, lets see how lists of lists look  

row1=[1,2,3]
row2=[4,5,6]
row3=[7,8,9]


listOfLists = [row1,row2,row3]
listOfLists

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [3]:
#import numpy
import numpy as np

In [4]:
#creating a numpy n-dimensional array, using the list of lists above
narray = np.array(listOfLists)
narray

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [5]:
#we can easily slice a row or column of a numpy array
narray[:,0]

array([1, 4, 7])

In [6]:
narray[2,:]

array([7, 8, 9])

In [7]:
#note the behavior with rows first, so index 2 returns row 2 (aka "row three" - the bottom one in this example)
narray[2]

array([7, 8, 9])

In [8]:
#nympy comes wit a wealth of extra functions, some similar to Python's built in ones, and many additional functions.
np.max(narray)

9

In [9]:
#asking for the minimum of row 1
np.min(narray[1])

4

In [10]:
#asking for the column index of the minimum of row 2. (like asking "where is the minimum of row 2 located")
np.argmin(narray[2])

0

In [11]:
#arrays can quickly be reshaped, into other applicable shapes. 3*3 can also be viewed as 9 (by one)
narray.reshape(9,)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
narray.reshape(3,3)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [13]:
#arrays can be transposed/flipped
narray.T

array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]])

### Note that NumPy has a peculiar way pf dealing with copies - "a copy is just a view" - any modifications to the copy is modifications to the original also

In [14]:
#creating a copy, which is a slice of the original
narrayCopy = narray[0]
narrayCopy

array([1, 2, 3])

In [15]:
#changing the copy
narrayCopy[1] = 7

In [16]:
narray

array([[1, 7, 3],
       [4, 5, 6],
       [7, 8, 9]])

Note how the original also changed in place 0,1

### lets learn how to import data from file into a NumPy array, to play around with more NumPy functions

# Importing CSV with numpy (this part is where we put emphasis on)
-----------------------------------------

Remember that NumPy has a great documentation online at numpy.org. This is an example of the documentation of np.genfromtxt as used below:
https://numpy.org/doc/stable/reference/generated/numpy.genfromtxt.html#numpy.genfromtxt

learning to quickly read documentation on new functions and then implement them is time well spent - as you will always come across new libraries with functions you would like to use.

In [2]:
import numpy as np
#The csv imported has been uploaded along 
#with this notebook
start_array = np.genfromtxt('sort of random data.csv',
                            dtype='float',
                            delimiter=';',
                            filling_values=0)

### Now, the start_array holds data we can investigate with numpys different functions

In [3]:
#The .shape tells us number of rows, and 
#columns, respectively
start_array.shape

(26, 7)

In [4]:
#Making a slice of the array 
#to inspect one column of the data
start_array[:,4]

array([298.12,  48.93,  29.61, 151.  ,  55.1 ,  95.45,  34.4 ,  86.94,
        64.69,  34.01,  31.25, 357.19, 216.56, 233.44,  24.29, 155.  ,
       151.61, 151.  ,  47.81, 170.64,  99.36,  42.3 ,  14.94,  43.85,
       267.7 , 106.61])

## Examine the two first columns of the data (display a slice)

In [5]:
start_array[:,:2]

array([[17.9237 , 59.6423 ],
       [17.0514 , 59.5093 ],
       [20.51447, 59.0481 ],
       [11.0534 , 60.0814 ],
       [11.1437 , 60.4586 ],
       [16.5732 , 55.6803 ],
       [ 6.71895, 55.365  ],
       [ 9.1138 , 55.7391 ],
       [19.45983, 60.287  ],
       [29.29154, 59.9821 ],
       [11.117  , 60.1922 ],
       [16.18511, 58.8726 ],
       [16.07554, 58.8818 ],
       [15.98789, 58.8885 ],
       [22.5235 , 58.266  ],
       [ 6.1523 , 56.3694 ],
       [ 7.1236 , 58.6813 ],
       [ 6.0798 , 59.6619 ],
       [14.1471 , 60.2908 ],
       [12.81027, 59.3149 ],
       [14.6657 , 56.4846 ],
       [ 4.70555, 60.5926 ],
       [24.73179, 61.6196 ],
       [ 9.1897 , 55.1424 ],
       [10.94208, 55.5425 ],
       [14.77893, 58.8833 ]])

## Ex 2 Create a function to return the max, min and mean value in a column


In [6]:
def descr(col):
    return {'max':np.max(col),
            'mean':np.mean(col),
           'min':np.min(col)}

In [7]:
descr(start_array[:,0])

{'max': 29.29154, 'mean': 14.079225, 'min': 4.70555}

In [None]:
[descr(start_array[:,i]) for i in range(start_array.shape[1])]


In [11]:
# information on the shape
num_rows = start_array.shape[0]
print('there are {} rows in the data'.format(num_rows))

num_cols = start_array.shape[1]
print('there are {} columns in the data'.format(num_cols))

there are 26 rows in the data
there are 7 columns in the data


# Think about: how to return top five rows based on values in column 1
https://numpy.org/doc/stable/reference/generated/numpy.argsort.html
Especially parameters and returns.


(use argsort as part of the implementation)

(better still: can you write this as a function, which takes a numpy row/column as input and returns the top five rows with their indices)

(note that argsort sorts ascending - we are looking for descending sorting. You can use "np.flip"/"np.fliplr" around the expression, OR numpy indexing, using [::-1] at the end of your expression, to reverse and obtain descending sorting


In [18]:
arr0=np.array([0,0,0])
arr1=np.array([3,3,3])
arr2=np.array([2,2,2])
arr = np.array([arr0,arr1,arr2])
print(arr, arr.shape)

[[0 0 0]
 [3 3 3]
 [2 2 2]] (3, 3)


In [34]:
s1 = np.argsort(arr)
print(s1)

s2= np.argsort(arr,axis=1)
print(s2)

[[0 1 2]
 [0 1 2]
 [0 1 2]]
[[0 1 2]
 [0 1 2]
 [0 1 2]]


In [40]:
def top5(ar):
    """input: an array in shape of (1,)
    output: iterable with indices of top five elements, highest value first"""
    ar_sorted_asc = np.argsort(ar) #by default argsort gives ascending sorting
    #ar_sorted_desc = np.argsort(ar)[::-1] #to obtain descending sorting, we reverse the order with [::-1] on the end . alternatively np.flip/np.fliplr
    
    return ar_sorted_asc[:5]

In [43]:
#quick loop to print out the top five values. Do a visual sanity check to see that your code returns descending values (column 1)

for element in top5(start_array[:,0]):
    print(start_array[element,:])

[ 4.70555e+00  6.05926e+01  3.37566e+03  1.08180e+02  4.23000e+01
 -3.30000e-01  3.36042e+03]
[ 6.07980e+00  5.96619e+01  6.46938e+03  1.46450e+02  1.51000e+02
 -9.75000e+00  6.49986e+03]
[6.152300e+00 5.636940e+01 1.187958e+04 2.547400e+02 1.550000e+02
 0.000000e+00 1.191768e+04]
[ 6.71895e+00  5.53650e+01  3.16992e+03  1.73940e+02  3.44000e+01
 -2.60000e+00  3.13944e+03]
[   7.1236   58.6813 6088.38    173.1     151.61     -9.75   6134.1   ]


### Numpy supports using an array with indixes to slice another array. The sorting will be the same as in the original array. useful if you want the five best scores, but dont keed them sorted, etc

In [18]:
start_array[top5(start_array[1]),:2]

array([[ 6.71895, 55.365  ],
       [20.51447, 59.0481 ],
       [11.0534 , 60.0814 ],
       [17.0514 , 59.5093 ],
       [11.1437 , 60.4586 ]])

### 

### bonus; pandas DataFrame to easily allow for colunn and row labels. Example with the min/max/mean from above

In [1]:
import pandas as pd
pd.DataFrame([descr(start_array[:,i]) for i in range(start_array.shape[1])])

Unnamed: 0,max,mean,min
0,29.29154,14.079225,4.70555
1,61.6196,58.595292,55.1424
2,11894.82,4120.075385,0.0
3,283.69,156.414912,16.98
4,357.19,115.838462,14.94
5,0.0,-2.713846,-10.73
6,11940.54,3770.727692,0.0
