In [1]:
import numpy as np

In [2]:
arr = np.array([4, 3, 1, 5, 6])
print(f'the array contains {arr}')
print(f'a numpy array has has type {arr}')
print(f'the array has a shape of {arr.shape}')
print(f'The item at index 0 in the array is {arr[0]}')
print(f'The item at index 2 in the array is {arr[2]}')
print(f'If we slice the array between item 0 and 2 we get {arr[:2]}')
print(f'If we slice the array between item 3 and 5 we get {arr[3:5]}')

the array contains [4 3 1 5 6]
a numpy array has has type [4 3 1 5 6]
the array has a shape of (5,)
The item at index 0 in the array is 4
The item at index 2 in the array is 1
If we slice the array between item 0 and 2 we get [4 3]
If we slice the array between item 3 and 5 we get [5 6]


In [3]:
empty_arr = np.empty(10)
print(empty_arr)
print(type(empty_arr))
print(empty_arr.shape)

[6.93462497e-310 6.93462497e-310 6.93460180e-310 6.93462387e-310
 6.93462398e-310 6.93462338e-310 6.93462401e-310 6.93462400e-310
 6.93462398e-310 6.93462396e-310]
<class 'numpy.ndarray'>
(10,)


In [4]:
zeros_arr = np.zeros(10)
print(zeros_arr)
print(type(zeros_arr))
print(zeros_arr.shape)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<class 'numpy.ndarray'>
(10,)


In [5]:
data = np.zeros(10)
print(f'original data {data}')

data[5] = 111
data[9] = 222
print(f'updated data {data}')

original data [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
updated data [  0.   0.   0.   0.   0. 111.   0.   0.   0. 222.]


In [6]:
data = np.zeros(10)
print(f'original data {data}')

# slice from index 3 to index 5
slice_of_data = data[3:6] 
#incremement each value in the slice by 999
slice_of_data += 999 

print(f'slice of data {data}')
print(f'the original data is also updated {data}')

original data [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
slice of data [  0.   0.   0. 999. 999. 999.   0.   0.   0.   0.]
the original data is also updated [  0.   0.   0. 999. 999. 999.   0.   0.   0.   0.]


In [7]:
data1 = np.arange(10)  #create a sequence of integers 0-9
data2 = np.arange(10,20) #create a sequence of integers 10-19

print(f'data1: {data1}')
print(f'data2: {data2}')

print(f'The square of each value in data1 {data1**2}') 
print(f'The summation of the the two arrays {data1 + data2}')
print(f'The difference between the the two arrays {data1 - data2}')

data1 += 10
print(f'If we add 10 to each element in data1 we get {data1}')

data1: [0 1 2 3 4 5 6 7 8 9]
data2: [10 11 12 13 14 15 16 17 18 19]
The square of each value in data1 [ 0  1  4  9 16 25 36 49 64 81]
The summation of the the two arrays [10 12 14 16 18 20 22 24 26 28]
The difference between the the two arrays [-10 -10 -10 -10 -10 -10 -10 -10 -10 -10]
If we add 10 to each element in data1 we get [10 11 12 13 14 15 16 17 18 19]


In [8]:
data = np.random.randn(10000)  
print(data.shape)
print(type(data))

(10000,)
<class 'numpy.ndarray'>


In [9]:
def descriptives(data):
    """
    Returns mean, stdev, and 1st and 99th 
    percentile of a 1D numpy.array
    
    Parameters:
    -----------
    data: numpy.ndarray 
        1D array containing data to analyse
        
    Returns:
    --------
        tuple with 4 items
    """
    mean = data.mean()
    std = data.std()
    per_1st = np.percentile(data, 1) 
    per_99th = np.percentile(data, 99)
    
    return mean, std, per_1st, per_99th


results = descriptives(data)
print(results)

(0.014888317106039944, 0.9914977689951731, -2.322939713078127, 2.293120809807665)


In [10]:
result = data >= 2.3

print(result.shape)
print(type(result))
print(result)

(10000,)
<class 'numpy.ndarray'>
[False False False ... False False False]


In [11]:
def prob_great_than_or_equal_to(data, x):
    '''
    Return the proportion of the dataset that
    is greater than or equal to x
    
    Parameters:
    -----------
    data: numpy.ndarray 
        Vector containing numeric data
    x: float
        Function returns proportion where data >=x
        
    Returns:
    --------
    float
    '''
    return (data >= x).sum()/data.shape[0]


def prob_less_than_or_equal_to(data, x):
    '''
    Return the proportion of the dataset that
    is less than or equal to x
    
    Keyword arguments
    data -- a numpy.ndarray containing numeric data
    x -- a numeric value. Function returns proportion where data <=x
    '''
    return (data <= x).sum()/data.shape[0]

x1 = prob_great_than_or_equal_to(data, 1.96)
x2 = prob_less_than_or_equal_to(data, -1.96)

print(x1, x2)

0.0246 0.0236


In [12]:
#OLS functionality is in statsmodels
import statsmodels.api as sm  

def load_dtoc_dataset():
    '''
    Loads the breach and dtoc data sets into memory
    Returns a tuple of numpy.ndarrays representing
    breach and dtoc dataset respectively.
    '''    
    #note we use skip_header because the dataset has column descriptors
    dtoc = np.genfromtxt('./data/dtocs.csv', skip_header=1)  
    breach = np.genfromtxt('./data/breach.csv', skip_header=1)
    return breach, dtoc
    
    
breach, dtoc = load_dtoc_dataset()

#regression code
dtoc = sm.add_constant(dtoc) # an intercept term to the model
model = sm.OLS(breach, dtoc)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.714
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     194.6
Date:                Thu, 11 Mar 2021   Prob (F-statistic):           6.80e-23
Time:                        15:08:26   Log-Likelihood:                -945.02
No. Observations:                  80   AIC:                             1894.
Df Residuals:                      78   BIC:                             1899.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.633e+05      2e+04     -8.178      0.0

In [13]:
data = np.arange(10)

for x in data:
    print(x, end= ' ')

0 1 2 3 4 5 6 7 8 9 

In [14]:
data = np.arange(10)

for i in range(data.shape[0]):
    print(data[i], end = ' ')

0 1 2 3 4 5 6 7 8 9 

In [15]:
def max_value(data):
    '''
    Loop over each element in array
    and return the maximum value
    
    Keyword arguments:
    data -- numpy.array containing numeric data
    '''
    max_value = 0
    
    for x in data:
        max_value = max(x, max_value)
    return max_value
        
data = np.array([100, 10, 10, 1000, 999, 1])
print(max_value(data))

1000


In [16]:
def ceiling(arr, upper_limit):
    '''
    Loop through all elements of a np.ndarray
    and impose a max_value ceiling on the data.
    
    Keyword arguments:
    arry - numeric np.ndarray to iterate
    upper_limit - the numeric upper limit on values in arr
    '''
    for i in range(arr.shape[0]):
        arr[i] = min(arr[i], upper_limit)
        
data = np.arange(10)

print('original data: {0}'.format(data))

ceiling(data, 5)

print('data with ceiling {0}'.format(data))

original data: [0 1 2 3 4 5 6 7 8 9]
data with ceiling [0 1 2 3 4 5 5 5 5 5]


In [17]:
def ceiling(to_test, upper_limit):
    '''
    Returing the minimum value by comparing to_test to upper_limit
    
    Parameters:
    -----------
    to_test: float
        numeric value to test if breaches ceiling
    
    upper_limit: float
        the numeric upper limit on to_test
    '''
    return min(to_test, upper_limit)

# v_ceiling is a wrapper function that we call instead of ceiling
v_ceiling = np.vectorize(ceiling)  

data = np.arange(10)
c_data = v_ceiling(data, 5)

print(f'original data: {data}')
print(f'data with ceiling {c_data}')

original data: [0 1 2 3 4 5 6 7 8 9]
data with ceiling [0 1 2 3 4 5 5 5 5 5]


In [18]:
# here we vectorize min instead of our custom function
v_ceiling = np.vectorize(min)  

data = np.arange(10)
c_data = v_ceiling(data, 5)

print(f'original data: {data}')
print(f'data with ceiling {c_data}')

original data: [0 1 2 3 4 5 6 7 8 9]
data with ceiling [0 1 2 3 4 5 5 5 5 5]


In [19]:
data = np.array([0, 1, 2, 500, 700])
results = np.where(data > 2)

print(results)


(array([3, 4]),)


In [20]:
data = np.array([0, 1, 2, 500, 700])
indexes = [1, 3, 4]
sliced_data = data[indexes]

print(sliced_data)

[  1 500 700]


In [21]:
data = np.array([0, 1, 2, 500, 700])
sliced_data = data[np.where(data > 2)]

print(sliced_data)

data[np.where(data > 2)] = 999 
print(data)


[500 700]
[  0   1   2 999 999]


In [22]:
def ceiling(data, upper_limit):
    '''
    Returing the minimum value by comparing to_test to upper_limit
    
    Parameters:
    -----------
    to_test: float
        numeric value to test if breaches ceiling
    
    upper_limit: float
        the numeric upper limit on to_test
    '''
    data[np.where(data > upper_limit)] = upper_limit

data = np.arange(10)

print(f'original data: {data}')
ceiling(data, 5)
print(f'data with ceiling {data}')

original data: [0 1 2 3 4 5 6 7 8 9]
data with ceiling [0 1 2 3 4 5 5 5 5 5]
