# Function to calculate the probability of correlations between two arrays
### Variables to be set by the calling file:
max_shift:
Determines what array shifts are allowed:
The shifts represent the time delays in days;
The time delay with the highest probability of correlation represents
the number of days after which the numbers of one county show a similar development
as the numbers of another county.

## Modules
Imported to use functionality that is not built into Python itself but is provided by existing libraries.

In [1]:
# Used for the array operations in this notebook (np.correlate, np.arange, np.concatenate, np.argmax)
import numpy as np
import matplotlib.pyplot as plt    # to plot the counties
# Used to navigate the directories and check for files
import os
# Remember the directory this notebook is executed from: the working directory
# is changed later on (e.g. to run or save files elsewhere), and this value is
# used to change back afterwards.
get_scaled_correlation_directory = os.getcwd()

In [2]:
# move to the cllect_data directory
os.chdir(".."), os.chdir(".."), os.chdir("collect_data")
%run get_data.ipynb
os.chdir(get_scaled_correlation_directory)    # return to original directory

NameError: name 'F' is not defined

NameError: name 'F' is not defined

## Control
Set variables to "True" to trigger the action described by the comment and the variable's name.

In [None]:
# When True, the cell that tests get_scaled_correlation with a sine wave
# (further below) is executed; leave it False to skip that test.
test_get_scaled_correlation_with_sinus = False

## Define correlation function
Computes the correlations between the two given arrays for every shift and scales each correlation by the number of products that contributed to it. Returns the scaled correlations, the number of products per shift, the shift of array2 relative to array1 for each entry, and the shift with the highest scaled correlation.
<br/><br/>
The variable max_shift determines the maximum shift of array2 relative to array1. If it is negative, all possible shifts are evaluated.

In [None]:
def get_scaled_correlation(array1, array2, max_shift=-1):
    """Correlate two 1-D sequences for every shift and scale by the overlap size.

    The raw cross-correlation (``np.correlate(..., mode="full")``) sums more
    products for shifts where the arrays overlap more. To make the shifts
    comparable, every correlation value is divided by the number of products
    that went into it. The division is a floor division (``//``), so the
    scaled values are rounded down to whole numbers.

    Parameters
    ----------
    array1, array2 : sequence of numbers
        The two series to compare. They should have the same length; if they
        do not, a warning is printed and the computation continues.
    max_shift : int, optional
        Largest absolute shift of ``array2`` relative to ``array1`` that is
        reported. A negative value (the default) reports every possible shift.
        A value larger than the shifts that actually exist is clamped to the
        available range instead of failing.

    Returns
    -------
    tuple
        ``(correlations, number_of_products, shifts, best_shift)`` where the
        first three are arrays of equal length restricted to the requested
        shift window: the scaled correlations, the number of products per
        shift, and the shift of ``array2`` relative to ``array1`` for each
        entry. ``best_shift`` is the shift with the highest scaled correlation
        (the first one if several shifts tie).

    Raises
    ------
    Exception
        If the internally generated arrays do not have the same length.
    """
    if len(array1) != len(array2):
        print("The given arrays seem to have different lengths, this might cause problems.")

    # One correlation value for every possible shift of array2 against array1
    correlations = np.correlate(array1, array2, mode="full")

    # Number of element products that were summed up for each shift:
    # rising while array2 slides in, constant while the shorter array lies
    # completely inside the longer one, falling while array2 slides out.
    max_number_of_products = min(len(array1), len(array2))
    number_of_products = np.concatenate((
        np.arange(1, max_number_of_products),    # start of array2 left from start of array1
        np.full(abs(len(array2) - len(array1)), max_number_of_products, dtype=int),
        np.arange(max_number_of_products, 0, -1)    # end of array2 right from end of array1
    ), axis=None)

    # Shift of array2 relative to array1 belonging to each correlation value
    position_array2_to_array1 = np.arange(-len(array2) + 1, len(array1))

    if (len(correlations) != len(position_array2_to_array1) or
            len(correlations) != len(number_of_products)):
        raise Exception("The output arrays of the get_scaled_correlation-function " +
                        "do not have the same length.")

    if max_shift < 0:
        # Negative max_shift: keep every shift
        lower, upper = 0, len(correlations)
    else:
        # Shift 0 sits at index len(array2) - 1. Compute the window around it
        # arithmetically and clamp it to the existing indices, so that a
        # max_shift beyond the available shifts does not raise an IndexError.
        zero_index = len(array2) - 1
        shift_limit = int(max_shift)
        lower = max(0, zero_index - shift_limit)
        upper = min(len(correlations), zero_index + shift_limit + 1)

    window_products = number_of_products[lower:upper]
    window_positions = position_array2_to_array1[lower:upper]
    # Scale by dividing by the number of products
    scaled_correlations = correlations[lower:upper] // window_products

    return (
        scaled_correlations,
        window_products,
        window_positions,
        window_positions[np.argmax(scaled_correlations)]
    )

## Test the correlation function with a simple sinus

In [None]:
if test_get_scaled_correlation_with_sinus:
    # Integer-valued sine wave: 1000 samples over the interval [0, 10*pi],
    # shifted by +1 and scaled by 100 before truncating to int.
    sinus = [int(100*(e+1)) for e in np.sin(np.linspace(0,np.pi*10,1000))]
    plt.plot(sinus)
    # Correlate the wave with itself once and reuse the four results,
    # instead of recomputing the same correlation for every print/plot.
    (scaled_correlations, products_per_shift,
     shifts, best_shift) = get_scaled_correlation(sinus, sinus)
    print(best_shift)
    print("(should be 0. In this case shifting by -1 and 1 produces bigger correlation " +
          "probabilities, because the discrete values of the sinus.)")
    print(scaled_correlations)
    print(products_per_shift)
    print(shifts)
    plt.plot(scaled_correlations)

## Calculate the Correlation for all Counties

In [None]:
# For every county, store under "correlation" the result of
# get_scaled_correlation against every county in covid19 (itself included),
# keyed by the other county's ID. All possible shifts are evaluated because
# max_shift is left at its default.
for county in covid19.values():
    county["correlation"] = county_correlations = {}
    own_incidences = county["incidences"]
    for second_AdmUnitID, second_county in covid19.items():
        county_correlations[second_AdmUnitID] = get_scaled_correlation(
            own_incidences, second_county["incidences"])

## Calculate the Correlation for all Districts

In [None]:
# For every district, store under "correlation" the result of
# get_scaled_correlation against every district in districts (itself
# included), keyed by the other district's ID. All possible shifts are
# evaluated because max_shift is left at its default.
for district in districts.values():
    district["correlation"] = district_correlations = {}
    own_incidences = district["incidences"]
    for second_districtID, second_district in districts.items():
        district_correlations[second_districtID] = get_scaled_correlation(
            own_incidences, second_district["incidences"])