In [69]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import spatial
import scipy.stats
from itertools import combinations

In [13]:
# Calculates relevant summary statistics for a given array
def summary_stats(array: list) -> int:
    mean = np.mean(array)	
    median = np.median(array)
    mode_arr = mode(array)
    
    min_arr = min(array)
    max_arr = max(array)
    quartiles = np.percentile(array, [25, 75]) # finds 1st and 3rd quartile of the array, 25% and 75% respectively.

    print(f'mean: {mean}\nmedian: {median}\nmode: {mode_arr}\n')
    print(f'lower quartile: {quartiles[0]}\nupper quartile: {quartiles[1]}\nmin: {min_arr}\nmax: {max_arr}\n')

# Function to find the mode in a given array using a dictionary to count their frequency.
def mode(array: list) -> int:
    freq = dict()  
    for i in array:
        if i in freq:
            freq[i] += 1
        else:
            freq[i] = 1
    
    return max(freq, key=freq.get) # return key based on the highest value (k:v pair)



# Finds outliers in an array based on 1.5 * IQR rule
def find_outliers(array: list):
    quartiles = np.percentile(array, [25, 75])	# get 1st and 3rd quartile for calculations
    iqr = (quartiles[1] - quartiles[0])	# IQR = 3rd Quartile - 1st Quartile
  	
    lower = quartiles[0] - iqr # lowest value before it could be considered an outlier
    upper = quartiles[1] + iqr # highest value before it becomes an outlier

    outliers = [elem for elem in array if elem < lower or elem > upper] # find all (elements < lower) or (elements > upper)
    print(outliers)



# # Function to compare similarities between two entries, based on binary attributes
# def find_binary_similarity(df: pd.DataFrame, col1, col2):
#     new_df = pd.DataFrame([df[col1], df[col2]]).transpose()	# Transpose to traverse easier

#     cnt = 0	# count of similarities
#     for rows in new_df.index:
#         if new_df[col1][rows] == new_df[col2][rows]: # check if they have a matching attribute
#             cnt += 1

#     print(new_df, '\n') 
#     print(f'Similarity between {col1} and {col2}: {cnt / len(new_df.index)}\n')


def find_binary_similarity(list1: list, list2: list) -> int:
    
    sim_cnt = 0
    for ele1, ele2 in zip(list1, list2):
        if ele1 == ele2:
            sim_cnt += 1
        
    return sim_cnt / len(list1)



# Normalize data in a given array using its min & max values
def normalization(array: list):
    lower = max(array) - min(array)
    arr_min = min(array)
    ret = []
    for i in array:
        ret += [(i - arr_min) / lower]

    print(ret)


In [65]:
def show_ar(array: list[list]):
    for i in array:
        print(i, '')


def find_edit_distance(str1: str, str2: str) -> int:
    len1 = len(str1)
    len2 = len(str2)

    table = [[0 for j in range(len2+1)] for i in range(len1+1)]     # table to save previous calculations

    for i in range(0, len1 + 1):
        for j in range(0, len2 + 1):
            if i == 0:
                table[i][j] = j
            elif j == 0:
                table[i][j] = i

            elif str1[i-1] == str2[j-1]:  
                table[i][j] = table[i-1][j-1]   
            else:
                table[i][j] = 1 + min(
                                        table[i-1][j-1],    
                                        table[i][j-1],
                                        table[i-1][j])
    show_ar(table)
    return table[-1][-1]

In [7]:
outdoor_temp = [63.2, 82.4, 82.7, 76.6, 68.6, 74.4, 82.4, 66.2, 79.5, 56.5]
summary_stats(outdoor_temp)

mean: 73.25
median: 75.5
mode: 82.4

lower quartile: 66.8
upper quartile: 81.67500000000001
min: 56.5
max: 82.7



In [8]:
indoor_temp = [68.5, 73.8, 79.5, 74.9, 72.5, 75.1, 74.8, 70.6, 74.5, 71.2]
summary_stats(indoor_temp)

mean: 73.54
median: 74.15
mode: 68.5

lower quartile: 71.525
upper quartile: 74.875
min: 68.5
max: 79.5



In [10]:
dist = []
for outdoor, indoor in zip(outdoor_temp, indoor_temp):
    dist += [abs(outdoor - indoor)]

print(dist)

[5.299999999999997, 8.600000000000009, 3.200000000000003, 1.6999999999999886, 3.9000000000000057, 0.6999999999999886, 7.6000000000000085, 4.3999999999999915, 5.0, 14.700000000000003]


In [11]:
print(scipy.stats.pearsonr(outdoor_temp, indoor_temp))


(0.7877417193884946, 0.006812519094620164)


In [16]:
# let 0 = cool and 1 = hot

q2_outdoor = [0, 1, 1, 1, 0, 0, 1, 0, 1, 0]
q2_indoor = [0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
# print(find_binary_similarity(q2_indoor, q2_outdoor))
print(scipy.stats.pearsonr(q2_outdoor, q2_indoor))

# let 0 = off, 1 = on for AC running
ac_running = [0, 1, 0, 1, 1, 0, 1, 0, 1, 0]
print(scipy.stats.pearsonr(q2_outdoor, ac_running))

0.5
(0.0, 0.9999999999999998)
(0.6, 0.06668800000000001)


In [34]:
str1 = 'TACAAGGAGTGA'
str2 = 'ATTGCAGAGA'
find_edit_distance(str1, str2)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 
[1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9] 
[2, 1, 2, 2, 3, 4, 4, 5, 6, 7, 8] 
[3, 2, 2, 3, 3, 3, 4, 5, 6, 7, 8] 
[4, 3, 3, 3, 4, 4, 3, 4, 5, 6, 7] 
[5, 4, 4, 4, 4, 5, 4, 4, 4, 5, 6] 
[6, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5] 
[7, 6, 6, 6, 5, 5, 6, 5, 5, 5, 5] 
[8, 7, 7, 7, 6, 6, 5, 6, 5, 6, 5] 
[9, 8, 8, 8, 7, 7, 6, 5, 6, 5, 6] 
[10, 9, 8, 8, 8, 8, 7, 6, 6, 6, 6] 
[11, 10, 9, 9, 8, 9, 8, 7, 7, 6, 7] 
[12, 11, 10, 10, 9, 9, 9, 8, 7, 7, 6] 


6

In [95]:
def dtw_distance(arr1: list, arr2: list) -> int:
    def dist(distance1: int, distance2: int) -> int:
        return abs(distance1 - distance2)


    len1 = len(arr1) + 1
    len2 = len(arr2) + 1
    table = [[99 for j in range(len2)] for i in range(len1)]
    table[0][0] = 0
    for i in range(1, len1):
        table[i][0] = arr1[i-1]
    for j in range(1, len2):
        table[0][j] = arr2[j-1]

    for i in range(1, len1):
        for j in range(1, len2):

     
            cost = dist(arr1[i-1], arr2[j-1])
            # if table[i-1][j-1] == min(
            #                             table[i-1][j],
            #                             table[i][j-1],
            #                             table[i-1][j-1]):
            #     print(i, j)

            table[i][j] = round(cost + min(
                                        table[i-1][j],
                                        table[i][j-1],
                                        table[i-1][j-1]
            ), 4)
            
    


    show_ar(table)
    return table[-1][-1]

series1 = [1.0, 1.2, 1.4, 1.7, 2.1, 2.1, 2.1, 1.9, 1.8, 1.7,]
series2 = [1.0, 1.01, 1.02, 1.19, 1.38, 1.72, 2.05, 1.92, 1.81, 1.68, 1.54]
print(dtw_distance(series2, series1))
    

[0, 1.0, 1.2, 1.4, 1.7, 2.1, 2.1, 2.1, 1.9, 1.8, 1.7] 
[1.0, 0.0, 0.2, 0.6, 1.3, 2.4, 3.2, 3.2, 2.8, 2.6, 2.4] 
[1.01, 0.01, 0.19, 0.58, 1.27, 2.36, 3.45, 4.29, 3.69, 3.39, 3.09] 
[1.02, 0.03, 0.19, 0.57, 1.25, 2.33, 3.41, 4.49, 4.57, 4.17, 3.77] 
[1.19, 0.22, 0.04, 0.25, 0.76, 1.67, 2.58, 3.49, 4.2, 4.78, 4.28] 
[1.38, 0.6, 0.22, 0.06, 0.38, 1.1, 1.82, 2.54, 3.06, 3.48, 3.8] 
[1.72, 1.32, 0.74, 0.38, 0.08, 0.46, 0.84, 1.22, 1.4, 1.48, 1.5] 
[2.05, 2.37, 1.59, 1.03, 0.43, 0.13, 0.18, 0.23, 0.38, 0.63, 0.98] 
[1.92, 2.84, 2.31, 1.55, 0.65, 0.31, 0.31, 0.36, 0.25, 0.37, 0.59] 
[1.81, 2.62, 2.92, 1.96, 0.76, 0.6, 0.6, 0.6, 0.34, 0.26, 0.37] 
[1.68, 2.36, 2.84, 2.24, 0.78, 1.02, 1.02, 1.02, 0.56, 0.38, 0.28] 
[1.54, 2.08, 2.42, 2.38, 0.94, 1.34, 1.58, 1.58, 0.92, 0.64, 0.44] 
0.44
