In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Notebook-wide plotting default: every figure created below is 12 x 8 (width, height).
plt.rcParams["figure.figsize"] = (12, 8)

In [3]:
def stat_tests(x, y, alpha=0.05):
    """Print a battery of two-sample statistical tests, each with a verdict.

    Three groups of scipy.stats tests are run on the pair (x, y) and one
    line is printed per test, consisting of the test result followed by a
    label derived from comparing the p-value against ``alpha``:

    * correlation tests (Pearson, Spearman, Kendall) -> "Dependent" when
      p < alpha (the null hypothesis of no association is rejected),
      otherwise "Independent";
    * parametric tests (independent t-test, paired t-test, one-way ANOVA)
      and non-parametric tests (Mann-Whitney U, Wilcoxon signed-rank,
      Kruskal-Wallis) -> "Different Distribution" when p < alpha,
      otherwise "Same Distribution".

    Note that "Independent" / "Same Distribution" only mean the test failed
    to reject its null hypothesis at level ``alpha``; they are not proof.

    Parameters
    ----------
    x, y : array-like
        The two samples. The paired tests (``ttest_rel``, ``wilcoxon``) and
        the correlation tests need them to have the same length.
    alpha : float, optional
        Significance level used for every verdict. Defaults to 0.05.

    Returns
    -------
    None
        Results are written to standard output only.
    """
    # Each label list is indexed by int(p < alpha): index 0 = null hypothesis
    # kept, index 1 = null hypothesis rejected.
    dependence_labels = ["Independent", "Dependent"]
    distribution_labels = ["Same Distribution", "Different Distribution"]

    def report(result, labels):
        # One output line: the scipy result object followed by its verdict.
        print("  {}    {}".format(result, labels[int(result.pvalue < alpha)]))

    print("CORRELATION:")
    # pearsonr is unpacked and formatted by hand so the printed field names
    # stay stable regardless of the scipy result type.
    corr, p_value = stats.pearsonr(x, y)
    print("  PearsonrResult(correlation={}, p_values={})    {}".format(
        corr, p_value, dependence_labels[int(p_value < alpha)]))
    for test in (stats.spearmanr, stats.kendalltau):
        report(test(x, y), dependence_labels)

    print("\nPARAMETRIC STATISTICAL HYPOTHESIS TESTS")
    for test in (stats.ttest_ind, stats.ttest_rel, stats.f_oneway):
        report(test(x, y), distribution_labels)

    print('\nNON-PARAMETRIC STATISTICAL HYPOTHESIS TESTS')
    for test in (stats.mannwhitneyu, stats.wilcoxon, stats.kruskal):
        report(test(x, y), distribution_labels)

# Method I

In [4]:
# Enumerate every integer pair (a, b) with a in 1..98 and b in a+5..99,
# ordered by a first and then by b. For a >= 95 the inner range is empty,
# so those values of a contribute no pairs.
pairs = [(a, b) for a in range(1, 99) for b in range(a + 5, 100)]

# Split the pairs into two parallel arrays: l1[k] is the first member of
# pair k and l2[k] the second.
l1 = np.array([a for a, _ in pairs])
l2 = np.array([b for _, b in pairs])

In [5]:
def get_data(seed, x=None, y=None, vals=30):
    """Draw a reproducible random subsample of paired observations.

    Parameters
    ----------
    seed : int
        Seed passed to ``np.random.seed``. This reseeds NumPy's global
        random state as a side effect, exactly as before.
    x, y : array-like, optional
        Parallel sequences to sample from. When omitted they default to the
        notebook-level arrays ``l1`` and ``l2``, looked up at call time so
        that rebuilding those arrays does not require redefining this
        function.
    vals : int, optional
        Number of pairs to draw, without replacement. Defaults to 30.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x[idx], y[idx])`` where ``idx`` holds ``vals`` distinct positions
        in ascending order, so the original ordering of the pairs is kept.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or (raised by NumPy) if ``vals``
        exceeds the number of available pairs.
    """
    if x is None:
        x = l1
    if y is None:
        y = l2

    # Accept plain lists as well as arrays; fancy indexing below needs arrays.
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(len(x), len(y))
        )

    np.random.seed(seed)
    # Size the draw from the data actually being sampled, not from the
    # global l1, so caller-supplied arrays of any length work.
    idx = np.random.choice(len(x), size=vals, replace=False)
    idx.sort()

    return x[idx], y[idx]

In [None]:
# Brute-force search over seeds 1..99999: for each seed draw a 30-point
# sample and keep the seed whose sample has the smallest absolute Pearson
# correlation seen so far, together with that correlation and its p-value.
r_seed, best_p, best_corr = 0, 0, 1

for seed in range(1, 100000):
    sample_x, sample_y = get_data(seed, vals=30)
    corr, p_val = stats.pearsonr(sample_x, sample_y)
    abs_corr = np.abs(corr)
    # Strict comparison: on ties the earliest seed is retained.
    if abs_corr < best_corr:
        r_seed, best_corr, best_p = seed, abs_corr, p_val

# Cell output: the weakest correlation found and its p-value.
best_corr, best_p

In [None]:
# Re-draw the 30-point sample for r_seed; get_data reseeds NumPy with it,
# so this reproduces the sample belonging to that seed.
x, y = get_data(r_seed, vals=30)

In [None]:
# Scatter plot of the current x values against the current y values,
# drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Print the correlation, parametric and non-parametric test results for the current x and y.
stat_tests(x, y)

# Method II

In [None]:
# Jitter amplitude: each coordinate is shifted by an integer in [-noise, noise].
noise = 5
# Extra offset applied to where the y grid starts relative to the x grid.
sep = 5
# Dedicated seed so this cell yields the same points on every run, instead of
# depending on whatever state the global NumPy RNG was left in by earlier cells.
METHOD2_SEED = 0

# Regular grids of candidate coordinates: g for x (step 13), h for y (step 12).
g = list(np.arange(1 + noise, 100 - noise, 13))
h = list(np.arange(g[0] + sep + noise, 100 + sep - noise + 2, 12))

# Local generator: leaves the global NumPy random state untouched.
rng = np.random.RandomState(METHOD2_SEED)

x = []
y = []

# Keep only grid nodes with i < j, then jitter each coordinate independently.
# The i < j filter is applied before the jitter is added.
for i in g:
    for j in h:
        if i < j:
            x.append(i + rng.randint(-noise, noise + 1))
            y.append(j + rng.randint(-noise, noise + 1))

In [None]:
# Scatter plot of the current x values against the current y values,
# drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Print the correlation, parametric and non-parametric test results for the current x and y.
stat_tests(x, y)