In [50]:
def get_change_points(series, threshold, min_observations_below, min_distance_change_streaks):
    """Gets a list of changepoints from a series of observations based on a threshold. 
    
    A change point is registered if the series values are below the threshold for the duration of 'min_observations_below'. 
    Change points will only be flagged if there is at least a distance of 'min_distance_change_streaks' between streaks of observations below the change point.
    
    Args:
        series: Pandas series with observations.
        
    Returns:
        List of change points.
    """
    # for each row, get whether its value is of threshold or lower
    series_below_threshold = series <= threshold

    # do an accumulative count of how many values below the threshold have been observed
    # restarts at 0 as soon as one value > threshold is observed
    below_threshold_counts = series_below_threshold * (series_below_threshold.groupby((series_below_threshold != series_below_threshold.shift()).cumsum()).cumcount() + 1)
    
    # reset the index of below_threshold_counts to 0...n instead of trace counts
    true_indices_series = below_threshold_counts.index
    below_threshold_counts = below_threshold_counts.reset_index(drop=True)

    # store change points as indices
    change_points = []
    
    # ix when the last streak ended
    last_change_streak_ended = None

    for index, streak_count in below_threshold_counts.iteritems():
        if streak_count < min_observations_below:
            continue
        
        # check if the streak is exactly the minimum number of observations
        if streak_count == min_observations_below:
            
            integer_index_candidate = index - min_observations_below + 1
            change_point_candidate = true_indices_series[integer_index_candidate]
            
            # definitely enter the change point if this is the first streak that was seen
            if last_change_streak_ended is None:
                change_points.append(change_point_candidate)
            else:
                # get distance to last streak end
                distance_to_last_streak = integer_index_candidate - last_change_streak_ended - 1

                if distance_to_last_streak >= min_distance_change_streaks:
                    change_points.append(change_point_candidate)

        # update the end of the last change streak, if the current streak count exceeds the min observations below threshold
        if streak_count >= min_observations_below:
            last_change_streak_ended = index
    
    return change_points

Unnamed: 0,pop_1,pop_2
0,TV,TV
1,Radio,TV
2,Comp,Comp
3,TV,TV
4,TV,TV
5,TV,Comp
6,Comp,Comp
7,Comp,Comp
8,TV,TV
9,Radio,Radio


In [59]:
array_1 = ['asdf'] * 0 + ['akdsjfiev'] * 2001 + ['adfviie'] * 3000 + ['lkadkf'] * 4000
array_2 = ['asdf'] * 3999 + ['akdsjfiev'] * 3001 + ['adfviie'] * 2000 + ['lkadkf'] * 1000

In [2]:
def get_contingency_table_pandas(array_1, array_2):
    df = pd.DataFrame().from_dict({'pop_1': array_1,
                   'pop_2': array_2})
    value_counts_df = df.apply(pd.value_counts)
    value_counts_df = value_counts_df.transpose()
    return value_counts_df.to_numpy()

In [10]:
from processdrift.framework import pop_comparison
import timeit
import pandas as pd
import numpy as np

In [6]:
pop_comparison.get_contingency_table(array_1, array_2)
get_contingency_table_pandas(array_1, array_2)

array([[3000, 2001,  999, 4000],
       [2000, 3001, 3999, 1000]], dtype=int64)

In [7]:
current_implementation = timeit.Timer(lambda: pop_comparison.get_contingency_table(array_1, array_2)).timeit(number=1000)
current_implementation

2.539812999999981

In [8]:
new_implementation = timeit.Timer(lambda: get_contingency_table_pandas(array_1, array_2)).timeit(number=1000)
new_implementation

3.3947099000000094

In [60]:
from collections import Counter
def using_counter(array_1, array_2):
    pop_1_counter = Counter(array_1)
    pop_2_counter = Counter(array_2)
    df = pd.DataFrame().from_dict({'pop_1': pop_1_counter, 'pop_2': pop_2_counter}, orient='index')
    return df.to_numpy()
using_counter(array_1, array_2)

array([[2001., 3000., 4000.,   nan],
       [3001., 2000., 1000., 3999.]])

In [76]:
from operator import itemgetter

pop_1_counter = Counter(array_1)
pop_2_counter = Counter(array_2)
result_array = []

keys = list(pop_1_counter.keys() | pop_2_counter.keys())
result_array = []

for dict in [pop_1_counter, pop_2_counter]:
    result_array.append([dict.get(key) for key in keys])

np.array(result_array)

array([[None, 4000, 2001, 3000],
       [3999, 1000, 3001, 2000]], dtype=object)

In [74]:
[pop_1_counter.get(key) for key in keys]

[None, 4000, 2001, 3000]

In [62]:
from collections import Counter
def no_pandas(array_1, array_2):
    pop_1_counter = Counter(array_1)
    pop_2_counter = Counter(array_2)

    result_array = []

    for (k,v), (k2,v2) in zip(pop_1_counter.items(), pop_2_counter.items()):
            result_array.append([v, v2])

    result = np.array(result_array).transpose()
    return result
no_pandas(array_1, array_2)

array([[2001, 3000, 4000],
       [3999, 3001, 2000]])

In [79]:
def no_pandas2(array_1, array_2):
    pop_1_counter = Counter(array_1)
    pop_2_counter = Counter(array_2)
    result_array = []

    keys = list(pop_1_counter.keys() | pop_2_counter.keys())
    result_array = []

    for dict in [pop_1_counter, pop_2_counter]:
        result_array.append([dict.get(key) for key in keys])

    return np.array(result_array)
no_pandas2(array_1, array_2)

array([[None, 4000, 2001, 3000],
       [3999, 1000, 3001, 2000]], dtype=object)

In [80]:
counter_implementation = timeit.Timer(lambda: no_pandas2(array_1, array_2)).timeit(number=1000)
counter_implementation

0.748181299999942

In [124]:
df.groupby(['pop_1']).count()

pd.crosstab(index=df['pop_1'], columns=[df['pop_1'],df['pop_2']])

pop_1,Comp,Radio,Radio,TV,TV,TV
pop_2,Comp,Radio,TV,Comp,Radio,TV
pop_1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Comp,5,0,0,0,0,0
Radio,0,2,1,0,0,0
TV,0,0,0,1,3,8


In [51]:
import pandas as pd
import numpy as np

In [52]:
series_list = [0, 0, 1, 0, 0, 1, 1] # 1 cp at index 60 (=iloc 3)

series = pd.Series(series_list, index=np.linspace(0, 200, len(series_list)))
display(series)

get_change_points(series, threshold=0.05, min_observations_below=1, min_distance_change_streaks=1)

0.000000      0
33.333333     0
66.666667     1
100.000000    0
133.333333    0
166.666667    1
200.000000    1
dtype: int64

[0.0, 100.0]

In [28]:
import numpy as np
def _get_distribution(attribute_value_count):
    """Get a single distribution for a given number of values.
    
    Args:
        number_values: How many different values the distribution should have.
    
    Returns:
        A probability distribution.
    """
    distribution = np.random.dirichlet(np.ones(attribute_value_count)).tolist()
    return distribution

def _get_distribution_new(attribute_value_count, min_probability=0.1):
    """Get a single distribution for a given number of values.
    
    Args:
        number_values: How many different values the distribution should have.
        min_probability: Minimum probability per attribute value.
    
    Returns:
        A probability distribution.
    """
    # generate a random number per attribute_value_count
    # each attribute probability should be at least min_percent -> return an error if this cannot be satisfied
    if attribute_value_count * min_probability > 1:
        raise Exception("'min_probability' too high for the number of attribute values.")

    sampled_random_numbers = np.random.random(attribute_value_count)
    distribution_random = sampled_random_numbers / sum(sampled_random_numbers)
    distribution = [min_probability + (1-min_probability) * random for random in distribution_random]

    return distribution

In [29]:
attribute_value_count = 3
experiments = 100000
dichilet_mins = [min(_get_distribution(attribute_value_count)) for _ in range(experiments)]
own_mins = [min(_get_distribution_new(attribute_value_count)) for _ in range(experiments)]
print(np.array(dichilet_mins).min())
print(np.array(own_mins).min())

7.633906699013631e-07
0.10000300725504244


In [1]:
import pandas as pd
import scipy
import numpy as np

In [79]:
pop_1_array = ['asd'] * 20 + ['alsdkf'] * 15 + ['aksdf'] * 10 + ['laksdfe'] * 0
pop_2_array = ['asd'] * 6 + ['alsdkf'] * 10 + ['aksdf'] * 15 + ['laksdfe'] * 1

In [80]:
def get_contingency_table(pop_1_array, pop_2_array):
    """Get a numpy contingency table from two arrays of samples.
    
    Args:
        pd_series_1: First sample array.
        pd_series_2: Second sample array.
    """
    # get pandas series for both arrays of observations so that we can calculate the value counts next
    pop_1 = pd.Series(pop_1_array)
    pop_2 = pd.Series(pop_2_array)

    # get the observation frequencies
    pop_1_frequencies = pop_1.value_counts()
    pop_2_frequencies = pop_2.value_counts()

    # create the contingency table, fill in NaN with 0
    contingency_table_df = pd.DataFrame([pop_1_frequencies, pop_2_frequencies])
    contingency_table_df = contingency_table_df.fillna(0)

    # convert to numpy array
    contingency_table = contingency_table_df.to_numpy()
    return contingency_table

In [87]:
contingency_table = get_contingency_table(pop_1_array, pop_2_array)
contingency_table

array([[20., 15., 10.,  0.],
       [ 6., 10., 15.,  1.]])

In [88]:
# check if one value is NaN or < 5, return
(contingency_table < 5).any()

True

In [89]:
# check if one value is NaN or < 5, return
(contingency_table < 5).any()

# calculate the chi-squared p-value
stat, p, dof, expected = scipy.stats.chi2_contingency(contingency_table)
p

0.0352937477047853

In [90]:
# calculate the G-test p-value
stat, p, dof, expected = scipy.stats.chi2_contingency(contingency_table, lambda_="log-likelihood")
p

0.027392347410116178

In [93]:
expected

array([[15.19480519, 14.61038961, 14.61038961,  0.58441558],
       [10.80519481, 10.38961039, 10.38961039,  0.41558442]])

In [94]:
from joblib import Parallel, delayed
def process(i):
    return i * i
    
results = Parallel(n_jobs=2)(delayed(process)(i) for i in range(10))
print(results) 

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [10]:
from locale import normalize
import numpy as np
import pandas as pd
from collections import Counter


def preprocess_get_contingency_table(pop_1_array, pop_2_array):
    """Get a numpy contingency table from two arrays of samples.
    
    Args:
        pd_series_1: First sample array.
        pd_series_2: Second sample array.

    Returns:
        numpy contingency table
    """
    pop_1_counter = Counter(pop_1_array)
    pop_2_counter = Counter(pop_2_array)
    result_array = []

    keys = list(pop_1_counter.keys() | pop_2_counter.keys())
    result_array = []

    for dict in [pop_1_counter, pop_2_counter]:
        result_array.append([dict.get(key, 0) for key in keys])

    return np.array(result_array)

def preprocess_get_distribution_table(pop_1_array, pop_2_array):
    """Gets a distribution table. This table is like a contingency table but normalizes the occurence frequncies.
        
    Args:
        pd_series_1: First sample array.
        pd_series_2: Second sample array.
    """
    contingency_table = preprocess_get_contingency_table(pop_1_array, pop_2_array)
    sum_of_rows = contingency_table.sum(axis=1)
    normalized_array = contingency_table / sum_of_rows[:, np.newaxis]
    return normalized_array

In [12]:
array_1 = ['a']*1 + ['b']*2
array_2 = ['a']*4 + ['b'] * 4

In [14]:
p, q = preprocess_get_distribution_table(array_1, array_2)

In [18]:
q

array([0.5, 0.5])