## Nulls summary table

In [None]:
def nulls_summary_table(df):
    """
    Summarise missing values per column of a dataframe

    Parameters:
    df (DataFrame): Dataframe to check

    Returns:
    null_values (DataFrame): one row per column of df, with
        'null_count' - number of null entries in that column
        'null_pct'   - null_count divided by the number of rows in df
                       (a fraction, not multiplied by 100)
    """
    # Count nulls once, then reuse the counts for both output columns
    counts = df.isnull().sum()
    null_values = counts.to_frame(name='null_count')
    null_values['null_pct'] = counts / len(df)
    return null_values

## Time benchmarking

In [None]:
# Reproducible sample: 10,000 rows of two uniform random columns
np.random.seed(0)
data = np.random.rand(10000, 2)
df = pd.DataFrame(data, columns=['A', 'B'])
a = data[:, 0]
b = data[:, 1]

# Every implementation is timed over the same number of repetitions
_N_RUNS = 1000


def _pandas_corr():
    # Pearson correlation through the pandas Series API
    return df['A'].corr(df['B'], method='pearson')


def _numpy_corr():
    # Off-diagonal entry of the 2x2 correlation matrix
    return np.corrcoef(a, b)[0, 1]


def _scipy_corr():
    # pearsonr returns the coefficient first; only that element is used
    return pearsonr(a, b)[0]


# Time pandas
pandas_time = timeit.timeit(_pandas_corr, number=_N_RUNS)
print(f"Pandas: {pandas_time} seconds")

# Time numpy
numpy_time = timeit.timeit(_numpy_corr, number=_N_RUNS)
print(f"Numpy: {numpy_time} seconds")

# Time scipy
scipy_time = timeit.timeit(_scipy_corr, number=_N_RUNS)
print(f"Scipy: {scipy_time} seconds")

## K-means mathematical elbow

In [16]:
# The WCSS values below are hard-coded; the commented-out loop shows how they
# were generated (one K-means fit per cluster count from 1 to 10).
# wcss = []
# for i in range(1,11):
#     kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
#     kmeans.fit(data)
#     wcss.append(kmeans.inertia_)
# wcss_data = dict(zip(range(1, 11), wcss))

wcss_data = {
    1: 14000.000000000002,
    2: 10674.294049383006,
    3: 8907.898028914926,
    4: 7761.474180989327,
    5: 6449.492103746886,
    6: 5988.100866179874,
    7: 5705.051575716015,
    8: 5066.316782614302,
    9: 4754.754386186658,
    10: 4465.7488481144765,
}

keys = list(wcss_data.keys())

# First differences of WCSS: gradients[j] is the change going from keys[j] to keys[j + 1]
gradients = [
    wcss_data[right] - wcss_data[left]
    for left, right in zip(keys, keys[1:])
]

# --- Metric 1: change of each gradient relative to the size of the previous one (%) ---
percentage_changes = [
    (current - previous) / abs(previous) * 100
    for previous, current in zip(gradients, gradients[1:])
]

# Earliest position holding the largest value (max keeps the first maximal item)
first_pos, max_change = max(enumerate(percentage_changes), key=lambda pair: pair[1])
# Entry p compares gradients[p] with gradients[p + 1]; since the keys start at 1,
# p + 2 is the cluster count where those two segments meet
max_index = first_pos + 2

print(f"Largest percentage change:\t\t{max_change:.2f}%\tbetween points {max_index-1} and {max_index}")

# --- Metric 2: each gradient min-max scaled onto 0..100 ---
max_gradient = max(gradients)
min_gradient = min(gradients)

relative_percentage_changes = [
    (g - min_gradient) / (max_gradient - min_gradient) * 100
    for g in gradients
]

second_pos, max_change = max(enumerate(relative_percentage_changes), key=lambda pair: pair[1])
# gradients[j] spans cluster counts j + 1 -> j + 2 (keys start at 1), so j + 2 is
# the right-hand end of the selected segment
max_index = second_pos + 2

print(f"Largest relative percentage change:\t{max_change:.2f}%\tbetween points {max_index-1} and {max_index}")

Largest percentage change:		64.83%	between points 4 and 5
Largest relative percentage change:	100.00%	between points 6 and 7
