In [1]:
import os
import pandas as pd
from itertools import product
from meta.scripts.Utilities import Utilities


def describe_correlation_coefficient(r: float):
    """Turn a correlation coefficient into a textual strength/direction label.

    The label has the form ``<strength>_<direction>`` where the direction is
    "negative" for ``r < 0`` and "positive" otherwise, and the strength is
    chosen from the absolute value of ``r``:

    * ``0.01 < |r| < 0.3``  -> "weak"
    * ``0.3 <= |r| < 0.7``  -> "moderate"
    * ``0.7 <= |r| <= 1.0`` -> "strong"

    :param r: correlation coefficient to describe.
    :return: the label, or "none" when ``|r| <= 0.01``, when ``|r| > 1`` or
        when the comparison ``1.0 >= r`` is false (e.g. NaN).
    """
    # Guard: values above 1.0 (and NaN, for which the comparison is false)
    # are not described at all.
    if not (1.0 >= r):
        return "none"

    direction = "negative" if r < 0 else "positive"
    magnitude = abs(r)

    if 0.01 < magnitude < 0.3:
        template = "weak_{}"
    elif 0.3 <= magnitude < 0.7:
        template = "moderate_{}"
    elif 0.70 <= magnitude <= 1.0:
        template = "strong_{}"
    else:
        # Either negligible (|r| <= 0.01) or below -1.0.
        return "none"
    return template.format(direction)


def describe_correlation_table(file: str):
    """List the moderately and strongly correlated pairs found in one table.

    The table is loaded with ``Utilities.load_tsv`` (presumably returning a
    pandas DataFrame -- confirm against the helper), its first column becomes
    the row index, and every cell is labelled with
    ``describe_correlation_coefficient``. Off-diagonal cells whose label
    contains "strong" or "moderate" are reported once per unordered pair; if
    both ``(a, b)`` and ``(b, a)`` qualify, the label visited last wins.

    :param file: path to the correlation table.
    :return: DataFrame with the columns ``file_name`` (base name of ``file``),
        ``comparison_pair`` ("<a> vs <b>", labels sorted) and ``dependence``.
        The columns are present even when no pair qualifies.
    """
    correlation_df = Utilities.load_tsv(file)
    # Rebind instead of mutating in place so the loaded frame is left untouched.
    correlation_df = correlation_df.set_index(correlation_df.columns[0])

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; look the method up on the type so that a data column
    # that happens to be called "map" cannot be mistaken for it.
    if hasattr(type(correlation_df), "map"):
        annotation_df = correlation_df.map(describe_correlation_coefficient)
    else:
        annotation_df = correlation_df.applymap(describe_correlation_coefficient)

    reported_strengths = ("strong", "moderate")
    annotation_dict = dict()
    for index, column in product(annotation_df.index.values, annotation_df.columns.values):
        if index == column:
            # Skip the diagonal (a variable compared with itself).
            continue
        value = annotation_df.loc[index, column]
        if any(strength in value for strength in reported_strengths):
            # Sorting the labels makes (a, b) and (b, a) share a single key.
            annotation_dict[" vs ".join(sorted([index, column]))] = value

    file_name = os.path.basename(file)
    records = [dict(file_name=file_name, comparison_pair=pair, dependence=dependence)
               for pair, dependence in annotation_dict.items()]
    # Explicit columns keep the schema stable when there is nothing to report.
    return pd.DataFrame(records, columns=["file_name", "comparison_pair", "dependence"])


In [2]:
# Gather the input tables: every entry yielded by Utilities.scan_whole_dir for
# <cwd>/data/correlation whose path string contains "Correlation".
# NOTE(review): the substring test applies to the whole path, not only the
# file name -- confirm that this is intended.
correlation_dir = os.path.join(os.getcwd(), "data", "correlation")
correlation_tables = [
    table_path
    for table_path in Utilities.scan_whole_dir(correlation_dir)
    if "Correlation" in table_path
]

# Describe each table (presumably one call per path, run sequentially --
# verify against Utilities.single_core_queue) and stack the per-file results.
described_frames = Utilities.single_core_queue(describe_correlation_table, correlation_tables)
described_df = pd.concat(described_frames, axis=0, sort=False, ignore_index=True)

# Write the combined description next to the input data.
description_file = os.path.join(
    os.getcwd(), "data", "correlation", "description", "description.tsv")
Utilities.dump_tsv(described_df, description_file)