## Old way of reading data

In [None]:
# Configuration: uncomment exactly one of the blocks below to choose which set of trials is loaded.

# NAME_EVALUATION = 'mean-err-n-sources'
# NAME_FOLDER = "{}sources"
# NAME_DATA_FILES = '2017-10-04*.txt'  # 1st trials, original speech samples

# NAME_EVALUATION = 'mean-err-n-sources-sample3and5exchanged'
# NAME_FOLDER = "{}sources"
# NAME_DATA_FILES = '2017-10-11*.txt'  # 2nd trials, rearranged speech samples

# NAME_EVALUATION = 'mean-err-n-sources-rnd-T60=0.0'
# NAME_FOLDER = "{}sources-rnd"
# NAME_DATA_FILES = '2017-*results.txt'  # 3rd trials, BASE

# NAME_EVALUATION = 'mean-err-n-sources-rnd-T60=0.3'
# NAME_FOLDER = "{}sources-rnd-T60"
# NAME_DATA_FILES = '2017-*0.3T60*results.txt'  # 4th trials, T60=0.3

# NAME_EVALUATION = 'mean-err-n-sources-rnd-T60=0.6-em=5'
# NAME_FOLDER = "{}sources-rnd-T60"
# NAME_DATA_FILES = '2017-*0.6T60*5em_results.txt'  # 5th trials, T60=0.6, em=5

# NAME_EVALUATION = 'mean-err-n-sources-rnd-T60=0.6-em=10'
# NAME_FOLDER = "{}sources-rnd-T60"
# NAME_DATA_FILES = '2017-*0.6T60*10em_results.txt'  # 6th trials, T60=0.6, em=10

# NAME_EVALUATION = 'mean-err-n-sources-rnd-estimates'
# NAME_FOLDER = "{}sources-rnd-estimates"
# NAME_DATA_FILES = '*results.txt'  # 7th trials, random estimates (benchmark)

In [None]:
# Load the tab-separated result files for every source count, derive per-trial
# error statistics, then print (and optionally export) a per-n_sources summary.
n_sources_range = [1, 2, 3, 4, 5, 6, 7]
results_dict = {}

for s in n_sources_range:
    # define filename format
    files = glob.glob(path.join(PATH_ROOT, NAME_FOLDER.format(s), NAME_DATA_FILES))

    # load data; the context manager closes each file as soon as it is parsed
    dfs = []
    for f in files:
        with open(f, 'r', newline='') as fh:
            dfs.append(pd.DataFrame(list(csv.reader(fh, delimiter='\t')), dtype=float))
    try:
        results = pd.concat(dfs)
    except ValueError:
        # pd.concat refuses an empty list, e.g. when no file matched the pattern
        continue
    n_trials = len(results)
    n_sources = int((len(results.columns) - 1) / 5)
    # drops empty column (the one at position 5 * n_sources)
    results.drop(results.columns[[n_sources * 4 + n_sources]], axis=1, inplace=True)
    results.columns = get_col_names(n_sources)
    results.index = ["t{}".format(i + 1) for i in range(n_trials)]
    results_dict['{}'.format(n_sources)] = results

    # calculate helper columns (added in place, so the frame stored above gets them too)
    last_err_col = _get_err_col_name(s)[-1]
    results['n_sources'] = n_sources
    results["err_mean"] = results.loc[:, "err1":last_err_col].mean(axis=1)
    results["err_total"] = results.loc[:, "err1":last_err_col].sum(axis=1)
    results["perfect_match"] = results["err_total"] <= 0.01

try:
    results = pd.concat(results_dict, ignore_index=True)
except ValueError:
    # Nothing was loaded for any source count: report it and skip the summary
    # and exports instead of failing on an undefined/stale `results`.
    print("NO DATA FOUND!")
else:
    print("SUMMARY:")
    # 'mean' is the string alias for the aggregation np.mean was mapped to.
    summary = results.groupby('n_sources').agg(
        {'x1': 'count', 'err_mean': 'mean', 'perfect_match': 'mean'}
    ).rename(columns={'x1': 'sample size'})
    print(summary.transpose())

    # print("")
    # print("COMPLETE DATA:")
    # print(results.sample(5))
    if EXPORT_RESULTS:
        results.to_pickle(path.join(PATH_ROOT, NAME_DATAFRAME))
    if EXPORT_LATEX:
        summary.transpose().to_latex(PATH_LATEX_SUMMARY_TABLE, column_format="rrrrrrr", bold_rows=True)

## Old way of comparing

In [None]:
# Collect the pickled per-experiment dataframes, tag each row with the
# experiment "version" it came from, and print a combined summary.
files = glob.glob(path.join(PATH_ROOT, 'mean-err-n-sources-rnd-T60*-dataframe.pkl'))
files2 = glob.glob(path.join(PATH_ROOT, 'mean-err-n-sources-rnd-estimates-dataframe.pkl'))
from pprint import pprint
print("The following files match the criterion:")

dfs = []
# `fixed_version` is None for the T60 files (version is parsed from the file
# name) and "random" for the random-estimate benchmark files.
for f, fixed_version in [(f, None) for f in files] + [(f, "random") for f in files2]:
    # NOTE: read_pickle can execute arbitrary code; only load files you produced yourself.
    df = pd.read_pickle(f)
    # basename() instead of splitting on "/" so other path separators work too
    fname = path.basename(f)
    print(fname)
    if fixed_version is None:
        # version = text between "rnd-" and "-dataframe" in the file name
        version = fname[fname.find("rnd-") + 4:fname.find("-dataframe")]
        if version == "":
            version = "latest"
    else:
        version = fixed_version
    df["version"] = version
    dfs.append(df)

dfc = pd.concat(dfs, ignore_index=True)
print("\nSUMMARY:")
# 'mean' is the string alias for the aggregation np.mean was mapped to.
print(
    dfc.groupby(by=["version", "n_sources"])
    .agg({'x1': 'count', 'err_mean': 'mean', 'perfect_match': 'mean'})
    .rename(columns={'x1': 'sample size'})
)

## Plot with groups 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from itertools import groupby

def test_table():
    """Return a small 8-row demo frame for trying out the grouped bar plot.

    Columns are ``version``, ``n_sources``, ``Quantity`` and ``Ordered``;
    ``Ordered`` holds random integers drawn from ``[0, 10)``.
    """
    versions = [tag for tag in ('v1', 'v2') for _ in range(4)]
    source_counts = ['2', '2', '3', '3'] * 2
    quantities = [10, 20, 5, 6, 4, 7, 2, 1]
    ordered = np.random.randint(0, 10, 8)

    columns = {
        'version': versions,
        'n_sources': source_counts,
        'Quantity': quantities,
        'Ordered': ordered,
    }
    return pd.DataFrame(columns)

def add_line(ax, xpos, ypos):
    """Add a short vertical black line to ``ax`` at ``xpos``.

    The line spans from ``ypos`` to ``ypos + 0.1``; both coordinates are
    interpreted through ``ax.transAxes``. Clipping is switched off on the
    line before it is added.
    """
    x_coords = [xpos, xpos]
    y_coords = [ypos + .1, ypos]
    separator = plt.Line2D(x_coords, y_coords, transform=ax.transAxes, color='black')
    separator.set_clip_on(False)
    ax.add_line(separator)

def label_len(my_index, level):
    """Return ``(label, run_length)`` pairs for one level of ``my_index``.

    Only consecutive equal labels are merged, so a label that reappears
    later in the level yields a separate pair.
    """
    runs = []
    for key, members in groupby(my_index.get_level_values(level)):
        runs.append((key, len(list(members))))
    return runs

def label_group_bar_table(ax, df):
    """Write grouped labels for every index level of ``df`` onto ``ax``.

    Levels are processed from the innermost to the outermost. Each level is
    placed 0.1 lower than the previous one, starting at y = -0.1. Within a
    level, every run of equal labels gets its text centred over the run and
    a separator line at its left edge; one more separator closes the row.
    """
    slot_width = 1. / df.index.size
    row_y = -.1
    for level in reversed(range(df.index.nlevels)):
        cursor = 0
        for text, span in label_len(df.index, level):
            centre = (cursor + .5 * span) * slot_width
            ax.text(centre, row_y, text, ha='center', transform=ax.transAxes)
            add_line(ax, cursor * slot_width, row_y)
            cursor += span
        # closing separator at the right end of this level
        add_line(ax, cursor * slot_width, row_y)
        row_y -= .1

# Mean of err_mean per (n_sources, version) group. The column is selected
# before aggregating so that no other column of dfc is averaged.
df = dfc.groupby(['n_sources', 'version'])["err_mean"].mean()
fig = plt.figure(figsize=(17, 5))
ax = fig.add_subplot(111)
# Draw on the axes created above.
df.plot(kind='bar', stacked=False, ax=ax, color=[lms_red, 'black', 'gray'])
# Below 3 lines remove default labels; the grouped labels are drawn instead.
labels = ['' for item in ax.get_xticklabels()]
ax.set_xticklabels(labels)
ax.set_xlabel('')
label_group_bar_table(ax, df)
# Reserve 0.1 of the figure height per index level for the grouped labels.
fig.subplots_adjust(bottom=.1 * df.index.nlevels)

NameError: name 'dfc' is not defined

In [7]:
import pandas as pd
import numpy as np

# Compare two ways of converting a mixed-type column to strings:
# Series.astype(str) versus Series.apply(str).
values = [None, 'string', np.nan, 42]
df = pd.DataFrame(values, index=[0, 1, 2, 3], columns=['A'])

column = df['A']
df1 = column.astype(str)
df2 = column.apply(str)

for shown in (df, df1, df2):
    print(shown)

        A
0    None
1  string
2     NaN
3      42
0      None
1    string
2       nan
3        42
Name: A, dtype: object
0      None
1    string
2       nan
3        42
Name: A, dtype: object
