# Cluster analysis

In this notebook, we will look at how to debug blocked table similarities and the resulting clusters.

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Assume that we have the "chartedIn" results
import takco

# Load the tables written to the '5-cluster' output directory. To inspect the
# preceding output instead, load '../../output/chartedIn/4-coltypes/*'.
tables = takco.HashBag._load('../../output/chartedIn/5-cluster/*')

print(f"Got {sum(1 for _ in tables)} tables")

# Get the biggest table (most data rows). max() needs a single pass instead of
# sorting the whole collection, and on ties it returns the first such table,
# exactly like element 0 of a stable descending sort would.
table = max(tables, key=lambda t: t.get('numDataRows'))
takco.preview(table)

Got 38 tables


?,0,1,2,3,4,5
Unnamed: 0_level_1,_pgTitle,Peak position	Position,_Variable,Chart	Chart (,Chart )	Chart,Chart
,These Days (Bon Jovi album),3,1995,,Argentine Albums Chart,
,These Days (Bon Jovi album),1,1995,,Australian Albums Chart,
,These Days (Bon Jovi album),1,1995,,Austrian Albums Chart,
,These Days (Bon Jovi album),3,1995,,Belgian Albums Chart (Flanders),
,These Days (Bon Jovi album),6,1995,,Belgian Albums Chart (Wallonia),


In [3]:
from takco.util import tableobj_to_dataframe
df = tableobj_to_dataframe(table)
# Print the three most frequent values of every column, right-aligning the
# column label in a 35-character field.
for ci, col in enumerate(df.columns):
    # Select the column by position directly instead of transposing the whole
    # frame on every iteration; head(3) is always positional, whereas slicing
    # a Series with [:3] can be interpreted by label on non-object indexes.
    top_values = df.iloc[:, ci].value_counts().head(3)
    # to_dict() yields plain Python numbers, so the printed counts do not
    # depend on NumPy's scalar repr.
    print(f"{str(col):>35s}", top_values.to_dict())

                      ('_pgTitle',) {'Womanizer (song)': 64, '...Baby One More Time (song)': 50, 'Love. Angel. Music. Baby.': 48}
       ('Peak position\tPosition',) {'1': 212, '2': 74, '3': 69}
                     ('_Variable',) {'1995': 79, '2018': 60, '2012': 59}
                ('Chart\tChart (',) {'': 452, 'US Billboard 200': 29, 'UK Albums Chart': 21}
                ('Chart )\tChart',) {'': 735, 'US Billboard Hot 100': 19, 'Australian Albums Chart': 16}
                         ('Chart',) {'': 1150, 'Belgium': 4, 'Netherlands': 4}


In [4]:
# Preview whatever is stored under 'partColAligns' on the selected table.
# NOTE(review): ntables=None presumably lifts the limit on how many tables
# are rendered - confirm against takco.preview.
takco.preview(table.get("partColAligns"), ntables=None)

?,0,1,2,3
∈,Q482994  Q2031291  Q208569,decimal,dateTime,string
Unnamed: 0_level_2,_pgTitle,Position,_Variable,Chart
,These Days (Bon Jovi album),3,1995,Argentine Albums Chart
,These Days (Bon Jovi album),1,1995,Australian Albums Chart
,These Days (Bon Jovi album),1,1995,Austrian Albums Chart
,These Days (Bon Jovi album),3,1995,Belgian Albums Chart (Flanders)
,These Days (Bon Jovi album),6,1995,Belgian Albums Chart (Wallonia)

?,0,1,2,3
∈,Q482994  Q208569  Q2031291,decimal,dateTime,Q43229  Q80793969  Q373899
Unnamed: 0_level_2,_pgTitle,Peak position,_Variable,Chart
,So Fresh: The Hits of Summer 2010 + the Best of 2009,1,2009,Australian ARIA Compilations Chart
,Unsainted,86,2019,Australia ( ARIA )
,Unsainted,70,2019,Czech Republic ( Singles Digitál Top 100 )
,Unsainted,11,2019,Hungary ( Single Top 40 )
,Unsainted,38,2019,Hungary ( Stream Top 40 )

?,0,1,2,3,4,5
∈,Q2188189  Q685884  Q207628  Q7725634  Q7366  Q47407603  Q1047113  Q11862829  Q2031291  Q134556,decimal,dateTime,Q6256  Q3624078  Q7275  Q1048835,Q43229  Q80793969  Q373899,string
Unnamed: 0_level_2,_pgTitle,Position,_Variable,Chart,Chart (,Chart )
,Set Fire to the Rain,49,2011,Australia,ARIA,
,Set Fire to the Rain,25,2011,Austria,Ö3 Austria Top 40,
,Set Fire to the Rain,3,2011,Belgium,Ultratop 50 Flanders,
,Set Fire to the Rain,8,2011,Belgium,Ultratop 40 Wallonia,
,Set Fire to the Rain,14,2011,Denmark,Hitlisten,

?,0,1,2,3,4
∈,Q134556  Q2031291,decimal,Unnamed: 3_level_1,dateTime,Q80793969  Q373899  Q43229
Unnamed: 0_level_2,_pgTitle,Peak position,Unnamed: 3_level_2,_Variable,Chart
,Womanizer (song),5,,2008–09,Australia ( ARIA )
,Womanizer (song),4,,2008–09,Austria ( Ö3 Austria Top 40 )
,Womanizer (song),1,,2008–09,Belgium ( Ultratop 50 Flanders)
,Womanizer (song),2,,2008–09,Belgium ( Ultratop 50 Wallonia)
,Womanizer (song),1,,2008–09,Canada ( Canadian Hot 100 )


In [5]:
from IPython.display import display
from takco.cluster.clustering import aggregate_similarities
import pandas as pd
# Read the similarity scores (the first four CSV columns form the index) and
# keep only the rows where ti1 < ti2.
sims = (
    pd.read_csv('../../output/chartedIn/sims.csv', index_col=[0,1,2,3])
    .query('ti1 < ti2')
)
# Expression handed to aggregate_similarities to combine the individual
# similarity columns into a single 'agg' score.
agg_func = '@mean(  @max(headjacc, @pow(headvec, 4)), @max(bodylsh, bodytype, @pow(bodyvec, 4) ) )'
sims['agg'] = aggregate_similarities(sims, agg_func)

# Build a lookup from the values of each part's 'partcol_global' mapping to the
# header text of the matching column of that part.
# NOTE(review): presumably these values are the global column indexes used as
# ci1/ci2 in sims - confirm.
partColAligns = table.get('partColAligns')
header_by_ci = {}
for part in partColAligns:
    # One string per column: the texts of all header rows joined with spaces.
    header_texts = [
        ' '.join(cell.get('text','') for cell in header_col)
        for header_col in zip(*part['tableHeaders'])
    ]
    local_cols = part['partcol_local']
    global_cols = part['partcol_global']
    header_by_ci.update(
        (ci, header_texts[local_cols[pci]]) for pci, ci in global_cols.items()
    )
ci_head = pd.Series(header_by_ci, name='header')

# Attach the header of both columns of every pair ('header' and 'header2').
# The inner joins drop pairs whose ci1 or ci2 is missing from ci_head.
partsims = (
    sims
    .join(ci_head, on='ci1', how='inner')
    .join(ci_head, on='ci2', how='inner', rsuffix='2')
)

display(partsims.describe())

# Optional filters for narrowing down the pairs while debugging
# (uncomment the one you need):
# partsims = partsims[partsims.header == '_pgTitle']
# partsims = partsims[partsims.header == 'Region']
# partsims = partsims[partsims['agg'] < .51]
partsims.sort_values(by='agg', ascending=False).head(60)

  "max": lambda *args: np.nanmax(args, axis=0),
  "mean": lambda *args: np.nanmean(args, axis=0),


Unnamed: 0,headjacc,headvec,bodylsh,bodyvec,bodytype,agg
count,44.0,36.0,52.0,44.0,120.0,131.0
mean,0.295455,0.703491,0.014573,0.700243,0.150281,0.231791
std,0.34638,0.268745,0.050234,0.199142,0.349125,0.354191
min,0.0,0.38544,0.0,0.20928,0.0,0.0
25%,0.0,0.49563,0.0,0.560835,0.0,0.0
50%,0.0,0.658545,0.0,0.732235,0.0,0.030172
75%,0.5,1.0,0.0,0.882038,0.0,0.304621
max,1.0,1.0,0.300781,0.99608,1.0,1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,headjacc,headvec,bodylsh,bodyvec,bodytype,agg,header,header2
ti1,ti2,ci1,ci2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
38,40,186,197,,,,,1.0,1.0,_Variable,_Variable
5,40,26,197,,,,,1.0,1.0,_Variable,_Variable
3,40,18,197,,,,,1.0,1.0,_Variable,_Variable
5,40,25,195,1.0,1.0,,,1.0,1.0,Peak position,Peak position
3,38,19,189,0.5,1.0,0.011719,0.88116,1.0,1.0,Chart,Chart )
3,5,18,26,,,,,1.0,1.0,_Variable,_Variable
5,38,26,186,,,,,1.0,1.0,_Variable,_Variable
3,38,18,186,,,,,1.0,1.0,_Variable,_Variable
3,38,17,185,1.0,1.0,,,1.0,1.0,Position,Position
5,38,27,188,0.5,1.0,0.0,0.9138,0.999889,0.999944,Chart,Chart (
