# Cluster analysis

In this notebook, we will look at how to debug blocked table similarities and the resulting clusters.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which re-imports modules
# before each cell executes so that edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Assume that the "chartedIn" pipeline results already exist on disk
import takco

# Load the tables written to the '5-cluster' output directory.
# To inspect the output of the preceding '4-coltypes' step instead, use:
# tables = takco.HashBag._load('../../output/chartedIn/4-coltypes/*.jsonl')
tables = list(takco.HashBag._load('../../output/chartedIn/5-cluster/*.jsonl'))
print(f"Got {len(tables)} tables")

# Get biggest table: the one with the highest 'numDataRows'.
# max() returns the first maximal element, matching a stable descending sort.
table = max(tables, key=lambda x: x.get('numDataRows'))
takco.preview(table)

Got 69 tables


?,0,1,2,3,4,5
Unnamed: 0_level_1,Year,Peak position,Chart,_pgTitle,_pgTitle,_pgTitle
,1972,,,Tapestry (Carole King album),Album of the Year,Tapestry
,1972,,,Tapestry (Carole King album),Record of the Year,""" It's Too Late """
,1972,,,Tapestry (Carole King album),Song of the Year,""" You've Got a Friend """
,1972,,,Tapestry (Carole King album),"Best Pop Vocal Performance, Female",Tapestry
,1971,4.0,Australian Albums Chart,Jesus Christ Superstar (album),,


In [3]:
from takco.util import tableobj_to_dataframe

# For every column of the biggest table, show its most frequent header texts
# (when the first header row carries a 'freq' mapping) and its three most
# frequent cell values.
df = tableobj_to_dataframe(table)
for ci, col in enumerate(df.columns):
    # Only the first header row is consulted for header frequencies.
    tophead = table['tableHeaders'][0][ci].get('freq')
    if tophead:
        # Order header texts by descending count.
        print("Top headers:", dict(sorted(tophead.items(), key=lambda x: -x[1])))
    else:
        print("Header:", col)
    # Select the column by position directly instead of transposing the
    # whole frame on every iteration; head(3) keeps the three top counts.
    print("Top values:", dict(df.iloc[:, ci].value_counts().head(3)))
    print()

Top headers: {'Year': 6, '_Variable': 4}
Top values: {'2019': 227, '2018–19': 198, '2012': 163}

Top headers: {'Peak position': 4, 'Result': 2, 'Peak Position': 1, 'Position': 1}
Top values: {'1': 336, '2': 106, '3': 94}

Top headers: {'Chart': 5, 'Sales charts': 1, 'Ceremony': 1, 'Organization': 1, 'Charts': 1}
Top values: {'US Billboard Hot 100': 37, 'UK Singles (Official Charts Company)': 34, 'US Mainstream Top 40 ( Billboard )': 27}

Top headers: {'_pgTitle': 4, 'Ref.': 2, 'Region': 1}
Top values: {'': 1041, 'Diamonds (Rihanna song)': 58, 'Somebody That I Used to Know': 46}

Top headers: {'_pgTitle': 3, 'Category': 2, 'Peak position': 1}
Top values: {'': 463, 'Sucker (song)': 71, 'Thank U, Next (song)': 68}

Top headers: {'_pgTitle': 3, 'Award': 1, 'Winner': 1}
Top values: {'': 1443, 'Thick as a Brick': 11, 'Sunflower (Post Malone and Swae Lee song)': 10}



In [4]:
# Preview the tables stored under 'partColAligns' of the biggest table.
# ntables=None presumably lifts the limit on how many are shown — confirm in takco.preview.
takco.preview(table.get('partColAligns'), ntables=None)

?,0,1,2,3
∈,string,dateTime,string,string
Unnamed: 0_level_2,_pgTitle,Year,Winner,Category
,Tapestry (Carole King album),1972,Tapestry,Album of the Year
,Tapestry (Carole King album),1972,""" It's Too Late """,Record of the Year
,Tapestry (Carole King album),1972,""" You've Got a Friend """,Song of the Year
,Tapestry (Carole King album),1972,Tapestry,"Best Pop Vocal Performance, Female"

?,0,1,2,3
∈,string,decimal,dateTime,string
Unnamed: 0_level_2,_pgTitle,Position,_Variable,Chart
,Jesus Christ Superstar (album),4,1971,Australian Albums Chart
,Jesus Christ Superstar (album),41,1971,Italian Albums Chart
,Jesus Christ Superstar (album),1,1971,US Billboard Pop Albums
,Captain Fantastic and the Brown Dirt Cowboy,5,1975,Australian Albums Chart
,Captain Fantastic and the Brown Dirt Cowboy,1,1975,Canadian Albums Chart

?,0,1,2,3
∈,string,decimal,dateTime,string
Unnamed: 0_level_2,_pgTitle,Peak position,_Variable,Charts
,Kikuuiki,4,2010,Japan Billboard Top Albums Sales
,Kikuuiki,2,2010,Japan Oricon daily albums
,Kikuuiki,3,2010,Japan Oricon weekly albums
,Aruku Around,9,2010,Japan Billboard Adult Contemporary Airplay
,Aruku Around,4,2010,Japan Billboard Japan Hot 100

?,0,1,2,3
∈,string,decimal,dateTime,string
Unnamed: 0_level_2,_pgTitle,Peak position,_Variable,Chart
,Jesus Christ Superstar (album),6,1970–71,Australian Kent Music Report
,Jesus Christ Superstar (album),4,1970–71,Austrian Albums Chart
,Jesus Christ Superstar (album),1,1970–71,Canadian RPM Albums Chart
,Jesus Christ Superstar (album),10,1970–71,Dutch Albums Chart
,Jesus Christ Superstar (album),16,1970–71,Italian Albums Chart

?,0,1,2,3
∈,string,string,dateTime,decimal
Unnamed: 0_level_2,_pgTitle,Chart,Year,Peak position
,Bolan Boogie,UK Albums Chart,1972,1
,Fog on the Tyne,UK Albums Chart,1972,1

?,0,1,2,3
∈,string,dateTime,string,decimal
Unnamed: 0_level_2,_pgTitle,Year,Chart,Peak Position
,Harvest (Neil Young album),1972,Billboard 200,1
,Harvest (Neil Young album),1972,Australian Kent Music Report Albums Chart,1
,Harvest (Neil Young album),1972,UK Albums Chart,1

?,0,1,2,3,4,5
∈,string,dateTime,string,string,string,string
Unnamed: 0_level_2,_pgTitle,Year,Organization,Award,Result,Ref.
,Sucker (song),2019,Teen Choice Awards,Choice Song: Group,Nominated,
,Sucker (song),2019,Teen Choice Awards,Choice Pop Song,Nominated,
,Sucker (song),2019,MTV Video Music Awards,Video of the Year,Nominated,
,Sucker (song),2019,MTV Video Music Awards,Song of the Year,Nominated,
,Sucker (song),2019,MTV Video Music Awards,Best Pop Video,Won,

?,0,1,2,3,4,5
∈,string,dateTime,string,string,string,string
Unnamed: 0_level_2,_pgTitle,Year,Ceremony,Category,Result,Ref.
,Sunflower (Post Malone and Swae Lee song),2019,American Music Awards,Collaboration of the Year,Nominated,
,Sunflower (Post Malone and Swae Lee song),2019,American Music Awards,Favorite Song — Pop/Rock,Nominated,
,Sunflower (Post Malone and Swae Lee song),2019,Clio Awards,Animation,Bronze,
,Sunflower (Post Malone and Swae Lee song),2019,Guild of Music Supervisors Awards,Best Song/Recording Created for a Film,Nominated,
,Sunflower (Post Malone and Swae Lee song),2019,Melon Music Awards,Best OST,Nominated,

?,0,1,2,3,4
∈,string,string,decimal,dateTime,string
Unnamed: 0_level_2,_pgTitle,Region,Peak position,_Variable,Sales charts
,Everything That Happens Will Happen Today,Australia,66,2009,ARIA Albums Chart
,Everything That Happens Will Happen Today,Belgium,58,2009,Belgian (Wallonian) Albums Chart
,Everything That Happens Will Happen Today,New Zealand,31,2009,Official New Zealand Music Chart
,Everything That Happens Will Happen Today,United States,174,2009,US Billboard 200
,Everything That Happens Will Happen Today,France,127,2008,Syndicat National de l'Édition Phonographique

?,0,1,2,3,4
∈,string,dateTime,string,string,decimal
Unnamed: 0_level_2,_pgTitle,Year,Chart,Peak position,Unnamed: 5_level_2
,Thick as a Brick,1972,Australian Albums ( Kent Music Report ),1,
,Thick as a Brick,1972,Canadian Albums ( RPM ),1,
,Thick as a Brick,1972,Danish Albums ( Tracklisten ),1,
,Thick as a Brick,1972,,German Albums ( Offizielle Top 100 ),2.0
,Thick as a Brick,2012,,German Albums ( Offizielle Top 100 ) 40th Anniversary Collector's Edition,53.0


In [5]:
from IPython.display import display
from takco.cluster import aggregate_similarities
import pandas as pd

# Read the similarity scores (first four CSV columns form the index) and
# keep only the rows where ti1 < ti2.
sims = pd.read_csv(
    '../../output/chartedIn/5-cluster/sims.csv',
    index_col=[0, 1, 2, 3],
).query('ti1 < ti2')

# Expression handed to takco to combine the individual scores into 'agg'.
agg_func = '@mean(  @max(headjacc, @pow(headvec, 4)), @max(bodylsh, bodytype, @pow(bodyvec, 4) ) )'
sims['agg'] = aggregate_similarities(sims, agg_func)

# Build a lookup from global column index to the header text of that column
# in the partial table it came from.
partColAligns = table.get('partColAligns')
ci_head = {}
for part in partColAligns:
    # One string per column: the texts of all header rows joined by a space.
    head = [
        ' '.join(cell.get('text', '') for cell in hcol)
        for hcol in zip(*part['tableHeaders'])
    ]
    local_cols = part['partcol_local']
    global_cols = part['partcol_global']
    for pci, ci in global_cols.items():
        # Fall back to an empty header when there are no headers or no local mapping.
        ci_head[ci] = head[local_cols[pci]] if head and local_cols else ''
ci_head = pd.Series(ci_head, name='header')

# Attach the header of both compared columns ('header' for ci1, 'header2' for ci2);
# the inner joins drop pairs whose columns have no entry in ci_head.
partsims = (
    sims
    .join(ci_head, on='ci1', how='inner')
    .join(ci_head, on='ci2', how='inner', rsuffix='2')
)

display(partsims.describe())

# Optional filters for narrowing down the pairs while debugging:
# partsims = partsims[partsims.header == '_pgTitle']
# partsims = partsims[partsims.header == 'Region']
# partsims = partsims[partsims['agg'] < .51]
partsims.sort_values('agg', ascending=False).head(60)

  "max": lambda *args: np.nanmax(args, axis=0),


Unnamed: 0,headjacc,headvec,bodylsh,bodyvec,bodytype,agg
count,455.0,427.0,350.0,350.0,949.0,949.0
mean,0.104396,0.506191,0.007757,0.652852,0.461538,0.407304
std,0.297905,0.259614,0.04805,0.171803,0.498781,0.431917
min,0.0,0.0,0.0,0.0716,0.0,0.0
25%,0.0,0.38544,0.0,0.531718,0.0,1.1e-05
50%,0.0,0.49563,0.0,0.6574,0.0,0.116301
75%,0.0,0.68467,0.0,0.781,1.0,1.0
max,1.0,1.0,0.550781,0.99546,1.0,1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,headjacc,headvec,bodylsh,bodyvec,bodytype,agg,header,header2
ti1,ti2,ci1,ci2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10,95,39,404,,,0.0,0.4395,1.0,1.0,_pgTitle,_pgTitle
29,32,130,140,,,,,1.0,1.0,_Variable,Year
29,93,131,393,,,0.0,0.58833,1.0,1.0,Chart,_pgTitle
29,93,128,393,,,0.0,0.81412,1.0,1.0,_pgTitle,_pgTitle
22,93,92,393,,,0.0,0.55311,1.0,1.0,Charts,_pgTitle
22,93,89,393,,,0.0,0.72977,1.0,1.0,_pgTitle,_pgTitle
11,93,46,393,,,0.0,0.57911,1.0,1.0,Chart,_pgTitle
11,93,43,393,,,0.0,0.80895,1.0,1.0,_pgTitle,_pgTitle
10,93,42,393,,,0.0,0.75982,1.0,1.0,Category,_pgTitle
10,93,41,393,,,0.0,0.89456,1.0,1.0,Winner,_pgTitle
