In [2]:
import pandas as pd
# Load the tab-delimited panel export into a DataFrame.
user_ht_links_main = pd.read_csv("output_panel_7_7_20.tsv", delimiter="\t")


In [3]:
# Work on a separate (deep) copy so the frame loaded from disk stays untouched.
user_ht_links = user_ht_links_main.copy(deep=True)

In [4]:
# Read the vocabulary file into a set: every line, stripped of surrounding
# whitespace, becomes one entry. The context manager closes the file handle
# as soon as the lines have been consumed instead of leaving it open.
with open("vocab_panel_7_7_20.tsv.txt") as vocab_file:
    vocab = {line.strip() for line in vocab_file}

In [5]:
# Keep only the rows whose term appears in the vocabulary set.
user_ht_links = user_ht_links.loc[user_ht_links["term"].isin(vocab)]

In [9]:
# For every term, count how many distinct users (uid) it is linked to.
# Result: one row per term with columns ['term', 'n'].
v = (user_ht_links
     .groupby("term")["uid"]
     .nunique()
     .rename("n")
     .reset_index())

In [28]:
# Retain only the terms linked to more than 100 distinct users.
v = v.loc[v["n"] > 100]

In [29]:
# Display (rows, columns) of the filtered term table; the row count is the
# number of terms that survived the user-count threshold.
v.shape

(2778, 2)

In [30]:
# Restrict the links to the terms kept in `v`.
user_ht_links = user_ht_links.loc[user_ht_links["term"].isin(v["term"])]

In [31]:
# Collapse to one row per (user, term) pair; `linkweight` is the number of
# raw rows that pair had. The term column is called `ht` from here on.
user_ht_links_agg = (user_ht_links
                     .groupby(['uid', 'term'])
                     .size()
                     .reset_index(name='linkweight')
                     .rename(columns={'term': 'ht'}))
user_ht_links_agg.head()

Unnamed: 0,uid,ht,linkweight
0,17,medium,1
1,18,hello,1
2,388,design,1
3,388,food,1
4,388,technology,1


In [32]:
# Count rows per user. After the aggregation above there is one row per
# (uid, ht) pair, so this is the number of distinct hashtags per user.
# The column names are set explicitly rather than relying on the defaults of
# `value_counts().reset_index()`, which changed in pandas 2.0 (older versions
# produce 'index'/'uid', newer ones 'uid'/'count'). `m` keeps the historical
# layout: 'index' holds the user id and 'uid' holds that user's row count.
m = (user_ht_links_agg.uid.value_counts()
     .rename_axis('index')
     .reset_index(name='uid'))
# Keep only users that are linked to more than one hashtag.
user_ht_links_agg = user_ht_links_agg[user_ht_links_agg.uid.isin(m.loc[m.uid > 1, 'index'])]


In [33]:
# Building a matrix requires an integer index for every user and every hashtag.
# `groupby(...).ngroup()` labels each row with the number of the group it
# belongs to, which gives one id per distinct uid / ht. Approach taken from:
# https://stackoverflow.com/questions/45685254/q-pandas-how-to-efficiently-assign-unique-id-to-individuals-with-multiple-ent
user_ht_links_agg = user_ht_links_agg.assign(
    uid_matrixid=user_ht_links_agg.groupby('uid').ngroup(),
    ht_matrixid=user_ht_links_agg.groupby('ht').ngroup(),
)

In [34]:
# Save the uid -> uid_matrixid and ht -> ht_matrixid lookups so that matrix
# rows and columns can be mapped back to the original values later.
for _key, _idx, _path in (('uid', 'uid_matrixid', "bipartite_uid_map.csv"),
                          ('ht', 'ht_matrixid', "bipartite_ht_map.csv")):
    _pair = [_key, _idx]
    # First row of each (key, matrix id) group, i.e. one line per distinct pair.
    user_ht_links_agg[_pair].groupby(_pair).head(1).to_csv(_path, index=False)

In [35]:
# ok, cool, now we can write out the matrix needed for the bipartite pairs stuff
# this matrix is constructed oddly, you can look here for easier to understand examples: 
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html
from scipy import sparse, io
# COO triplets: the value is the link weight, the row index is the hashtag id
# and the column index is the user id, so the matrix is hashtag-by-user.
_weights = user_ht_links_agg.linkweight.values.tolist()
_row_idx = user_ht_links_agg.ht_matrixid.values
_col_idx = user_ht_links_agg.uid_matrixid.values
output_matrix = sparse.coo_matrix((_weights, (_row_idx, _col_idx)))


In [36]:
# Display the matrix dimensions: rows are indexed by ht_matrixid and columns
# by uid_matrixid.
output_matrix.shape

(2778, 337139)

In [37]:
from sklearn.cluster import SpectralCoclustering

# Fit spectral co-clustering on the hashtag-by-user matrix, asking for 100
# clusters. random_state=0 fixes the estimator's seed.
clustering = SpectralCoclustering(n_clusters=100, random_state=0).fit(output_matrix)


In [38]:
# One row per distinct (ht, ht_matrixid) pair, ordered by matrix row id.
_ht_cols = ['ht', 'ht_matrixid']
_ht_first = user_ht_links_agg[_ht_cols].groupby(_ht_cols).head(1)
mat = _ht_first.sort_values(by="ht_matrixid")

In [39]:
# Attach the row-cluster label of each hashtag. The labels are assigned by
# position, so this relies on `mat` being sorted by ht_matrixid, the same id
# used as the row index of the clustered matrix
# (assumes ht_matrixid runs 0..n-1 with exactly one row each — TODO confirm).
mat = mat.assign(labs = clustering.row_labels_)

In [54]:
def print_row(c):
    r"""Print one cluster as a row of a LaTeX table.

    Parameters
    ----------
    c : pandas.DataFrame
        The rows of a single cluster. It must provide a ``labs`` column (only
        the first value is used, as the cluster label) and an ``ht`` column of
        strings.

    Returns
    -------
    None
        The row is written to stdout in the form
        ``<label> & <ht1>, <ht2>, ... \\ \hline``. Ampersands inside the
        hashtags are escaped as ``\&`` so they are not read as column
        separators.
    """
    label = str(c.labs.iloc[0])
    # "\\&" writes the backslash explicitly; "\&" is an invalid escape sequence
    # that only works by accident and triggers a warning on recent Pythons.
    hashtags = ", ".join(c.ht).replace("&", "\\&")
    print(label + " & " + hashtags + " \\\\ \\hline")
# Print one LaTeX row per cluster, in ascending label order.
# The groups are iterated explicitly instead of using `groupby.apply`, which
# is not meant for side effects: older pandas may call the function twice on
# the first group (duplicating its row), and newer pandas drops the grouping
# column `labs` from the frame handed to the function.
for _label, _cluster_rows in mat.groupby("labs"):
    print_row(_cluster_rows)

0 & american, army vet, army veteran, bad, catholic, conservative, constitutional conservative, dentist, father of 2, father of 4, father of 5, father of four, father of three, father of two, graphic, happy husband, husband of 1, independent thinker, libertarian, livin' the dream, loving husband, lucky husband, marine, native texan, navy vet, navy veteran, pro-life, professional speaker, property manager, proud american, proud father, real estate agent, real estate professional, republican, romans 8:28, steelers fan, truth seeker \\ \hline
1 & dignity, proverbs 31:25 \\ \hline
2 & imperfection is beauty, madness is genius \\ \hline
3 & play harder \\ \hline
4 & member finra, sipc \\ \hline
5 & only \\ \hline
6 & 49ers, all the time, america, angels, astros, bruins, celtics, constitution, cosmetic, cowboys, dodgers, dolphins, ducks, field, giants, go vols, god is good, heat, jets, junior, keeppounding, kings, knicks, lakers, mets, military, new jersey, notre dame, ny giants, panthers, p