In [1]:
import pandas as pd
from _clustergram import Clustergram
import networkx as nx

In [2]:
# Parse every weekly results export in the results folder into one row per vote.
import os

RESULTS_DIR = 'results'


def _parse_entry(entry, week):
    """Parse the text of one submission into a list of per-voter dicts.

    `entry` is the text that follows one 'Album art' marker. It is split into
    stripped lines and the fields are read from fixed line positions (title at
    4, artist at 6, album at 8, total votes at 11), each shifted by one when
    the title wraps onto a second line.
    NOTE(review): these positions presumably mirror the layout of the exported
    results pages - confirm against a sample file before changing them.

    A message is printed when a voter's lines run past the end of the entry,
    and when the parsed votes do not add up to the reported total.

    Args:
        entry: Text of a single submission.
        week: Week number copied into every returned row.

    Returns:
        One dict per voter with the keys week, title, artist, album,
        total_votes, submitter, voter, vote and comment.
    """
    elements = [line.strip() for line in entry.split('\n')]

    # A non-empty line right after the title means the title wrapped; every
    # later fixed position then moves down by one.
    offset = 0
    title = elements[4]
    if elements[5] != '':
        title = title + ' ' + elements[5]
        offset = 1
    artist = elements[6 + offset]
    album = elements[8 + offset]
    total_votes = int(elements[11 + offset])

    # The first 'User avatar' marker belongs to the submitter, the rest to voters.
    avatar_positions = [i for i, e in enumerate(elements) if e == 'User avatar']
    submitter = elements[elements.index('User avatar') + offset + 3]

    entry_results = []
    vote_tally = 0
    for voter_id in avatar_positions[1:]:
        voter = elements[voter_id + offset + 1].replace('*', '')
        if voter == 'Send':
            # presumably the comment form rather than a real voter - TODO confirm
            continue
        try:
            comment = elements[voter_id + 3]
            vote_offset = 0
            if elements[voter_id + 4] != '':
                # The comment wrapped: append its second line (the original
                # code appended the first line again) and shift the vote row.
                comment = comment + ' ' + elements[voter_id + 4]
                vote_offset = 1
        except IndexError:
            print('ENTRY')
            print(f'Error in week {week} submission {title} processing vote for {voter}')
            print(elements[(voter_id):])
            continue
        try:
            vote = int(elements[voter_id + vote_offset + 6])
        except (ValueError, IndexError):
            # A missing or non-numeric vote line counts as zero votes.
            vote = 0
        vote_tally += vote
        entry_results.append({
            'week': week,
            'title': title,
            'artist': artist,
            'album': album,
            'total_votes': total_votes,
            'submitter': submitter,
            'voter': voter,
            'vote': vote,
            'comment': comment
        })

    # Sanity check: the individual votes should add up to the reported total.
    if vote_tally != total_votes:
        print(f'Error in week {week} submission {title} - vote tally is {vote_tally} but total votes is {total_votes}')
        for r in entry_results:
            print(f"{r['voter']}: {r['vote']}")
        print()
    return entry_results


all_results = []

# sorted() keeps the row order independent of the directory listing order.
for filename in sorted(os.listdir(RESULTS_DIR)):
    if not filename.endswith('.txt'):
        continue
    # The week number is the text between 'week' and the next '.' in the name.
    week = int(filename.split('week')[1].split('.')[0])
    with open(os.path.join(RESULTS_DIR, filename), 'r') as results_file:
        text = results_file.read()
    # Everything before the first 'Album art' marker is not a submission.
    for entry in text.split('Album art')[1:]:
        all_results.extend(_parse_entry(entry, week))

df = pd.DataFrame(all_results)
df

Unnamed: 0,week,title,artist,album,total_votes,submitter,voter,vote,comment
0,2,No Better,Lorde,No Better,26,skpurdue,gardenfractals,6,"* This song slaps, hadn't heard it before"
1,2,No Better,Lorde,No Better,26,skpurdue,Laura South,6,* love this song!!! good pick
2,2,No Better,Lorde,No Better,26,skpurdue,Dan Kerrigan,5,*
3,2,No Better,Lorde,No Better,26,skpurdue,Mac Creamer,3,*
4,2,No Better,Lorde,No Better,26,skpurdue,Connnnnor,3,*
...,...,...,...,...,...,...,...,...,...
368,6,Don't Judge Me,Janelle Monáe,Dirty Computer,17,skpurdue,Mac Creamer,2,* I had a hard time pulling the sample out of ...
369,6,Don't Judge Me,Janelle Monáe,Dirty Computer,17,skpurdue,Dan Kerrigan,2,*
370,6,Don't Judge Me,Janelle Monáe,Dirty Computer,17,skpurdue,Racquel Levia,1,*
371,6,Don't Judge Me,Janelle Monáe,Dirty Computer,17,skpurdue,gardenfractals,1,*


In [3]:
# Edge list: total votes each voter gave to each submitter, one row per
# (voter, submitter) pair, with rows whose voter name is blank filtered out.
adjacency = (
    df.groupby(['voter', 'submitter'])['vote']
    .sum()
    .reset_index()
    .loc[lambda edges: edges['voter'] != '']
)
adjacency

Unnamed: 0,voter,submitter,vote
1,Connnnnor,Dan Kerrigan,18
2,Connnnnor,Eduardo Puerta C,22
3,Connnnnor,Laura South,13
4,Connnnnor,Mac Creamer,17
5,Connnnnor,Racquel Levia,11
...,...,...,...
68,skpurdue,Laura South,5
69,skpurdue,Mac Creamer,12
70,skpurdue,Racquel Levia,15
71,skpurdue,Shani Spivak,18


In [4]:
# Reshape the edge list into a voter (rows) x submitter (columns) grid, put 0
# where a pair has no votes, then keep only the rows and columns that contain
# at least one non-zero value.
matrix = (
    adjacency
    .pivot_table(index='voter', columns='submitter', values='vote')
    .fillna(0)
    .pipe(lambda grid: grid.loc[grid.ne(0).any(axis=1), grid.ne(0).any(axis=0)])
)


In [5]:
# Save the edge list and the voter-by-submitter matrix as CSV files.
# The edge list carries its labels in columns, so its integer index is dropped.
adjacency.to_csv('edgelist.csv', index=False)
# The matrix index holds the voter names that label each row, so it is written
# out; with index=False the rows of matrix.csv could not be identified.
matrix.to_csv('matrix.csv')

In [6]:
# Label each submitter column with the total votes they received, e.g. 'name (42)'.
# The labels are applied to a copy so that `matrix` keeps its original column
# names and re-running this cell does not append the totals a second time.
new_names = [
    f'{submitter} ({int(total)})'
    for submitter, total in matrix.sum(axis=0).items()
]
labeled_matrix = matrix.copy()
labeled_matrix.columns = new_names

fig = Clustergram(
    data=labeled_matrix,
    column_labels=list(labeled_matrix.columns.values),
    row_labels=list(labeled_matrix.index),
    height=400,
    width=800,
    color_map = [
        [0, "#222"],
        [1, "aqua"]
    ],
    center_values=False,
)
fig.update_layout(template='plotly_dark')
# Update only the heatmap axes, not all axes:
fig.update_yaxes(title_text='Voters', selector={'anchor': 'x11'})
fig.update_xaxes(title_text='Submitters', selector={'anchor': 'y11'})

In [8]:
# Write the clustergram figure held in `fig` to clustergram.html.
fig.write_html('clustergram.html')

In [18]:
def get(voter: str, submitter: str) -> None:
    """Print how many votes `voter` gave `submitter`, song by song.

    Reads the notebook-level `df` and prints two lines: the summed votes for
    the pair, then each matching title with its vote count, highest vote first.

    Args:
        voter: Value to match in the 'voter' column.
        submitter: Value to match in the 'submitter' column.
    """
    is_pair = (df['voter'] == voter) & (df['submitter'] == submitter)
    # Sort into a new frame instead of in place: the filtered rows are a slice
    # of `df`, and sorting that slice in place raises SettingWithCopyWarning.
    subset = df[is_pair].sort_values(by='vote', ascending=False)
    print(f'{voter} gave {subset["vote"].sum()} votes to {submitter}')
    titles = subset['title'].values
    votes = subset['vote'].values
    print(f'Votes: {[f"{title} ({vote})" for title, vote in zip(titles, votes)]}')

In [20]:
# Example: list the votes Laura South gave to skpurdue's submissions.
get('Laura South', 'skpurdue')

Laura South gave 25 votes to skpurdue
Votes: ['No Better (6)', 'Rest in Peace (6)', 'American Pie (5)', "Don't Judge Me (5)", 'Surfing on a Sine Wave (3)']
