## Join EdCast Clusters to Wiki Metrics

**Author:** Jim Maddock

**Last Updated:** 12-15-20

**Description:** Merge the test EdCast clustered dataset (e.g. 1300 clusters based on the vital 1k articles) with various per-article Wikipedia metrics, then summarize those metrics per cluster. These metrics include:
* mean number of editors
* mean number of pageviews
* mean quality
* mean misalignment 

In [7]:
import pandas as pd
import json
import numpy as np

In [8]:
# Tab-separated input tables. Later cells join them on `page_id`
# (and on `page_title` for the id/title map).
editors_df = pd.read_csv(
    '/Users/klogg/research_data/wmf_knowledge_graph/metrics/enwiki_pages_editusercount.tsv',
    sep='\t',
)
pagesviews_df = pd.read_csv(
    '/Users/klogg/research_data/wmf_knowledge_graph/metrics/enwiki_pageviews.tsv',
    sep='\t',
)
title_id_map = pd.read_csv(
    '/Users/klogg/research_data/wmf_knowledge_graph/metrics/page_title_id_map.tsv',
    sep='\t',
)

In [9]:
# Load the quality scores and keep a single row per page: after ordering by
# (page_id, timestamp), the last row of each page_id is the one retained.
quality_df = (
    pd.read_csv('/Users/klogg/research_data/wmf_knowledge_graph/metrics/quality_scores_small_INCOMPLETE_1-4-21.csv')
    .sort_values(by=['page_id', 'timestamp'])
    .drop_duplicates(subset='page_id', keep='last')
)

In [10]:
# Location of the cluster JSON file that is passed to loadClusters() at the end of this cell.
FILEPATH = '/Users/klogg/research_data/wmf_knowledge_graph/wiki_1300_72k_7-14-20/wiki_1300_72k_logs_5434743174174776460_taxo.json'

def loadClusters(filepath):
    """Load the clustered-article JSON file into a flat DataFrame.

    The file must contain a JSON object whose values each describe one
    cluster through the keys ``label``, ``cluster`` and ``items``; every
    entry of ``items`` provides a ``w`` value and a ``title``.

    Parameters
    ----------
    filepath : str
        Path of the cluster JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per (cluster, article) with the columns ``label``,
        ``cluster`` (converted with ``pd.to_numeric``), ``w`` and
        ``page_title`` (the article title with spaces replaced by
        underscores).  Rows follow file order and the index is a plain
        0..n-1 range.  A file without any articles yields an empty frame
        that still has these four columns.
    """
    columns = ['label', 'cluster', 'w', 'page_title']

    with open(filepath) as json_file:
        clusters = json.load(json_file)

    # Gather plain records and build the frame once.  Growing a DataFrame
    # with append() inside the loop copies it on every iteration, and
    # DataFrame.append is not available in pandas 2.x.
    records = [
        {
            'label': cluster['label'],
            'cluster': cluster['cluster'],
            'w': article['w'],
            'page_title': article['title'],
        }
        for cluster in clusters.values()
        for article in cluster['items']
    ]

    # Passing the column list keeps the schema stable for empty input.
    df = pd.DataFrame(records, columns=columns)

    df['page_title'] = df['page_title'].str.replace(' ', '_', regex=False)
    df['cluster'] = pd.to_numeric(df['cluster'])

    return df

# One row per (cluster, article): label, cluster, w, page_title.
clusters_df = loadClusters(FILEPATH)

In [13]:
# Row counts of every input table, reported in a fixed order.
for template, frame in (
    ("pages with editor counts: {0}", editors_df),
    ("pages with pageview counts: {0}", pagesviews_df),
    ("pages in id/title map: {0}", title_id_map),
    ("pages in edcast clusters: {0}", clusters_df),
):
    print(template.format(len(frame)))

# Clustered titles that have no counterpart in the id/title map.
not_in_map = ~clusters_df['page_title'].isin(title_id_map['page_title'])
print("clustered articles not in id/title map: {0}".format(int(not_in_map.sum())))

pages with editor counts: 6129560
pages with pageview counts: 6091006
pages in id/title map: 15588706
pages in edcast clusters: 72418
clustered articles not in id/title map: 35


In [None]:
# For every clustered title, count how many rows the title -> id join produces.
dup_count = (
    title_id_map
    .merge(clusters_df, how='inner', on='page_title')
    .groupby('page_title')
    .size()
    .to_frame('count')
)

# Number of titles that produced more than one joined row.
int((dup_count['count'] > 1).sum())

In [11]:
# Attach page ids to the clustered titles, then pull in each metric table by
# page_id.  Every join is an inner join, so only articles present in all of
# the tables survive.  Finally keep the first row per title and per page id.
merged = (
    title_id_map
    .merge(clusters_df, how='inner', on='page_title')
    .merge(editors_df, how='inner', on='page_id')
    .merge(pagesviews_df, how='inner', on='page_id')
    .merge(quality_df, how='inner', on='page_id')
    .drop_duplicates(subset='page_title', keep='first')
    .drop_duplicates(subset='page_id', keep='first')
)

In [12]:
# Destination of the flat article-level table (one row per article after the
# de-duplication applied to `merged`).
OUTFILE = '/Users/klogg/research_data/wmf_knowledge_graph/metrics/article_list_1-4-21.csv'

# index=False: the DataFrame index is not written as a CSV column.
merged.to_csv(OUTFILE,index=False)

In [6]:
# Group once by (cluster, label); every per-cluster summary below is derived
# from this same grouping.
by_cluster = merged.groupby(['cluster', 'label'])

editors_per_cluster = by_cluster['num_unique_registered_users'].mean().to_frame('mean_editors')
articles_per_cluster = by_cluster.size().to_frame('num_articles')
pageviews_per_cluster = by_cluster['monthly_pageviews'].mean().to_frame('mean_pageviews')
quality_per_cluster = by_cluster['weighted_sum'].mean().to_frame('mean_quality')

# Stitch the summaries together, one column at a time, on the shared
# (cluster, label) index.
result = editors_per_cluster
for summary in (articles_per_cluster, pageviews_per_cluster, quality_per_cluster):
    result = result.merge(summary, left_index=True, right_index=True)

# Show the clusters ordered from fewest to most mean editors.
result.sort_values(by='mean_editors')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_editors,num_articles,mean_pageviews,mean_quality
cluster,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
546,parmenas banu goshasp gezhouba,16.000000,2,136.500000,0.442271
186,ride valkyries home improvement,16.500000,2,121.500000,0.380154
797,phyllodulcin amylopectin retrogradation starch,18.250000,4,844.250000,0.553795
327,woman suffrage liechtenstein referendum,18.375000,8,223.000000,0.559921
1188,targum ketuvim timeline hindu,18.750000,4,1109.500000,1.089103
...,...,...,...,...,...
649,boy blue band brother,1271.065217,46,149154.152174,3.890207
138,unite state american movement,1302.090909,110,168451.209091,3.542260
769,culture australia people canada,1353.164179,67,155618.686567,3.331263
968,new massachusetts california city,1545.293103,58,130177.000000,3.480924


In [7]:
# Article-level table with the per-cluster summaries broadcast onto every row.
# `assign` returns a new frame, so `merged` itself is left untouched.  A plain
# `expanded_result = merged` would only alias it, and the new columns would
# leak into `merged` and into any cell that is re-run afterwards.
grouped = merged.groupby(['cluster','label'])
expanded_result = merged.assign(
    mean_editors=grouped['num_unique_registered_users'].transform('mean'),
    # 'size' counts the rows of each group, i.e. the articles per cluster.
    num_articles=grouped['num_unique_registered_users'].transform('size'),
    mean_pageviews=grouped['monthly_pageviews'].transform('mean'),
    mean_quality=grouped['weighted_sum'].transform('mean'),
)
expanded_result.sort_values('label')

Unnamed: 0,page_id,page_title,label,cluster,w,num_revisions,num_unique_registered_users,monthly_pageviews,title,rev_id,timestamp,prediction,weighted_sum,mean_editors,num_articles,mean_pageviews,mean_quality
21150,19096964,Original_Rags,139p vaisala oterma richard,1210,0.416053,58,26,215.0,Original Rags,696393084,20160801000000,Start,1.652386,35.083333,12,593.416667,1.067935
24415,32125804,Rinder,139p vaisala oterma richard,1210,0.496906,5,4,70.0,Rinder,697704412,20160801000000,Stub,0.136505,35.083333,12,593.416667,1.067935
1698,35760624,Anna_Lomax_Wood,139p vaisala oterma richard,1210,0.506393,66,27,165.0,Anna Lomax Wood,721652555,20160801000000,B,2.450667,35.083333,12,593.416667,1.067935
13143,3865020,Houshang_Moradi_Kermani,139p vaisala oterma richard,1210,0.452922,92,46,176.0,Houshang Moradi Kermani,710582487,20160801000000,Start,0.807556,35.083333,12,593.416667,1.067935
39,4118466,139P/Väisälä–Oterma,139p vaisala oterma richard,1210,0.419381,28,20,20.0,139P/Väisälä–Oterma,712513175,20160801000000,Stub,0.071225,35.083333,12,593.416667,1.067935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10754,13149835,Frozen_zoo,zoo list park aquarium,1181,0.384426,138,54,1255.0,Frozen zoo,724385262,20160801000000,C,2.554291,107.294118,17,2551.764706,2.213387
18093,4794229,Marine_mammal_park,zoo list park aquarium,1181,0.358035,174,61,1116.0,Marine mammal park,718448343,20160801000000,C,2.135399,107.294118,17,2551.764706,2.213387
28395,2854621,Themed_Entertainment_Association,zoo list park aquarium,1181,0.405148,163,53,972.0,Themed Entertainment Association,727952392,20160801000000,B,2.695290,107.294118,17,2551.764706,2.213387
5052,12219596,Charta_Oecumenica,zoo list park aquarium,1181,0.283274,18,12,127.0,Charta Oecumenica,664048827,20160801000000,Stub,0.079561,107.294118,17,2551.764706,2.213387


In [17]:
# Output path for the JSON tree built from the full `expanded_result` table;
# it is written at the end of this cell.
JSON_FILENAME = '/Users/klogg/research_data/wmf_knowledge_graph/metrics/joined_metrics_1-5-21.json'

def expanded_results_to_json(df, root_name='edcast_taxo_7-27-20'):
    """Convert the article-level table into a nested root -> cluster -> page dict.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per article.  The columns read are ``cluster``, ``label``,
        ``mean_editors``, ``num_articles``, ``mean_pageviews``,
        ``mean_quality``, ``page_id``, ``page_title``, ``num_revisions``,
        ``num_unique_registered_users``, ``monthly_pageviews``,
        ``prediction`` and ``weighted_sum``.
    root_name : str, optional
        Value stored under ``'name'`` on the root node.  Defaults to the
        name that was previously hard-coded.

    Returns
    -------
    dict
        ``{'name': root_name, 'children': [cluster_node, ...]}``.  Cluster
        nodes appear in order of first appearance in ``df``; the first row
        seen for a cluster supplies its cluster-level fields.  Each cluster
        node lists its articles under ``'children'`` and every article node
        carries a constant ``'value'`` of 1.
    """
    clusters = {}

    # itertuples keeps each column's own type, whereas iterrows coerces every
    # row into a single Series dtype, and it avoids building a Series per row.
    for row in df.itertuples(index=False):
        node = clusters.get(row.cluster)
        if node is None:
            node = {
                'name': row.label,
                'cluster': row.cluster,
                'mean_editors': row.mean_editors,
                'num_articles': row.num_articles,
                'mean_pageviews': row.mean_pageviews,
                'mean_quality': row.mean_quality,
                'children': [],
            }
            clusters[row.cluster] = node

        node['children'].append({
            'page_id': row.page_id,
            'name': row.page_title,
            'num_revisions': row.num_revisions,
            'num_unique_registered_users': row.num_unique_registered_users,
            'monthly_pageviews': row.monthly_pageviews,
            'quality_prediction': row.prediction,
            'quality_weighted_sum': row.weighted_sum,
            'value': 1,
        })

    return {
        'name': root_name,
        'children': list(clusters.values()),
    }

# Build the payload before opening the file: open(..., 'w') truncates the
# target immediately, so a failure during conversion would otherwise leave an
# empty file behind.
json_dict = expanded_results_to_json(expanded_result)
with open(JSON_FILENAME,'w') as outfile:
    json.dump(json_dict,outfile)

In [30]:
JSON_FILENAME = '/Users/klogg/research_data/wmf_knowledge_graph/metrics/joined_metrics_small_1-5-21.json'

# Tunables for the small export.  A fixed seed makes the sampled clusters, and
# therefore the written file, reproducible across kernel restarts.
SAMPLE_SIZE = 10
SAMPLE_SEED = 0

cluster_ids = expanded_result['cluster'].unique()
clusters_to_sample = np.random.RandomState(SAMPLE_SEED).choice(
    cluster_ids,
    # Sampling without replacement raises if more items are requested than
    # exist, so cap the request at the number of available clusters.
    size=min(SAMPLE_SIZE, len(cluster_ids)),
    replace=False,
)
sample_expanded_results = expanded_result.loc[expanded_result['cluster'].isin(clusters_to_sample)]

# Build the payload first so a conversion error cannot leave a truncated file.
json_dict = expanded_results_to_json(sample_expanded_results)
with open(JSON_FILENAME,'w') as outfile:
    json.dump(json_dict,outfile)

array([ 107,  910,  991, 1112,  863,   75, 1207,  958, 1133, 1264])