## Join EdCast Clusters to Wiki Metrics

**Author:** Jim Maddock

**Last Updated:** 12-15-20

**Description:** Merge the test EdCast clustered dataset (e.g. 1300 clusters based on vital 1k articles) with various per-article metrics and aggregate them per cluster.  These metrics include:
* mean number of editors
* mean number of pageviews
* mean quality
* mean misalignment 

In [2]:
import pandas as pd
import json

In [3]:
# Directory holding the per-page metric TSVs. Edit this one line to point the
# notebook at a different copy of the data.
METRICS_DIR = '/Users/klogg/research_data/wmf_knowledge_graph/metrics'

# pd.read_table reads tab-separated files and takes column names from the
# first row; the later cells join these frames on 'page_id' / 'page_title'.
editors_df = pd.read_table(METRICS_DIR + '/enwiki_pages_editusercount.tsv')
pagesviews_df = pd.read_table(METRICS_DIR + '/enwiki_pageviews.tsv')
title_id_map = pd.read_table(METRICS_DIR + '/page_title_id_map.tsv')

In [4]:
# Path to the clustered-articles JSON (the "wiki_1300_72k" run) that is passed to loadClusters.
FILEPATH = '/Users/klogg/research_data/wmf_knowledge_graph/wiki_1300_72k_7-14-20/wiki_1300_72k_logs_5434743174174776460_taxo.json'

def loadClusters(filepath):
    """Load the cluster JSON file into a flat DataFrame.

    The file must contain a JSON object whose values each provide a 'label',
    a 'cluster' id and an 'items' list; every item must provide a weight 'w'
    and a 'title'.

    Parameters
    ----------
    filepath : str
        Path to the cluster JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per (cluster, article) with the columns 'label', 'cluster'
        (converted with pd.to_numeric), 'w' and 'page_title' (the article
        title with spaces replaced by underscores). Rows are numbered
        0..n-1. A file without any articles yields an empty frame that still
        has these four columns.

    Raises
    ------
    KeyError
        If a cluster or an item lacks one of the expected keys.
    """
    columns = ['label', 'cluster', 'w', 'page_title']

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding='utf-8') as json_file:
        clusters = json.load(json_file)

    # Collect every row first and build the frame once. Appending one
    # DataFrame per cluster re-copies all accumulated rows on each iteration,
    # and DataFrame.append is no longer available in pandas 2.x.
    rows = [
        {
            'label': cluster['label'],
            'cluster': cluster['cluster'],
            'w': article['w'],
            # Underscore form, so titles can be joined on 'page_title'.
            'page_title': article['title'].replace(' ', '_'),
        }
        for cluster in clusters.values()
        for article in cluster['items']
    ]

    # Passing the column list keeps the schema stable even when rows is empty.
    df = pd.DataFrame(rows, columns=columns)
    df['cluster'] = pd.to_numeric(df['cluster'])

    return df

# Flatten the cluster JSON at FILEPATH into a DataFrame with one row per clustered article.
clusters_df = loadClusters(FILEPATH)

In [10]:
# Report the size of every input table before joining them.
for description, frame in [
    ("pages with editor counts: {0}", editors_df),
    ("pages with pageview counts: {0}", pagesviews_df),
    ("pages in id/title map: {0}", title_id_map),
    ("pages in edcast clusters: {0}", clusters_df),
]:
    print(description.format(len(frame)))

# Clustered titles without an entry in the id/title map cannot be matched by
# the inner join on 'page_title', so count them up front.
has_page_id = clusters_df['page_title'].isin(title_id_map['page_title'])
print("clustered articles not in id/title map: {0}".format(len(clusters_df.loc[~has_page_id])))

pages with editor counts: 6129560
pages with pageview counts: 6091006
pages in id/title map: 15588706
pages in edcast clusters: 72418
clustered articles not in id/title map: 35


In [27]:
# Number of joined rows per page title; a count above 1 means the title
# appears more than once after matching clusters to the id/title map.
title_matches = title_id_map.merge(clusters_df, on='page_title', how='inner')
dup_count = title_matches.groupby('page_title').size().to_frame('count')

is_duplicated = dup_count['count'] > 1
len(dup_count.loc[is_duplicated])

139

In [12]:
# Attach page ids to the clustered titles, then the editor and pageview
# metrics. Every join is an inner join, so pages missing from any table are
# dropped. Afterwards keep only the first row per title and per page id.
merged = (
    title_id_map
    .merge(clusters_df, on='page_title', how='inner')
    .merge(editors_df, on='page_id', how='inner')
    .merge(pagesviews_df, on='page_id', how='inner')
    .drop_duplicates(subset='page_title', keep='first')
    .drop_duplicates(subset='page_id', keep='first')
)

In [15]:
# All three summaries share the same (cluster, label) grouping.
cluster_groups = merged.groupby(['cluster', 'label'])

editors_per_cluster = cluster_groups['num_unique_registered_users'].mean().to_frame('mean_editors')
articles_per_cluster = cluster_groups.size().to_frame('num_articles')
pageviews_per_cluster = cluster_groups['monthly_pageviews'].mean().to_frame('mean_pageviews')

# Combine the per-cluster summaries side by side on their shared index.
result = (
    editors_per_cluster
    .merge(articles_per_cluster, left_index=True, right_index=True)
    .merge(pageviews_per_cluster, left_index=True, right_index=True)
)

result.sort_values('mean_editors')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_editors,num_articles,mean_pageviews
cluster,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
643,braille guarani greek zambian,12.351351,74,253.945946
261,komos cara aitchison leotiomycetes,17.800000,10,363.600000
1074,plant aenus bank greece,23.214286,14,675.285714
262,dialect frisian yue language,24.472527,91,310.274725
216,act agabus perpetual foreigner,24.641026,39,1491.153846
...,...,...,...,...
808,music american rock gary,934.282051,117,160411.495726
138,unite state american movement,995.764045,267,118475.872659
769,culture australia people canada,1078.241667,120,115760.375000
968,new massachusetts california city,1134.102190,137,90461.751825


In [21]:
# Per-article table with the per-cluster aggregates broadcast onto every row.
#
# `expanded_result = merged` would only create a second name for the same
# object, so assigning columns through it would also add them to `merged`
# and leave stale columns behind whenever this cell is edited and re-run.
# .assign returns a new frame and leaves `merged` untouched.
grouped = merged.groupby(['cluster','label'])
expanded_result = merged.assign(
    # transform() returns one value per original row, aligned on merged's index.
    mean_editors=grouped['num_unique_registered_users'].transform('mean'),
    num_articles=grouped['num_unique_registered_users'].transform('size'),
    mean_pageviews=grouped['monthly_pageviews'].transform('mean'),
)
expanded_result.sort_values('label')

Unnamed: 0,page_id,page_title,label,cluster,w,num_revisions,num_unique_registered_users,monthly_pageviews,mean,mean_editors,num_articles,mean_pageviews
48090,19096964,Original_Rags,139p vaisala oterma richard,1210,0.416053,58,26,215.0,33.687500,33.687500,16,765.375
71494,21323231,Zakuski,139p vaisala oterma richard,1210,0.374100,147,54,3666.0,33.687500,33.687500,16,765.375
86,4118466,139P/Väisälä–Oterma,139p vaisala oterma richard,1210,0.419381,28,20,20.0,33.687500,33.687500,16,765.375
36058,467875,Landmass,139p vaisala oterma richard,1210,0.313165,125,63,4505.0,33.687500,33.687500,16,765.375
33109,34570564,James_R._Oestreich,139p vaisala oterma richard,1210,0.479937,93,17,84.0,33.687500,33.687500,16,765.375
...,...,...,...,...,...,...,...,...,...,...,...,...
47265,4153951,Oceanarium,zoo list park aquarium,1181,0.483975,120,45,1296.0,152.676471,152.676471,34,4585.500
3777,26549598,Animal_theme_park,zoo list park aquarium,1181,0.528650,151,42,1438.0,152.676471,152.676471,34,4585.500
11394,193777,Chapultepec,zoo list park aquarium,1181,0.455889,538,164,5309.0,152.676471,152.676471,34,4585.500
39411,69841,London_Zoo,zoo list park aquarium,1181,0.662246,1720,394,9154.0,152.676471,152.676471,34,4585.500
