# Wikipedia Sandbox: Analysis

In [1]:
import json
import datetime
import requests
import pandas as pd
from tqdm.notebook import tqdm
import time
import os
from collections import defaultdict, Counter
from pathlib import Path
import pickle
from loguru import logger

with open('wmapicred.json', 'r') as f:
    credentials = json.load(f)

## Top 100

In [2]:
# Load in Data
datapath = Path('./Data/Wikipedia/')

langfile = datapath / 'metoo_base_lang+pageprops.json'
originalfile = datapath / 'metoo_base.json'
allcatfile = datapath / 'cat_per_article.pkl'

# get order of relevance from original base file:
with open(originalfile, 'r') as f:
    ORIGINAL_DATA = json.load(f)
logger.info('Original Data Loaded In.')

with open(langfile, 'r') as f:
    LANGDATA = json.load(f)
logger.info('Language Data Loaded In.')

# get categories
with open(allcatfile, 'rb') as f:
    CATS = pickle.load(f)
logger.info('Category Data Loaded In.')

[32m2024-01-29 14:51:49.238[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mOriginal Data Loaded In.[0m


[32m2024-01-29 14:51:52.096[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mLanguage Data Loaded In.[0m
[32m2024-01-29 14:51:52.947[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mCategory Data Loaded In.[0m


In [3]:
# obtain relevance order from ordering in original data
ORDER = [str(i['pageid']) for i in ORIGINAL_DATA]

In [4]:
# Process Langfile for lang data

LANGLINKS = []
for entry in tqdm(LANGDATA, desc='First Pass for Language Versions'):
    for k, v in entry['query']['pages'].items():
        assert k == str(v['pageid']), print(k,v)
        if 'langlinks' in v:
            for ll in v['langlinks']:
                LANGLINKS.append({
                    'pageid':k,
                    'title':v['title'],
                    'langname':ll['langname'],
                    'lang': ll['lang'],
                    'url': ll['url'],
                })

First Pass for Language Versions:   0%|          | 0/4235 [00:00<?, ?it/s]

In [6]:
LINKS = Counter()
for entry in tqdm(LANGDATA, desc='Second Pass for Language Versions'):
    for k, v in entry['query']['pages'].items():
        assert k == str(v['pageid']), print(k,v)
        if 'links' in v:
            LINKS[k] += len(v['links'])

Second Pass for Language Versions:   0%|          | 0/4235 [00:00<?, ?it/s]

In [7]:
LINKS

Counter({'1987615': 7474,
         '9982390': 5740,
         '160791': 5151,
         '11185': 4080,
         '5052197': 4020,
         '142056': 3500,
         '153557': 3384,
         '67163851': 3312,
         '15852': 3264,
         '203407': 3184,
         '345095': 3124,
         '249465': 2947,
         '7818598': 2899,
         '159599': 2890,
         '52737': 2880,
         '1980240': 2842,
         '78781': 2816,
         '165113': 2736,
         '9288': 2576,
         '29812': 2566,
         '3382': 2552,
         '59720': 2548,
         '50420': 2508,
         '149709': 2416,
         '8300': 2368,
         '53257': 2308,
         '3508629': 2262,
         '72336': 2252,
         '154265': 2212,
         '391887': 2184,
         '5301416': 2166,
         '81425': 2166,
         '735009': 2146,
         '140461': 2136,
         '17741': 2096,
         '1563002': 2070,
         '16175': 2050,
         '52263': 2046,
         '1436': 2034,
         '38954428': 2030,
         

In [8]:
CATS['1987615']

{'Category:All Wikipedia articles written in Australian English',
 'Category:All articles to be expanded',
 'Category:All articles with empty sections',
 'Category:Articles to be expanded from September 2023',
 'Category:Articles using small message boxes',
 'Category:Articles with empty sections from September 2023',
 'Category:Articles with short description',
 'Category:Australian Broadcasting Corporation original programming',
 'Category:Australian television-related lists',
 'Category:Lists of television series by network',
 'Category:Short description is different from Wikidata',
 'Category:Use Australian English from September 2013',
 'Category:Use dmy dates from September 2013'}

In [9]:
df = pd.DataFrame.from_records(LANGLINKS)

# merge link count into thingy
df['links'] = df['pageid'].apply(lambda x: LINKS[x])

df['category_count'] = df['pageid'].apply(lambda x: len(CATS[x]))

df['categories'] = df['pageid'].apply(lambda x: CATS[x])

In [10]:
df.head()

Unnamed: 0,pageid,title,langname,lang,url,links,category_count,categories
0,439693,Asia Argento,Arabic,ar,https://ar.wikipedia.org/wiki/%D8%A2%D8%B3%D9%...,285,60,"{Category:Use dmy dates from October 2021, Cat..."
1,439693,Asia Argento,Egyptian Arabic,arz,https://arz.wikipedia.org/wiki/%D8%A2%D8%B3%D9...,285,60,"{Category:Use dmy dates from October 2021, Cat..."
2,439693,Asia Argento,Asturian,ast,https://ast.wikipedia.org/wiki/Asia_Argento,285,60,"{Category:Use dmy dates from October 2021, Cat..."
3,439693,Asia Argento,Bulgarian,bg,https://bg.wikipedia.org/wiki/%D0%90%D0%B7%D0%...,285,60,"{Category:Use dmy dates from October 2021, Cat..."
4,439693,Asia Argento,Breton,br,https://br.wikipedia.org/wiki/Asia_Argento,285,60,"{Category:Use dmy dates from October 2021, Cat..."


In [11]:
pd.set_option("display.max_rows", 100)

In [12]:
df[df['title'] == 'MeToo movement']

Unnamed: 0,pageid,title,langname,lang,url,links,category_count,categories
311,55551931,MeToo movement,Arabic,ar,https://ar.wikipedia.org/wiki/%D8%A3%D9%86%D8%...,449,45,"{Category:Articles with J9U identifiers, Categ..."
312,55551931,MeToo movement,Assamese,as,https://as.wikipedia.org/wiki/%E0%A6%AE%E0%A6%...,449,45,"{Category:Articles with J9U identifiers, Categ..."
313,55551931,MeToo movement,Azerbaijani,az,https://az.wikipedia.org/wiki/Me_Too_h%C9%99r%...,449,45,"{Category:Articles with J9U identifiers, Categ..."
314,55551931,MeToo movement,Bulgarian,bg,https://bg.wikipedia.org/wiki/%D0%94%D0%B2%D0%...,449,45,"{Category:Articles with J9U identifiers, Categ..."
315,55551931,MeToo movement,Bhojpuri,bh,https://bh.wikipedia.org/wiki/%E0%A4%AE%E0%A5%...,449,45,"{Category:Articles with J9U identifiers, Categ..."
...,...,...,...,...,...,...,...,...
6467,55551931,MeToo movement,Urdu,ur,https://ur.wikipedia.org/wiki/%D9%85%DB%8C_%D9...,449,45,"{Category:Articles with J9U identifiers, Categ..."
6468,55551931,MeToo movement,Uzbek,uz,https://uz.wikipedia.org/wiki/Me_Too_harakati,449,45,"{Category:Articles with J9U identifiers, Categ..."
6469,55551931,MeToo movement,Vietnamese,vi,https://vi.wikipedia.org/wiki/Phong_tr%C3%A0o_...,449,45,"{Category:Articles with J9U identifiers, Categ..."
6470,55551931,MeToo movement,Chinese,zh,https://zh.wikipedia.org/wiki/%EF%BC%83MeToo,449,45,"{Category:Articles with J9U identifiers, Categ..."


In [17]:
saveto = './Data/Wikipedia/metoo_base_metoopage_only_langlinks.csv'

if not os.path.isfile(saveto):
    df.to_csv(saveto)
else:
    print('Already exists')

Already exists


In [14]:
df.groupby('title').count().sort_values(by='pageid', ascending=False)

Unnamed: 0_level_0,pageid,langname,lang,url,links,category_count,categories
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Feminism,5661,5661,5661,5661,5661,5661,5661
Celine Dion,3192,3192,3192,3192,3192,3192,3192
Elvis Presley,2992,2992,2992,2992,2992,2992,2992
Alfred Hitchcock,2641,2641,2641,2641,2641,2641,2641
Katharine Hepburn,2592,2592,2592,2592,2592,2592,2592
...,...,...,...,...,...,...,...
Me and Mr. Jonas and Mr. Jonas and Mr. Jonas,6,6,6,6,6,6,6
"One Hand, One Heart",6,6,6,6,6,6,6
Time's Up Legal Defense Fund,6,6,6,6,6,6,6
Django Jane,6,6,6,6,6,6,6


In [15]:
# getting relevance score too

# Define your custom function
def custom_function(group):
    return ORDER.index(str(group.name))

# Group by and apply the function
grouped = df.groupby('pageid').apply(custom_function).rename('relevance')

# Merge the new column back to the original DataFrame
df = df.merge(grouped, on='pageid')

In [28]:
df.groupby('title').first().sort_values(by='relevance').head(100).drop(['lang', 'langname', 'url'], axis=1).to_csv('./DAta/Wikipedia/metoo_base_top100_relevance.csv')

## Top 100 to do with a country

In [19]:
df[df['title'].apply(lambda x: 'MeToo movement in' in x)].groupby('title').first()

Unnamed: 0_level_0,pageid,langname,lang,url,links,category_count,categories,relevance
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MeToo movement in China,67493973,Greek,el,https://el.wikipedia.org/wiki/%CE%9A%CE%AF%CE%...,66,10,{Category:Articles with incomplete citations f...,37
MeToo movement in India,58717632,Arabic,ar,https://ar.wikipedia.org/wiki/%D8%AD%D8%B1%D9%...,498,13,"{Category:Use dmy dates from March 2019, Categ...",2
MeToo movement in Pakistan,62883008,Western Punjabi,pnb,https://pnb.wikipedia.org/wiki/%D9%85%DB%8C_%D...,60,9,"{Category:Socialism in Pakistan, Category:Wome...",28
MeToo movement in South Korea,64414806,Spanish,es,https://es.wikipedia.org/wiki/Movimiento_MeToo...,12,17,"{Category:Women's rights in South Korea, Categ...",94


In [22]:
df.groupby('title').first().sort_values(by='relevance')[100:200]

Unnamed: 0_level_0,pageid,langname,lang,url,links,category_count,categories,relevance
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Karthik (singer),5124517,Arabic,ar,https://ar.wikipedia.org/wiki/%D9%83%D8%A7%D8%...,220,23,"{Category:Short description matches Wikidata, ...",114
Bralette,38262540,Arabic,ar,https://ar.wikipedia.org/wiki/%D8%A8%D8%B1%D8%...,168,5,"{Category:Brassieres, Category:Women's clothin...",115
Cardi B,53594450,Afrikaans,af,https://af.wikipedia.org/wiki/Cardi_B,1480,94,"{Category:CS1 Spanish-language sources (es), C...",116
Chip Tsao,708657,Cebuano,ceb,https://ceb.wikipedia.org/wiki/Chip_Tsao,114,26,{Category:CS1 Chinese (Hong Kong)-language sou...,117
She Said (book),61758120,Spanish,es,https://es.wikipedia.org/wiki/She_Said_(libro),178,13,{Category:Sexual assaults in the United States...,118
Procter & Gamble,19629560,Arabic,ar,https://ar.wikipedia.org/wiki/%D8%A8%D8%B1%D9%...,1226,48,{Category:Companies listed on the New York Sto...,119
Promising Young Woman,60148050,Arabic,ar,https://ar.wikipedia.org/wiki/%D8%B4%D8%A7%D8%...,394,38,"{Category:British black comedy films, Category...",120
Elizabeth Perkins,333680,Afrikaans,af,https://af.wikipedia.org/wiki/Elizabeth_Perkins,234,34,"{Category:Articles with J9U identifiers, Categ...",121
Chris McCausland,7858934,French,fr,https://fr.wikipedia.org/wiki/Chris_McCausland,104,22,"{Category:1977 births, Category:Short descript...",122
Christina Baker Kline,22963663,Arabic,ar,https://ar.wikipedia.org/wiki/%D9%83%D8%B1%D9%...,36,44,"{Category:Articles with J9U identifiers, Categ...",123
