<a href="https://colab.research.google.com/github/ia234/Python_23/blob/main/Wikipedia_Top_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request as urllib2
import time
from datetime import datetime
import json
from urllib.parse import urlencode, quote_plus

In [2]:
TOP_API_URL = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'\
              'top/{lang}.{project}/all-access/{year}/{month}/{day}'

def get_traffic(year, month, day):
    '''\
    Get the traffic report for the top 1000 articles for a given day.
    TODO: Get from local file, if available
    '''
    url = TOP_API_URL.format(lang='ar',
                             project='wikipedia',
                             year=year,
                             month=month,
                             day=day)
#     if DEBUG:
#         print 'Getting %s' % url
    resp = urllib2.urlopen(url)
    resp_bytes = resp.read()
#         rec.success('Fetched {len_bytes} bytes from {url}',
#                     len_bytes=len(resp_bytes), url=url)

    data = json.loads(resp_bytes)
    articles = data['items'][0]['articles']
    return articles

In [36]:
single_day = get_traffic('2023', '03','20')
print('These are the top', len(single_day), 'articles on the specified date above:')

These are the top 999 articles on the specified date above:


In [37]:
single_day_df = pd.DataFrame(single_day)
single_day_df

Unnamed: 0,article,views,rank
0,يوم_الأم,73739,1
1,الصفحة_الرئيسية,63985,2
2,خاص:بحث,47311,3
3,قائمة_مباريات_الكلاسيكو,32586,4
4,الكلاسيكو,31780,5
...,...,...,...
994,ياسر_عرفات,668,995
995,حلبة_(نبات),668,995
996,كأس_الأمم_الإفريقية,668,995
997,بوابة:تصفح,668,995


996

In [4]:
MW_API_URL = 'https://{lang}.{project}.org/w/api.php?'
PREFIXES = ['Special', 'Template', 'Sp?cial', 'Project']

def get_wiki_info(lang, project):
    '''\
    Get the mainpage title and local namespace map.
    '''
    url = MW_API_URL.format(lang=lang, project=project)
    params = {'action': 'query',
              'meta': 'siteinfo',
              'format': 'json',
              'siprop': 'general|namespaces'}
    resp = urllib2.urlopen(url + urlencode(params))
    data = json.loads(resp.read())
    mainpage = data['query']['general']['mainpage'].replace(' ', '_')
    namespaces = [ns_info['*'].replace(' ', '_') for ns_id, ns_info in
                  data['query']['namespaces'].items() if ns_id is not 0]
    return {'mainpage': mainpage, 'namespaces': namespaces}


  data['query']['namespaces'].items() if ns_id is not 0]


In [5]:
def is_article(title, wiki_info):
    '''\
    Is it an article, or some other sort of page? We'll want to filter out the
    search page (Special:Search in English, etc) and similar pages appearing
    inconveniently in the traffic report.
    Skip xhamster. There are a few clues this Wikipedia traffic is artificial.
    See https://en.wikipedia.org/w/index.php?title=XHamster&diff=701682670&oldid=700826198
    '''
    skip = ['-', '404.php', 'XHamster'] + [wiki_info['mainpage']]
    prefixes = PREFIXES + wiki_info['namespaces']
    if title in skip:
        return False
    if title == "Media":
        return False
    if title == "Wikipedia":
        return False
    if title == "United_States_Senate":
        return False
    for prefix in prefixes:
        if title.startswith(prefix + ':'):
            return False
    return True

In [6]:
wiki_info = get_wiki_info('en', 'wikipedia')
raw_traffic = get_traffic('2020', '02', '29')
articles = [a for a in raw_traffic if is_article(a['article'], wiki_info)]

In [7]:
len(articles[:25])
articles

[{'article': 'الصفحة_الرئيسية', 'views': 94409, 'rank': 1},
 {'article': 'سنة_كبيسة', 'views': 57969, 'rank': 2},
 {'article': 'خاص:بحث', 'views': 51984, 'rank': 3},
 {'article': 'محمد_عمارة', 'views': 32059, 'rank': 4},
 {'article': 'محمد_حسين_طنطاوي', 'views': 16058, 'rank': 5},
 {'article': '29_فبراير', 'views': 13364, 'rank': 6},
 {'article': 'ديانا_كرزون', 'views': 13146, 'rank': 7},
 {'article': 'فيروس_كورونا', 'views': 10344, 'rank': 8},
 {'article': 'ستيفن_هوكينج', 'views': 9706, 'rank': 9},
 {'article': 'أذربيجان', 'views': 9676, 'rank': 10},
 {'article': 'تفشي_فيروس_كورونا_المستجد_2019–20', 'views': 9569, 'rank': 11},
 {'article': 'فيروس_كورونا_المستجد_2019', 'views': 9396, 'rank': 12},
 {'article': 'حسني_مبارك', 'views': 8610, 'rank': 14},
 {'article': 'فيروس_كورونا_المرتبط_بمتلازمة_الشرق_الأوسط_التنفسية',
  'views': 7635,
  'rank': 15},
 {'article': 'طلال_بن_سعود_بن_عبد_العزيز_آل_سعود', 'views': 7373, 'rank': 16},
 {'article': 'شيرين_وجدي', 'views': 7216, 'rank': 17},
 {'ar

In [8]:
import pandas as pd
import numpy as np

In [9]:
date1 = '2022-08-28'
date2 = '2023-08-28'
dates = pd.date_range(date1, date2).tolist()

In [10]:
dates[0].strftime('%Y')

'2022'

In [11]:
dates

[Timestamp('2022-08-28 00:00:00', freq='D'),
 Timestamp('2022-08-29 00:00:00', freq='D'),
 Timestamp('2022-08-30 00:00:00', freq='D'),
 Timestamp('2022-08-31 00:00:00', freq='D'),
 Timestamp('2022-09-01 00:00:00', freq='D'),
 Timestamp('2022-09-02 00:00:00', freq='D'),
 Timestamp('2022-09-03 00:00:00', freq='D'),
 Timestamp('2022-09-04 00:00:00', freq='D'),
 Timestamp('2022-09-05 00:00:00', freq='D'),
 Timestamp('2022-09-06 00:00:00', freq='D'),
 Timestamp('2022-09-07 00:00:00', freq='D'),
 Timestamp('2022-09-08 00:00:00', freq='D'),
 Timestamp('2022-09-09 00:00:00', freq='D'),
 Timestamp('2022-09-10 00:00:00', freq='D'),
 Timestamp('2022-09-11 00:00:00', freq='D'),
 Timestamp('2022-09-12 00:00:00', freq='D'),
 Timestamp('2022-09-13 00:00:00', freq='D'),
 Timestamp('2022-09-14 00:00:00', freq='D'),
 Timestamp('2022-09-15 00:00:00', freq='D'),
 Timestamp('2022-09-16 00:00:00', freq='D'),
 Timestamp('2022-09-17 00:00:00', freq='D'),
 Timestamp('2022-09-18 00:00:00', freq='D'),
 Timestamp

In [12]:
#array of 25 most popular articles
top_array = []

for date in dates:
    year=date.strftime('%Y')
    month = date.strftime('%m')
    day = date.strftime('%d')
    print(date)
#     wiki_info = get_wiki_info('en', 'wikipedia')
    raw_traffic = get_traffic(year, month, day)
    articles = [a for a in raw_traffic if is_article(a['article'], wiki_info)]
    top_array.append(articles[:25])

2022-08-28 00:00:00
2022-08-29 00:00:00
2022-08-30 00:00:00
2022-08-31 00:00:00
2022-09-01 00:00:00
2022-09-02 00:00:00
2022-09-03 00:00:00
2022-09-04 00:00:00
2022-09-05 00:00:00
2022-09-06 00:00:00
2022-09-07 00:00:00
2022-09-08 00:00:00
2022-09-09 00:00:00
2022-09-10 00:00:00
2022-09-11 00:00:00
2022-09-12 00:00:00
2022-09-13 00:00:00
2022-09-14 00:00:00
2022-09-15 00:00:00
2022-09-16 00:00:00
2022-09-17 00:00:00
2022-09-18 00:00:00
2022-09-19 00:00:00
2022-09-20 00:00:00
2022-09-21 00:00:00
2022-09-22 00:00:00
2022-09-23 00:00:00
2022-09-24 00:00:00
2022-09-25 00:00:00
2022-09-26 00:00:00
2022-09-27 00:00:00
2022-09-28 00:00:00
2022-09-29 00:00:00
2022-09-30 00:00:00
2022-10-01 00:00:00
2022-10-02 00:00:00
2022-10-03 00:00:00
2022-10-04 00:00:00
2022-10-05 00:00:00
2022-10-06 00:00:00
2022-10-07 00:00:00
2022-10-08 00:00:00
2022-10-09 00:00:00
2022-10-10 00:00:00
2022-10-11 00:00:00
2022-10-12 00:00:00
2022-10-13 00:00:00
2022-10-14 00:00:00
2022-10-15 00:00:00
2022-10-16 00:00:00


In [13]:
top_array[0][0]

{'article': 'الصفحة_الرئيسية', 'views': 77475, 'rank': 1}

In [14]:
dates_array = []
views_array = []
articles_array = []
ranks_array = []

for i in range(len(top_array)):
    date = dates[i]
    array = top_array[i]
    for j in range(25):
        row = array[j]
        dates_array.append(date)
        articles_array.append(row['article'].replace('_', ' '))
#         print(articles_array)
        views_array.append(row['views'])
        ranks_array.append(row['rank'])


In [15]:
len(dates_array)
# articles_array

9150

In [16]:
dict = {'date': dates_array, 'article': articles_array, 'views': views_array, 'rank': ranks_array}
df = pd.DataFrame(data=dict)
df

Unnamed: 0,date,article,views,rank
0,2022-08-28,الصفحة الرئيسية,77475,1
1,2022-08-28,جورج الراسي,49742,2
2,2022-08-28,خاص:بحث,44480,3
3,2022-08-28,ميدوسا,24442,4
4,2022-08-28,نادين الراسي,20156,5
...,...,...,...,...
9145,2023-08-28,نادي ليفربول,5546,21
9146,2023-08-28,صلاة الاستخارة,5512,22
9147,2023-08-28,ميا خليفة,5406,23
9148,2023-08-28,جاكسون موليكا,5399,24


In [17]:
df[df.article == "United States Senate"]

Unnamed: 0,date,article,views,rank


In [18]:
df.sort_values(by="views", ascending=False)

Unnamed: 0,date,article,views,rank
7200,2023-06-12,الصفحة الرئيسية,676881,1
275,2022-09-08,إليزابيث الثانية,488324,1
2100,2022-11-20,كأس العالم 2022,380824,1
300,2022-09-09,إليزابيث الثانية,351975,1
1450,2022-10-25,كسوف الشمس,318111,1
...,...,...,...,...
7947,2023-07-11,علي بن أبي طالب,3651,23
7823,2023-07-06,يوليو,3626,24
7948,2023-07-11,قائمة محطات مترو القاهرة,3607,24
7949,2023-07-11,مصر,3589,25


In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
df.groupby('article').count()

Unnamed: 0_level_0,date,views,rank
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023,1,1,1
2029,3,3,3
21 مارس,1,1,1
آر إم إس تيتانيك,12,12,12
آرثر ميلو,1,1,1
...,...,...,...
يوم المعلم,2,2,2
يوم المعلم العالمي,4,4,4
يوم النصر (تركيا),1,1,1
يوم عرفة,12,12,12


In [21]:
df.groupby('article').agg({'date':'first',
                         'views':'max',
                         'rank':'count'}).sort_values(by=['views'], ascending=False)

Unnamed: 0_level_0,date,views,rank
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
الصفحة الرئيسية,2022-08-28,676881,366
إليزابيث الثانية,2022-09-08,488324,22
كأس العالم 2022,2022-09-06,380824,109
كسوف الشمس,2022-10-23,318111,5
كأس العالم,2022-11-03,299466,58
...,...,...,...
أبشر,2022-08-28,3999,2
كوسوفو,2022-12-28,3901,1
عبد الصمد الزلزولي,2023-07-09,3877,1
الجمهورية العربية الصحراوية الديمقراطية,2022-08-28,3730,1


In [22]:
df.to_csv('wiki_pageviews_jan-jan.csv')