<a href="https://colab.research.google.com/github/ia234/Python_23/blob/main/Wikipedia_Top_Article_Titles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request as urllib2
import time
from datetime import datetime
import json
from urllib.parse import urlencode, quote_plus

In [2]:
TOP_API_URL = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'\
              'top/{lang}.{project}/all-access/{year}/{month}/{day}'


# The call get_traffic(‘2020’, ‘02’, ‘21’), for example, would output a JSON object with the 1000 most viewed Wikpedia articles on that day.
# The object contains the article’s title, the number of views, and its “rank” for that day in terms of views.
def get_traffic(year, month, day):
    '''\
    Get the traffic report for the top 1000 articles for a given day.
    TODO: Get from local file, if available
    '''
    url = TOP_API_URL.format(lang='ar',
                             project='wikipedia',
                             year=year,
                             month=month,
                             day=day)
#     if DEBUG:
#         print 'Getting %s' % url
    resp = urllib2.urlopen(url)
    resp_bytes = resp.read()
#         rec.success('Fetched {len_bytes} bytes from {url}',
#                     len_bytes=len(resp_bytes), url=url)

    data = json.loads(resp_bytes)
    articles = data['items'][0]['articles']
    return articles

## Single day top articles

In [3]:
specific_day = get_traffic('2023', '03','20')

print('These are the top', len(specific_day)+1, 'articles on the specified date above:')

These are the top 1000 articles on the specified date above:


In [4]:
from datetime import date

today = date.today()

yesterdays_top_articles_list = get_traffic(today.strftime("%Y"), today.strftime("%m"), date.today().day - 1)

yesterdays_top_articles_list

[{'article': 'الصفحة_الرئيسية', 'views': 81394, 'rank': 1},
 {'article': 'خاص:بحث', 'views': 44394, 'rank': 2},
 {'article': 'المولد_النبوي', 'views': 35302, 'rank': 3},
 {'article': 'نادي_النصر_(السعودية)', 'views': 29603, 'rank': 4},
 {'article': 'ثورة_26_سبتمبر_اليمنية', 'views': 18290, 'rank': 5},
 {'article': 'محمد', 'views': 17187, 'rank': 6},
 {'article': 'اليوم_الوطني_للمملكة_العربية_السعودية',
  'views': 16829,
  'rank': 7},
 {'article': 'ريال_مدريد', 'views': 15339, 'rank': 8},
 {'article': 'نادي_برشلونة', 'views': 14746, 'rank': 9},
 {'article': 'كأس_خادم_الحرمين_الشريفين', 'views': 13031, 'rank': 10},
 {'article': 'يوتيوب', 'views': 12800, 'rank': 11},
 {'article': 'نادي_بيراميدز', 'views': 8971, 'rank': 12},
 {'article': 'صلاة_الفجر', 'views': 8719, 'rank': 13},
 {'article': 'كريستال_(مسلسل)', 'views': 8544, 'rank': 14},
 {'article': 'عبد_العزيز_آل_سعود', 'views': 8276, 'rank': 15},
 {'article': 'آية_الكرسي', 'views': 7860, 'rank': 16},
 {'article': 'صلاة_الاستخارة', 'view

In [5]:
print('This is a', type(yesterdays_top_articles_list), 'that contains', len(yesterdays_top_articles_list), type(yesterdays_top_articles_list[0]), 'as rows.')

This is a <class 'list'> that contains 1000 <class 'dict'> as rows.


In [6]:
print('Each row is a', type(yesterdays_top_articles_list[0]), 'with', len(yesterdays_top_articles_list[0]), 'key-value pairs:', yesterdays_top_articles_list[0].keys())

Each row is a <class 'dict'> with 3 key-value pairs: dict_keys(['article', 'views', 'rank'])


In [7]:
yesterdays_top_articles_list[2]

{'article': 'المولد_النبوي', 'views': 35302, 'rank': 3}

In [8]:
import pandas as pd
import numpy as np

In [9]:
single_day_df = pd.DataFrame(yesterdays_top_articles_list)
single_day_df

Unnamed: 0,article,views,rank
0,الصفحة_الرئيسية,81394,1
1,خاص:بحث,44394,2
2,المولد_النبوي,35302,3
3,نادي_النصر_(السعودية),29603,4
4,ثورة_26_سبتمبر_اليمنية,18290,5
...,...,...,...
995,ناروتو,649,994
996,علم_قوس_قزح_(حراك_المثليين),648,997
997,الانتحار_في_الإسلام,648,997
998,معرض_أعلام_الدول,648,997


In [10]:
MW_API_URL = 'https://{lang}.{project}.org/w/api.php?'
PREFIXES = ['Special', 'Template', 'Sp?cial', 'Project']

def get_wiki_info(lang, project):
    '''\
    Get the mainpage title and local namespace map.
    '''
    url = MW_API_URL.format(lang=lang, project=project)
    params = {'action': 'query',
              'meta': 'siteinfo',
              'format': 'json',
              'siprop': 'general|namespaces'}
    resp = urllib2.urlopen(url + urlencode(params))
    data = json.loads(resp.read())
    mainpage = data['query']['general']['mainpage'].replace(' ', '_')
    namespaces = [ns_info['*'].replace(' ', '_') for ns_id, ns_info in
                  data['query']['namespaces'].items() if ns_id is not 0]
    return {'mainpage': mainpage, 'namespaces': namespaces}


  data['query']['namespaces'].items() if ns_id is not 0]


In [11]:
# verify that each article is indeed an article before adding it to the dataframe.

def is_article(title, wiki_info):
    '''\
    Is it an article, or some other sort of page? We'll want to filter out the
    search page (Special:Search in English, etc) and similar pages appearing
    inconveniently in the traffic report.
    Skip xhamster. There are a few clues this Wikipedia traffic is artificial.
    See https://en.wikipedia.org/w/index.php?title=XHamster&diff=701682670&oldid=700826198
    '''
    skip = ['-', '404.php', 'XHamster'] + [wiki_info['mainpage']]
    prefixes = PREFIXES + wiki_info['namespaces']
    if title in skip:
        return False
    if title == "Media":
        return False
    if title == "Wikipedia":
        return False
    if title == "United_States_Senate":
        return False
    for prefix in prefixes:
        if title.startswith(prefix + ':'):
            return False
    return True

## Articles for a specific date

In [12]:
# The call get_traffic(‘2020’, ‘02’, ‘21’) would output a JSON object with the 1000 most viewed Wikpedia articles on the specified day.
# The returned object is a list that contains 1000 rows (dictionaries) with 3 key-value paris each: the article’s title, the number of views, and its “rank” for that day in terms of views.

wiki_info = get_wiki_info('ar', 'wikipedia')
raw_traffic = get_traffic('2023', '03', '20')
articles = [a for a in raw_traffic if is_article(a['article'], wiki_info)]

called the get_traffic function for every day in the year 2020 and concatenated the results into a dataframe. I only saved the 25 most viewed articles in this dataframe, rather than the entire output of get_traffic.

In [13]:
len(articles[:25])
articles

[{'article': 'يوم_الأم', 'views': 73739, 'rank': 1},
 {'article': 'قائمة_مباريات_الكلاسيكو', 'views': 32586, 'rank': 4},
 {'article': 'الكلاسيكو', 'views': 31780, 'rank': 5},
 {'article': 'نادي_برشلونة', 'views': 23127, 'rank': 6},
 {'article': 'نوروز', 'views': 20627, 'rank': 7},
 {'article': 'إعلان_استقلال_تونس', 'views': 19666, 'rank': 8},
 {'article': 'ريال_مدريد', 'views': 19287, 'rank': 9},
 {'article': 'لبنى_محمود', 'views': 13760, 'rank': 10},
 {'article': 'ميكيموتو_كويتشي', 'views': 11887, 'rank': 11},
 {'article': 'الدوري_الإسباني', 'views': 9685, 'rank': 12},
 {'article': 'رمضان', 'views': 9502, 'rank': 13},
 {'article': 'صلاة_الفجر', 'views': 8610, 'rank': 14},
 {'article': 'كليوباترا', 'views': 7673, 'rank': 15},
 {'article': 'كريستيانو_رونالدو', 'views': 7252, 'rank': 16},
 {'article': 'دوري_أبطال_أوروبا', 'views': 6965, 'rank': 17},
 {'article': 'آية_الكرسي', 'views': 6856, 'rank': 18},
 {'article': 'ترجمة', 'views': 6091, 'rank': 19},
 {'article': 'تلوث_المياه', 'views'

## Between two specified dates

In [14]:
date1 = '2022-08-28'
date2 = '2023-08-28'
dates = pd.date_range(date1, date2).tolist()

In [15]:
dates[0].strftime('%Y')

'2022'

In [16]:
print('dates is a', type(dates), 'of', len(dates), 'dates that range from', date1, 'till', date2, 'where each row looks like:', dates[0])

dates is a <class 'list'> of 366 dates that range from 2022-08-28 till 2023-08-28 where each row looks like: 2022-08-28 00:00:00


In [36]:
# array of 25 (or number specified in num_articles_per_day) most popular articles for each day in the dates specified in the range
top_array = []

start_timer = time.time()

# the number of top articles to fethc per day
num_articles_per_day = 26

print('\nFetching', num_articles_per_day, 'articles per day for', len(dates), 'days', 'which accumulates to', num_articles_per_day * len(dates), 'articles \n')

printed_month = 0

for date in dates:
    year=date.strftime('%Y')
    month = date.strftime('%m')
    day = date.strftime('%d')

    if  printed_month != month:
      print('Fetching the top', num_articles_per_day, 'articles for month', date.strftime("%Y %m"))
      printed_month = month
#     wiki_info = get_wiki_info('en', 'wikipedia')

    # raw_traffic is the top 1000 articles for the specified day
    raw_traffic = get_traffic(year, month, day)

    # check if the returned is an article and store articles only
    articles = [a for a in raw_traffic if is_article(a['article'], wiki_info)]

    # add the top 25 (or num_articles_per_day) articles on that day to the final list top_array
    top_array.append(articles[:num_articles_per_day])

end_timer = time.time()

print("\n The operation took", (end_timer - start_timer)/60, 'minutes')


Fetching 26 articles per day for 366 days which accumelates to 9516 articles 

Fetching the top 26 articles for month 2022 08
Fetching the top 26 articles for month 2022 09
Fetching the top 26 articles for month 2022 10
Fetching the top 26 articles for month 2022 11
Fetching the top 26 articles for month 2022 12
Fetching the top 26 articles for month 2023 01
Fetching the top 26 articles for month 2023 02
Fetching the top 26 articles for month 2023 03
Fetching the top 26 articles for month 2023 04
Fetching the top 26 articles for month 2023 05
Fetching the top 26 articles for month 2023 06
Fetching the top 26 articles for month 2023 07
Fetching the top 26 articles for month 2023 08

 The operation took 1.4225083589553833 minutes


In [18]:
print('top_array is a three dimensional', type(top_array), 'of size', len(top_array), 'which represents the number of days from', date1, 'till', date2)

top_array is a three dimensional <class 'list'> of size 366 which represents the number of days from 2022-08-28 till 2023-08-28


In [19]:
print('In that final list top_array, each row is another inner', type(top_array[2]), 'of size', len(top_array[2]), 'which represents the top article titles for that day.')

In that final list top_array, each row is another inner <class 'list'> of size 26 which represents the top article titles for that day.


In [20]:
print('Inside these', len(top_array[2]), 'inner lists are', type(top_array[2][2]), '. These dictionaries has', len(top_array[2][2]), 'key-pair values like:', top_array[2][2])

Inside these 26 inner lists are <class 'dict'> . These dictionaries has 3 key-pair values like: {'article': 'مقتدى_الصدر', 'views': 33450, 'rank': 5}


Converting the 3d list into a pandas dataframe

In [21]:
dates_array = []
views_array = []
articles_array = []
ranks_array = []

for i in range(len(top_array)):
    date = dates[i]
    array = top_array[i]
    for j in range(25):
        row = array[j]
        dates_array.append(date)
        articles_array.append(row['article'].replace('_', ' '))
#         print(articles_array)
        views_array.append(row['views'])
        ranks_array.append(row['rank'])


In [22]:
len(dates_array)
# articles_array

9150

In [23]:
dict = {'date': dates_array, 'article': articles_array, 'views': views_array, 'rank': ranks_array}
df = pd.DataFrame(data=dict)
df

Unnamed: 0,date,article,views,rank
0,2022-08-28,جورج الراسي,49742,2
1,2022-08-28,ميدوسا,24442,4
2,2022-08-28,نادين الراسي,20156,5
3,2022-08-28,ترجمة جوجل,15440,6
4,2022-08-28,نادي برشلونة,14656,7
...,...,...,...,...
9145,2023-08-28,ميا خليفة,5406,23
9146,2023-08-28,جاكسون موليكا,5399,24
9147,2023-08-28,محمد,5296,25
9148,2023-08-28,كريستال (مسلسل),5245,26


## Analyzing the dataframe

In [30]:
# the number of times an article made it to the daily top article list
df[df.article == "محمد"]

Unnamed: 0,date,article,views,rank
194,2022-09-04,محمد,4212,23
214,2022-09-05,محمد,4904,18
238,2022-09-06,محمد,6006,16
262,2022-09-07,محمد,6640,15
514,2022-09-17,محمد,6272,17
...,...,...,...,...
8723,2023-08-11,محمد,4470,28
8823,2023-08-15,محمد,4136,28
8871,2023-08-17,محمد,4135,26
8898,2023-08-18,محمد,4121,28


In [25]:
# Sorting the top articles list by the number of views from highest to lowest
df.sort_values(by="views", ascending=False)

Unnamed: 0,date,article,views,rank
275,2022-09-08,إليزابيث الثانية,488324,1
2100,2022-11-20,كأس العالم 2022,380824,1
300,2022-09-09,إليزابيث الثانية,351975,1
1450,2022-10-25,كسوف الشمس,318111,1
2350,2022-11-30,كأس العالم,299466,1
...,...,...,...,...
8174,2023-07-20,ليونيل ميسي,3440,29
7946,2023-07-11,نادية الجندي,3410,26
7947,2023-07-11,إنستغرام,3391,27
7948,2023-07-11,سورة الأعلى,3380,28


In [39]:
# Combining duplicate titles, calculating the number of times each article made it to the top daily list, and sorting them by that number.
df.groupby('article').count().sort_values(by=['views'], ascending=False)

Unnamed: 0_level_0,date,views,rank
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
كريستيانو رونالدو,322,322,322
يوتيوب,278,278,278
آية الكرسي,242,242,242
نادي برشلونة,213,213,213
صلاة الفجر,208,208,208
...,...,...,...
كارتل سينالوا,1,1,1
جوائز الفيفا للأفضل كرويا 2022,1,1,1
كارلوس ألكاراز (لاعب تنس),1,1,1
كارلوس منعم,1,1,1


In [41]:
df_no_duplicates = df.groupby('article').agg({'date':'first',
                         'views':'max',
                         'rank':'count'}).sort_values(by=['views'], ascending=False)

df_no_duplicates

Unnamed: 0_level_0,date,views,rank
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
إليزابيث الثانية,2022-09-08,488324,22
كأس العالم 2022,2022-09-06,380824,110
كسوف الشمس,2022-10-23,318111,5
كأس العالم,2022-11-02,299466,59
كيليان مبابي,2022-11-26,275608,49
...,...,...,...
ابن بطوطة,2023-08-10,3737,1
الجمهورية العربية الصحراوية الديمقراطية,2022-08-28,3730,1
سوسن علي,2022-12-29,3694,1
حبيب بن مظاهر الأسدي,2023-07-24,3621,1


In [28]:
df.to_csv('wiki_pageviews_jan-jan.csv')

In [43]:
df_no_duplicates.to_csv('df_no_duplicates.csv')