In [304]:
import pandas as pd
import numpy as np
import json
import requests
import plotly

In [305]:
# Defining the API endpoints that will used to collect the necessary data
endpoint_legacy = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
endpoint_pageviews = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

In [306]:
# Defining parameters for getting aggregated pagecounts desktop data
pagecounts_desktop = {"project" : "en.wikipedia.org",
                 "access-site" : "desktop-site",
                 "granularity" : "monthly",
                 "start" : "2007120100",
                # for end use 1st day of month following final month of data
                 "end" : "2016080100"
                    }



In [307]:
# Defining parameters for getting aggregated pagecounts mobile data
pagecounts_mobile = {"project" : "en.wikipedia.org",
                 "access-site" : "mobile-site",
                 "granularity" : "monthly",
                 "start" : "2007120100",
                # for end use 1st day of month following final month of data
                 "end" : "2016080100"
                    }

In [308]:
# Defining parameters for getting aggregated pageviews desktop data
pageviews_desktop = {"project" : "en.wikipedia.org",
                    "access" : "desktop",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015070100",
                    # for end use 1st day of month following final month of data
                    "end" : '2021101000'
                        }



In [309]:
# Defining parameters for getting aggregated pageviews mobile app data
pageviews_mobile_app = {"project" : "en.wikipedia.org",
                    "access" : "mobile-app",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015070100",
                    # for end use 1st day of month following final month of data
                    "end" : '2021101000'
                        }

In [310]:
# Defining parameters for getting aggregated pageviews mobile web data
pageviews_mobile_web = {"project" : "en.wikipedia.org",
                    "access" : "mobile-web",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015070100",
                    # for end use 1st day of month following final month of data
                    "end" : '2021101000'
                        }

In [311]:
# Customize the header with your own information
headers = {
    'User-Agent': 'https://github.com/leenaelamrawy',
    'From': 'lelamraw@uw.edu'
}

In [312]:
# API call function that retrieves data from endpoints

def api_call(endpoint,parameters):
    call = requests.get(endpoint.format(**parameters), headers=headers)
    response = call.json()
    
    return response

In [313]:
#Extracting data from api and saving it to a dictionary
dict_pagecounts_desktop = api_call(endpoint_legacy, pagecounts_desktop)
dict_pagecounts_mobile = api_call(endpoint_legacy, pagecounts_mobile)
dict_pageviews_desktop = api_call(endpoint_pageviews, pageviews_desktop)
dict_pageviews_mobile_app = api_call(endpoint_pageviews, pageviews_mobile_app)
dict_pageviews_mobile_web = api_call(endpoint_pageviews, pageviews_mobile_web)

In [314]:
# Saving each dictionary to a JSON file

    
with open("pagecounts_desktop-site_200712-201608.json", "w") as outfile:
    json.dump(dict_pagecounts_desktop, outfile)
    
    
    
with open("pagecounts_mobile-site_200712-201608.json", "w") as outfile:
    json.dump(dict_pagecounts_mobile, outfile)
    
    
with open("pageviews_desktop_201507-202108.json", "w") as outfile:
    json.dump(dict_pageviews_desktop, outfile)
    
    
with open("pageviews_mobile_app_201507-202108.json", "w") as outfile:
    json.dump(dict_pageviews_mobile_app, outfile)
    
    
with open("pageviews_mobile_web_201507-202108.json", "w") as outfile:
    json.dump(dict_pageviews_mobile_web, outfile)


In [341]:
# Converting pagecounts dictionaries to dataframes for easier analysis


df_pagecounts_desktop_site = pd.DataFrame.from_dict(dict_pagecounts_desktop)
df_pagecounts_desktop_site = pd.json_normalize(df_pagecounts_desktop_site['items'])
df_pagecounts_desktop_site = df_pagecounts_desktop_site.rename(columns={'count': 'pagecount_desktop_views'})

df_pagecounts_mobile_site = pd.DataFrame.from_dict(dict_pagecounts_mobile)
df_pagecounts_mobile_site = pd.json_normalize(df_pagecounts_mobile_site['items'])
df_pagecounts_mobile_site = df_pagecounts_mobile_site.rename(columns={'count': 'pagecount_mobile_views'})

# Combining pagecounts dictionaries into a single dataframe called pagecount_combined_df
# Filling NAN values with 0
# Calculating the sum of all pagecounts and adding that as a column to the dataframe
pagecount_combined_df = pd.merge(df_pagecounts_mobile_site, df_pagecounts_desktop_site,on = 'timestamp', how = 'outer')[['timestamp', 'pagecount_mobile_views','pagecount_desktop_views' ]]
pagecount_combined_df['pagecount_mobile_views'] = pagecount_combined_df['pagecount_mobile_views'].fillna(0)
pagecount_combined_df['pagecount_all_views'] = pagecount_combined_df['pagecount_desktop_views'] + pagecount_combined_df['pagecount_mobile_views']
pagecount_combined_df.head()


Unnamed: 0,timestamp,pagecount_mobile_views,pagecount_desktop_views,pagecount_all_views
0,2014100100,3091547000.0,6577533128,9669080000.0
1,2014110100,3027490000.0,6153537606,9181027000.0
2,2014120100,3278950000.0,5830332248,9109282000.0
3,2015010100,3485302000.0,6103767055,9589069000.0
4,2015020100,3091534000.0,5602710439,8694245000.0


In [342]:
# Converting pageviews desktop dictionary to a dataframe
df_pageviews_desktop = pd.DataFrame.from_dict(dict_pageviews_desktop)
df_pageviews_desktop = pd.json_normalize(df_pageviews_desktop['items'])
df_pageviews_desktop = df_pageviews_desktop.rename(columns = {'views':'pageview_desktop_views'})[['timestamp','pageview_desktop_views']]
df_pageviews_desktop.head()

Unnamed: 0,timestamp,pageview_desktop_views
0,2015070100,4376666686
1,2015080100,4332482183
2,2015090100,4485491704
3,2015100100,4477532755
4,2015110100,4287720220


In [343]:
# Converting remaining pageview dictionaries to dataframes
df_pageviews_mobile_app = pd.DataFrame.from_dict(dict_pageviews_mobile_app)
df_pageviews_mobile_app = pd.json_normalize(df_pageviews_mobile_app['items'])

df_pageviews_mobile_web = pd.DataFrame.from_dict(dict_pageviews_mobile_web)
df_pageviews_mobile_web = pd.json_normalize(df_pageviews_mobile_web['items'])

print(df_pageviews_mobile_app.head())
print(df_pageviews_mobile_web.head())


        project      access agent granularity   timestamp      views
0  en.wikipedia  mobile-app  user     monthly  2015070100  109624146
1  en.wikipedia  mobile-app  user     monthly  2015080100  109669149
2  en.wikipedia  mobile-app  user     monthly  2015090100   96221684
3  en.wikipedia  mobile-app  user     monthly  2015100100   94523777
4  en.wikipedia  mobile-app  user     monthly  2015110100   94353925
        project      access agent granularity   timestamp       views
0  en.wikipedia  mobile-web  user     monthly  2015070100  3179131148
1  en.wikipedia  mobile-web  user     monthly  2015080100  3192663889
2  en.wikipedia  mobile-web  user     monthly  2015090100  3073981649
3  en.wikipedia  mobile-web  user     monthly  2015100100  3173975355
4  en.wikipedia  mobile-web  user     monthly  2015110100  3142247145


In [344]:
# Getting the sum of mobile views for pageviews and adding that as a column
df_pageviews_all_mobile =pd.concat([df_pageviews_mobile_web,df_pageviews_mobile_app])
monthly_all_mobile_pageviews = df_pageviews_all_mobile.groupby('timestamp')['views'].sum().to_frame().reset_index()
monthly_all_mobile_pageviews = monthly_all_mobile_pageviews.rename(columns = {'views' : 'pageview_mobile_views'})
monthly_all_mobile_pageviews.head()

Unnamed: 0,timestamp,pageview_mobile_views
0,2015070100,3288755294
1,2015080100,3302333038
2,2015090100,3170203333
3,2015100100,3268499132
4,2015110100,3236601070


In [321]:
# Combining al pageview dataframes into 1
pageviews_all_views = pd.merge(monthly_all_mobile_pageviews, df_pageviews_desktop,on = 'timestamp')
pageviews_all_views['pageview_all_views'] = pageviews_all_views['pageview_mobile_views'] + pageviews_all_views['pageview_desktop_views']
pageviews_all_views = pageviews_all_views[['timestamp', 'pageview_mobile_views','pageview_desktop_views','pageview_all_views' ]]
pageviews_all_views


Unnamed: 0,timestamp,pageview_mobile_views,pageview_desktop_views,pageview_all_views
0,2015070100,3288755294,4376666686,7665421980
1,2015080100,3302333038,4332482183,7634815221
2,2015090100,3170203333,4485491704,7655695037
3,2015100100,3268499132,4477532755,7746031887
4,2015110100,3236601070,4287720220,7524321290
...,...,...,...,...
70,2021050100,4976579558,2824416177,7800995735
71,2021060100,4584510417,2505971366,7090481783
72,2021070100,4778909421,2765584368,7544493789
73,2021080100,4732194000,2763413934,7495607934


In [345]:
# Functions to extract the year and month from timestamp
def get_year(timestamp):
    return timestamp[:4]

def get_month(timestamp):
    return timestamp[4:6]


In [323]:
# Adding year and month columns to the dataframe

pageviews_all_views['year'] = pageviews_all_views['timestamp'].apply(lambda x: get_year(x))
pageviews_all_views['month'] = pageviews_all_views['timestamp'].apply(lambda x: get_month(x))
pagecount_combined_df['year'] = pagecount_combined_df['timestamp'].apply(lambda x: get_year(x))
pagecount_combined_df['month'] = pagecount_combined_df['timestamp'].apply(lambda x: get_month(x))
pagecount_combined_df = pagecount_combined_df.sort_values(by = 'timestamp', ascending = True)

In [327]:
# Creating the final dataframe that combines pagecount_combined_df and pageviews_all_views
# cleaning the data/NAN values
# exporting to dataframe to csv
final_df = pd.merge(pagecount_combined_df,pageviews_all_views, on = 'timestamp', how = 'outer')
final_df['year_x'] = final_df.year_x.fillna(final_df.year_y)
final_df['month_x'] = final_df.month_x.fillna(final_df.month_y)
final_df = final_df.rename(columns = {'year_x':'year', 'month_x':'month'})
final_df = final_df[['year', 'month', 'pagecount_all_views', 'pagecount_desktop_views','pagecount_mobile_views', 'pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views']]
final_df = final_df.fillna(0)
final_df.to_csv("en-wikipedia_traffic_200712-202108.csv")

In [346]:
# Creating a datetime object using the year and month columns to be used in timeseries analysis
final_df['Date'] = final_df['year'] + '-' +final_df['month']
final_df['Date'] = pd.to_datetime(final_df['Date']).dt.strftime('%m-%Y')
final_df.head()

Unnamed: 0,year,month,pagecount_all_views,pagecount_desktop_views,pagecount_mobile_views,pageview_all_views,pageview_desktop_views,pageview_mobile_views,Date
0,2007,12,2998332000.0,2998332000.0,0.0,0.0,0.0,0.0,12-2007
1,2008,1,4930903000.0,4930903000.0,0.0,0.0,0.0,0.0,01-2008
2,2008,2,4818394000.0,4818394000.0,0.0,0.0,0.0,0.0,02-2008
3,2008,3,4955406000.0,4955406000.0,0.0,0.0,0.0,0.0,03-2008
4,2008,4,5159162000.0,5159162000.0,0.0,0.0,0.0,0.0,04-2008


In [340]:
# Using Plotly library to visualize the results
import plotly.express as px
fig = px.line(final_df, x="Date", y=final_df.columns[2:-1],
              title='Monthly Traffic on English Wikipedia December 1st 2008- August 30th 2021')
fig.update_layout(yaxis_title='Count')

fig.show()