In [1]:
import json
import requests

In [2]:
endpoint_legacy = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'

endpoint_pageviews = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

In [3]:
params_legacy_desktop = {"project" : "en.wikipedia.org",
                 "access-site" : "desktop-site",
                 "granularity" : "monthly",
                 "start" : "2008010100",
                 "end" : "2016080100"
                    }

params_legacy_mobile = {"project" : "en.wikipedia.org",
                 "access-site" : "mobile-site",
                 "granularity" : "monthly",
                 "start" : "2008010100",
                 "end" : "2016080100"
                    }

params_pageviews_desktop = {"project" : "en.wikipedia.org",
                    "access" : "desktop",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015070100",
                    "end" : '2021100100'
                        }

params_pageviews_mobile_web = {"project" : "en.wikipedia.org",
                    "access" : "mobile-web",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015070100",
                    "end" : '2021100100'
                        }

params_pageviews_mobile_app = {"project" : "en.wikipedia.org",
                    "access" : "mobile-app",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015070100",
                    "end" : '2021100100'
                        }

# Customize these with your own information
headers = {
    'User-Agent': 'https://github.com/laurathriftwood',
    'From': 'lwood3@uw.edu'
}

In [4]:
def api_call(endpoint,parameters):
    call = requests.get(endpoint.format(**parameters), headers=headers)
    response = call.json()
    return response

In [5]:
def write_json(output, file_name):
    print(file_name)
    with open(file_name, 'w+') as outfile:
        json.dump(output, outfile)
        print(" written successfully")

In [6]:
# Legacy PageCounts API calls

# desktop traffic
monthly_legacy_desktop = api_call(endpoint_legacy, params_legacy_desktop)

# mobile traffic
monthly_legacy_mobile = api_call(endpoint_legacy, params_legacy_mobile)

In [7]:
# Write Legacy PageCounts to file
write_json(monthly_legacy_desktop, 'pagecounts_desktop_200801-201607.json')
write_json(monthly_legacy_mobile, 'pagecounts_mobile_200801-201607.json')

pagecounts_desktop_200801-201607.json
 written successfully
pagecounts_mobile_200801-201607.json
 written successfully


In [8]:
# PageViews API calls

# desktop traffic
monthly_pageviews_desktop = api_call(endpoint_pageviews, params_pageviews_desktop)

# mobile-web traffic
monthly_pageviews_mobile_web = api_call(endpoint_pageviews, params_pageviews_mobile_web)

# mobile-app traffic
monthly_pageviews_mobile_app = api_call(endpoint_pageviews, params_pageviews_mobile_app)

In [9]:
# Write PageViews to file
write_json(monthly_pageviews_desktop, 'pageviews_desktop_201507-202109.json')
write_json(monthly_pageviews_mobile_web, 'pageviews_mobile-web_201507-202109.json')
write_json(monthly_pageviews_mobile_app, 'pageviews_mobile-app_201507-202109.json')

pageviews_desktop_201507-202109.json
 written successfully
pageviews_mobile-web_201507-202109.json
 written successfully
pageviews_mobile-app_201507-202109.json
 written successfully


In [10]:
import pandas as pd
import math

In [11]:
#read in Legacy data from json files
df_monthly_legacy_desktop = pd.read_json('pagecounts_desktop_200801-201607.json')
df_monthly_legacy_mobile = pd.read_json('pagecounts_mobile_200801-201607.json')

In [12]:
#unnest Legacy data into dataframe
df_monthly_legacy_desktop = pd.json_normalize(df_monthly_legacy_desktop['items'])
df_monthly_legacy_mobile = pd.json_normalize(df_monthly_legacy_mobile['items'])

In [13]:
#read in PageViews data from json files
df_monthly_pageviews_desktop = pd.read_json('pageviews_desktop_201507-202109.json')
df_monthly_pageviews_mobile_web = pd.read_json('pageviews_mobile-web_201507-202109.json')
df_monthly_pageviews_mobile_app = pd.read_json('pageviews_mobile-app_201507-202109.json')

In [14]:
#unnest PageViews data into dataframe
df_monthly_pageviews_desktop = pd.json_normalize(df_monthly_pageviews_desktop['items'])
df_monthly_pageviews_mobile_web = pd.json_normalize(df_monthly_pageviews_mobile_web['items'])
df_monthly_pageviews_mobile_app = pd.json_normalize(df_monthly_pageviews_mobile_app['items'])

In [15]:
#drop unnecessary columns from Legacy data

In [16]:
legacy_columns = ['project', 'granularity']

In [17]:
df_monthly_legacy_desktop = df_monthly_legacy_desktop.drop(columns = legacy_columns, axis=1)

In [18]:
df_monthly_legacy_mobile = df_monthly_legacy_mobile.drop(columns = legacy_columns, axis=1)

In [19]:
#rename legacy headers
df_monthly_legacy_desktop = df_monthly_legacy_desktop.rename(columns={'access-site': 'access', 'count': 'num_views'})

In [20]:
#rename legacy headers
df_monthly_legacy_mobile = df_monthly_legacy_mobile.rename(columns={'access-site': 'access', 'count': 'num_views'})

In [21]:
df_monthly_legacy_desktop

Unnamed: 0,access,timestamp,num_views
0,desktop-site,2008010100,4930902570
1,desktop-site,2008020100,4818393763
2,desktop-site,2008030100,4955405809
3,desktop-site,2008040100,5159162183
4,desktop-site,2008050100,5584691092
...,...,...,...
98,desktop-site,2016030100,5407676056
99,desktop-site,2016040100,5572235399
100,desktop-site,2016050100,5330532334
101,desktop-site,2016060100,4975092447


In [62]:
#rename access for legacy desktop views
df_monthly_legacy_desktop = df_monthly_legacy_desktop.replace(to_replace = ['desktop-site'], value = 'pagecount_desktop_views')

In [61]:
#rename access for legacy mobile views
df_monthly_legacy_mobile = df_monthly_legacy_mobile.replace(to_replace = ['mobile-site'], value = 'pagecount_mobile_views')

In [24]:
#drop unnecessary columns from desktop PageViews
df_monthly_pageviews_desktop = df_monthly_pageviews_desktop.drop(df_monthly_pageviews_desktop.columns[[0, 2, 3]], axis=1)

In [25]:
#rename access for pageviews desktop
df_monthly_pageviews_desktop = df_monthly_pageviews_desktop.replace(to_replace = ['desktop'], value = 'pageview_desktop_views')

In [26]:
#rename column header
df_monthly_pageviews_desktop = df_monthly_pageviews_desktop.rename(columns={'views': 'num_views'})

In [28]:
#For data collected from the Pageviews API, combine the monthly values for
# mobile-app and mobile-web to create a total mobile traffic count for each month.

In [29]:
#merge the two PageViews mobile using inner join
merged = pd.merge(left=df_monthly_pageviews_mobile_web, 
                       right=df_monthly_pageviews_mobile_app,
                       left_on='timestamp', 
                       right_on='timestamp')

In [30]:
#sum the views for the two mobile access types
merged['num_views'] = merged['views_x'] + merged['views_y']

In [31]:
#add a column for access type for combined views
merged['access'] = 'pageview_mobile_views'

In [32]:
df_monthly_pageviews_mobile_combined = merged.copy()

In [33]:
#drop unnecessary columns from combined mobile PageViews
df_monthly_pageviews_mobile_combined = merged.drop(merged.columns[[0, 1, 2, 3, 5, 6, 7, 8, 9, 10]], 
                                                                                 axis=1)

In [70]:
print(df_monthly_legacy_desktop.shape)
print(df_monthly_legacy_mobile.shape)
print(df_monthly_pageviews_desktop.shape)
print(df_monthly_pageviews_mobile_combined.shape)

(103, 3)
(22, 3)
(75, 3)
(75, 3)


In [75]:
df_monthly_legacy_desktop

Unnamed: 0,access,timestamp,num_views
0,pagecount_desktop_views,2008010100,4930902570
1,pagecount_desktop_views,2008020100,4818393763
2,pagecount_desktop_views,2008030100,4955405809
3,pagecount_desktop_views,2008040100,5159162183
4,pagecount_desktop_views,2008050100,5584691092
...,...,...,...
98,pagecount_desktop_views,2016030100,5407676056
99,pagecount_desktop_views,2016040100,5572235399
100,pagecount_desktop_views,2016050100,5330532334
101,pagecount_desktop_views,2016060100,4975092447


In [76]:
df_monthly_legacy_mobile

Unnamed: 0,access,timestamp,num_views
0,pagecount_mobile_views,2014100100,3091546685
1,pagecount_mobile_views,2014110100,3027489668
2,pagecount_mobile_views,2014120100,3278950021
3,pagecount_mobile_views,2015010100,3485302091
4,pagecount_mobile_views,2015020100,3091534479
5,pagecount_mobile_views,2015030100,3330832588
6,pagecount_mobile_views,2015040100,3222089917
7,pagecount_mobile_views,2015050100,3334069483
8,pagecount_mobile_views,2015060100,3038162463
9,pagecount_mobile_views,2015070100,3254472695


In [35]:
#For all data, separate the value of timestamp into four-digit year (YYYY) and two-digit
#month (MM) and discard values for day and hour (DDHH).

In [71]:
#combine all dataframes

In [72]:
#combine all dataframes
all_data = [df_monthly_legacy_desktop, 
            df_monthly_legacy_mobile, 
            df_monthly_pageviews_desktop, 
            df_monthly_pageviews_mobile_combined]

In [77]:
all_data_concat = pd.concat(all_data, axis=1, keys=['pagecount_desktop_views', 
                                                    'pagecount_mobile_views', 
                                                   'pageview_desktop_views', 
                                                   'pageview_mobile_views'])

In [78]:
all_data_concat

Unnamed: 0_level_0,pagecount_desktop_views,pagecount_desktop_views,pagecount_desktop_views,pagecount_mobile_views,pagecount_mobile_views,pagecount_mobile_views,pageview_desktop_views,pageview_desktop_views,pageview_desktop_views,pageview_mobile_views,pageview_mobile_views,pageview_mobile_views
Unnamed: 0_level_1,access,timestamp,num_views,access,timestamp,num_views,access,timestamp,num_views,timestamp,num_views,access
0,pagecount_desktop_views,2008010100,4930902570,pagecount_mobile_views,2014100100,3.091547e+09,pageview_desktop_views,2015070100,4.376667e+09,2015070100,3.288755e+09,pageview_mobile_views
1,pagecount_desktop_views,2008020100,4818393763,pagecount_mobile_views,2014110100,3.027490e+09,pageview_desktop_views,2015080100,4.332482e+09,2015080100,3.302333e+09,pageview_mobile_views
2,pagecount_desktop_views,2008030100,4955405809,pagecount_mobile_views,2014120100,3.278950e+09,pageview_desktop_views,2015090100,4.485492e+09,2015090100,3.170203e+09,pageview_mobile_views
3,pagecount_desktop_views,2008040100,5159162183,pagecount_mobile_views,2015010100,3.485302e+09,pageview_desktop_views,2015100100,4.477533e+09,2015100100,3.268499e+09,pageview_mobile_views
4,pagecount_desktop_views,2008050100,5584691092,pagecount_mobile_views,2015020100,3.091534e+09,pageview_desktop_views,2015110100,4.287720e+09,2015110100,3.236601e+09,pageview_mobile_views
...,...,...,...,...,...,...,...,...,...,...,...,...
98,pagecount_desktop_views,2016030100,5407676056,,,,,,,,,
99,pagecount_desktop_views,2016040100,5572235399,,,,,,,,,
100,pagecount_desktop_views,2016050100,5330532334,,,,,,,,,
101,pagecount_desktop_views,2016060100,4975092447,,,,,,,,,


In [37]:
#df_monthly_pageviews_mobile_combined['timestamp'] = df_monthly_pageviews_mobile_combined['timestamp'].astype(str)

In [38]:
#df_monthly_pageviews_mobile_combined['year'] = df_monthly_pageviews_mobile_combined['timestamp'].str[0:4]
#df_monthly_pageviews_mobile_combined['month'] = df_monthly_pageviews_mobile_combined['timestamp'].str[4:6]

In [39]:
#df_monthly_pageviews_mobile_combined

In [40]:
#columns = ['project', 'access', 'agent', 'granularity']
#columns = ['project', 'agent', 'granularity']

In [41]:
#df_monthly_pageviews_mobile_web = df_monthly_pageviews_mobile_web.drop(columns = columns)

In [42]:
#df_monthly_pageviews_mobile_app = df_monthly_pageviews_mobile_app.drop(columns = columns)

In [43]:
#pageviews_df = [df_monthly_pageviews_mobile_web, df_monthly_pageviews_mobile_app]

In [44]:
#pageviews_df1 = pd.concat(pageviews_df, axis=1, keys=['timestamp'])

In [45]:
#pageviews_combined = df_monthly_pageviews_mobile_web[['access', 'granularity', 'timestamp', 'views']].copy()

In [46]:
#data1 = [df_monthly_pageviews_mobile_web['views'], df_monthly_pageviews_mobile_app['views']]

In [47]:
#headers = ['mobile_web', 'mobile_app']

In [48]:
#df_combined_mobile = pd.concat(data1, axis=1, keys=headers)

In [49]:
#df_combined_mobile

In [50]:
#merged_left = pd.merge(left=df_monthly_pageviews_mobile_web, right=df_monthly_pageviews_mobile_app, how='left', left_on='timestamp', right_on='timestamp')

In [51]:
#merged_left

In [52]:
#merged_left['pageview_mobile_views'] = merged_left['views_x'] + merged_left['views_y']

In [53]:
#merged_left

In [54]:
#result1 = result.replace(to_replace = ['mobile-web', 'mobile-app'], value = 'page_view_mobile_views')

In [55]:
#result2 = result1.groupby('timestamp')['views'].sum()

In [56]:
#df_mobile_web = df_monthly_pageviews_mobile_web.copy()

In [57]:
#df_pageviews_mobile = [df_monthly_pageviews_mobile_web, df_monthly_pageviews_mobile_app]

In [58]:
#df_pageviews_mobile = pd.concat(df_pageviews_mobile, axis=1)

In [59]:
#columns = ['project', 'access', 'agent', 'granularity']

In [60]:
#df_pageviews_mobile.drop(columns = columns)