In [1]:
import pandas as pd
import json
from json import loads, dumps

def load_pageviews(path):
    """Load one pageview export and flatten its ``articles`` records.

    Parameters
    ----------
    path : str
        JSON file that ``pd.read_json`` parses into a frame with an
        ``articles`` column.

    Returns
    -------
    pandas.DataFrame
        The ``articles`` records flattened by ``pd.json_normalize``.
    """
    records = pd.read_json(path)['articles']
    return pd.json_normalize(records)


# One frame per access type; the three file names share the same start/end stamp.
df_desktop = load_pageviews('academy_monthly_desktop_start201507-end202310.json')
df_mobile_app = load_pageviews('academy_monthly_mobile_app_start201507-end202310.json')
df_mobile_web = load_pageviews('academy_monthly_mobile_web_start201507-end202310.json')

In [2]:
# Summary statistics for the numeric columns of the desktop frame.
df_desktop.describe()

Unnamed: 0,views
count,128860.0
mean,11449.53
std,37203.01
min,0.0
25%,537.0
50%,2787.0
75%,11881.0
max,3355060.0


In [3]:
# Summary statistics for the numeric columns of the mobile-app frame.
df_mobile_app.describe()

Unnamed: 0,views
count,128860.0
mean,875.145887
std,4109.631416
min,0.0
25%,16.0
50%,144.0
75%,824.0
max,516287.0


In [4]:
# Summary statistics for the numeric columns of the mobile-web frame.
df_mobile_web.describe()

Unnamed: 0,views
count,128860.0
mean,18227.76
std,74766.95
min,0.0
25%,412.0
50%,3001.5
75%,17074.25
max,7736645.0


In [5]:
# Column labels of the mobile-web frame.
df_mobile_web.columns

Index(['project', 'article', 'granularity', 'timestamp', 'access', 'agent',
       'views'],
      dtype='object')

In [6]:
# Preview the first rows of the mobile-web frame.
df_mobile_web.head()

Unnamed: 0,project,article,granularity,timestamp,access,agent,views
0,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020010100,mobile-web,user,2241
1,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020020100,mobile-web,user,4955
2,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020030100,mobile-web,user,4427
3,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020040100,mobile-web,user,9540
4,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020050100,mobile-web,user,7878


In [7]:
# Pair each mobile-app row with the mobile-web row for the same article and
# timestamp so the two view counts can be summed afterwards.
# project, granularity and agent are part of the key too, so they appear once
# in the result instead of being duplicated into _x/_y columns.
# validate='one_to_one' makes the merge raise MergeError if a key is repeated
# on either side, rather than silently multiplying rows and inflating views.
# NOTE(review): this is an inner join, so keys missing from either frame are dropped.

mobile = pd.merge(
    df_mobile_app,
    df_mobile_web,
    how='inner',
    on=['project', 'article', 'timestamp', 'granularity', 'agent'],
    validate='one_to_one',
)
mobile

Unnamed: 0,project,article,granularity,timestamp,access_x,agent,views_x,access_y,views_y
0,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020010100,mobile-app,user,65,mobile-web,2241
1,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020020100,mobile-app,user,152,mobile-web,4955
2,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020030100,mobile-app,user,120,mobile-web,4427
3,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020040100,mobile-app,user,284,mobile-web,9540
4,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020050100,mobile-app,user,231,mobile-web,7878
...,...,...,...,...,...,...,...,...,...
128855,en.wikipedia,Zorba_the_Greek_(film),monthly,2023050100,mobile-app,user,493,mobile-web,8134
128856,en.wikipedia,Zorba_the_Greek_(film),monthly,2023060100,mobile-app,user,484,mobile-web,8282
128857,en.wikipedia,Zorba_the_Greek_(film),monthly,2023070100,mobile-app,user,765,mobile-web,14055
128858,en.wikipedia,Zorba_the_Greek_(film),monthly,2023080100,mobile-app,user,1203,mobile-web,12530


In [8]:
# Total mobile views: left-side (views_x) plus right-side (views_y) counts.
mobile['views'] = mobile['views_x'].add(mobile['views_y'])

In [9]:
# Keep only the first six characters of the left-side access label
# (e.g. 'mobile-app' -> 'mobile') as the label for the combined rows.
mobile['access'] = mobile['access_x'].str[:6]

In [10]:
# Discard the per-source columns; only the combined 'views' and 'access' remain.
mobile = mobile.drop(['access_x', 'access_y', 'views_x', 'views_y'], axis='columns')

In [11]:
# Pair each combined-mobile row with the desktop row for the same article and
# timestamp so the two view counts can be summed afterwards.
# project, granularity and agent are part of the key too, so they appear once
# in the result instead of being duplicated into _x/_y columns.
# validate='one_to_one' makes the merge raise MergeError if a key is repeated
# on either side, rather than silently multiplying rows and inflating views.
# NOTE(review): this is an inner join, so keys missing from either frame are dropped.

cumulative = pd.merge(
    mobile,
    df_desktop,
    how='inner',
    on=['project', 'article', 'timestamp', 'granularity', 'agent'],
    validate='one_to_one',
)
cumulative

Unnamed: 0,project,article,granularity,timestamp,agent,views_x,access_x,access_y,views_y
0,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020010100,user,2306,mobile,desktop,1209
1,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020020100,user,5107,mobile,desktop,2944
2,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020030100,user,4547,mobile,desktop,2612
3,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020040100,user,9824,mobile,desktop,4530
4,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020050100,user,8109,mobile,desktop,3952
...,...,...,...,...,...,...,...,...,...
128855,en.wikipedia,Zorba_the_Greek_(film),monthly,2023050100,user,8627,mobile,desktop,4398
128856,en.wikipedia,Zorba_the_Greek_(film),monthly,2023060100,user,8766,mobile,desktop,3865
128857,en.wikipedia,Zorba_the_Greek_(film),monthly,2023070100,user,14820,mobile,desktop,5919
128858,en.wikipedia,Zorba_the_Greek_(film),monthly,2023080100,user,13733,mobile,desktop,5789


In [12]:
# Total views: left-side (views_x) plus right-side (views_y) counts.
cumulative['views'] = cumulative['views_x'].add(cumulative['views_y'])

In [13]:
# Build the combined access label '<access_x>/<access_y>' with the vectorized
# string accessor instead of a Python-level join over every row.
cumulative['access'] = cumulative['access_x'].str.cat(cumulative['access_y'], sep='/')

In [14]:
# Discard the per-source columns, then display the combined frame.
cumulative = cumulative.drop(['access_x', 'access_y', 'views_x', 'views_y'], axis='columns')
cumulative

Unnamed: 0,project,article,granularity,timestamp,agent,views,access
0,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020010100,user,3515,mobile/desktop
1,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020020100,user,8051,mobile/desktop
2,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020030100,user,7159,mobile/desktop
3,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020040100,user,14354,mobile/desktop
4,en.wikipedia,Everything_Everywhere_All_at_Once,monthly,2020050100,user,12061,mobile/desktop
...,...,...,...,...,...,...,...
128855,en.wikipedia,Zorba_the_Greek_(film),monthly,2023050100,user,13025,mobile/desktop
128856,en.wikipedia,Zorba_the_Greek_(film),monthly,2023060100,user,12631,mobile/desktop
128857,en.wikipedia,Zorba_the_Greek_(film),monthly,2023070100,user,20739,mobile/desktop
128858,en.wikipedia,Zorba_the_Greek_(film),monthly,2023080100,user,19522,mobile/desktop


In [15]:
# Write the combined mobile data (mobile-app + mobile-web) to disk.
# to_json -> loads turns the frame into plain Python records so that
# json.dump can pretty-print them with indentation.

out_mobile = mobile.to_json(orient="records")
parsed_mobile = loads(out_mobile)

# Mode "w", not "a": re-running this cell must replace the file. Appending a
# second array to an existing file would leave it as invalid JSON.
with open("academy_monthly_mobile_start201507-end202310.json", "w") as outfile:
    json.dump(parsed_mobile, outfile, indent=4)

In [16]:
# Write the combined mobile + desktop data to disk.
# to_json -> loads turns the frame into plain Python records so that
# json.dump can pretty-print them with indentation.

out_cumulative = cumulative.to_json(orient="records")
parsed_cumulative = loads(out_cumulative)

# Mode "w", not "a": re-running this cell must replace the file. Appending a
# second array to an existing file would leave it as invalid JSON.
with open("academy_monthly_cumulative_start201507-end202310.json", "w") as outfile:
    json.dump(parsed_cumulative, outfile, indent=4)