In [1]:
import ibis
import pandas as pd

## Default Row Limits
It is possible to turn on interactive mode, which automatically executs ibis expressions. By default, ibis limits result sets returned to the local process to 10,000 rows. If you know you require >10000 rows returned, be careful to change the default limit.

## Interactive Mode
Ibis also allows and interactive mode that automatically executes all expressions. This can be useful in a notebook or repl. I personally prefer to epxlicitly execute expresssions, but this is a personal preference.  If you use the interactive mode, I recommnd setting the defaultlimit low to prevent accidentally trying to return an unreasonable number of rows to your local process.

In [2]:
ibis.options.sql.default_limit = None

hdfs_conn = ibis.hdfs_connect(host='bottou03.sjc.cloudera.com')

ibis_conn = ibis.impala.connect(host='bottou01.sjc.cloudera.com',
                                    port=21050,
                                    hdfs_client=hdfs_conn)

In [3]:
pageviews_tbl = ibis_conn.table('wiki_pageviews', database='u_juliet')

What is in a project name? What does this data look like?

In [4]:
project_names_expr = pageviews_tbl.project_name.distinct()
project_names = ibis_conn.execute(project_names_expr)
project_names

0          iu.m.d
1       he.zero.q
2          br.m.q
3          co.m.d
4          zero.f
5          ru.m.b
6             ltg
7       bh.zero.d
8          lv.m.b
9              am
10         vi.m.d
11           ay.d
12           tk.d
13           cr.q
14           is.d
15      iu.zero.d
16           uk.s
17        dv.zero
18        en.zero
19      tl.zero.d
20           kl.d
21           ln.b
22           wa.b
23           da.q
24         sv.m.n
25           la.s
26           sk.s
27             tt
28           ko.q
29             es
          ...    
2367           an
2368           pi
2369       fo.m.s
2370      ha.zero
2371       uz.m.b
2372    kn.zero.s
2373    es.zero.v
2374        sco.m
2375         ks.m
2376       pt.m.q
2377         cy.m
2378       az.m.b
2379       Amanhã
2380          21º
2381          pnb
2382        wg-en
2383         pt.b
2384           be
2385    vi.zero.q
2386       qu.m.q
2387         wo.q
2388         ar.d
2389       uk.m.b
2390    wa.zero.d
2391      

Maybe we can understand this by finding the projects with the most pages. Let's group by porject name and then count the size of the groups.

In [5]:
project_page_counts = pageviews_tbl.group_by(pageviews_tbl.project_name)\
                                   .size()\
                                   .sort_by(('count', False))
project_names = ibis_conn.execute(project_page_counts)

To find something interesting, it'll help to understand the language. 

In [6]:
[name for name in project_names if 'en' in name]

[]

The part of the project name after the '.' specifies a special type of wiki. Let's just look at the standard wiki pages (ie, not media-wiki) that are also written in English.

In [7]:
ibis_conn.execute(pageviews_tbl[pageviews_tbl.project_name == 'en'].limit(10))

Unnamed: 0,project_name,page_name,n_views,n_bytes,day,hour,month,year
0,en,!,1,0,31,6,12,2015
1,en,!!!,4,0,31,6,12,2015
2,en,!!!Fuck_You!!!,1,0,31,6,12,2015
3,en,!Kung_people,2,0,31,6,12,2015
4,en,!Que_viva_la_musica!,1,0,31,6,12,2015
5,en,!T.O.O.H.!,1,0,31,6,12,2015
6,en,!Women_Art_Revolution,1,0,31,6,12,2015
7,en,!_(album),1,0,31,6,12,2015
8,en,!_(disambiguation),1,0,31,6,12,2015
9,en,"""",1,0,31,6,12,2015


Project_name is homogenous in this dataset, so lets just take the projection of all other columns.

In [8]:
en_pageviews = pageviews_tbl[pageviews_tbl.project_name == 'en'].projection(['page_name',
                                                                              'n_views',
                                                                             'n_bytes',
                                                                             'day',
                                                                             'hour',
                                                                             'month',
                                                                             'year'])

In [9]:
ibis_conn.execute(en_pageviews.limit(10))

Unnamed: 0,page_name,n_views,n_bytes,day,hour,month,year
0,!,1,0,2,5,1,2016
1,!!,1,0,2,5,1,2016
2,!!!,8,0,2,5,1,2016
3,!!!Fuck_You!!!,1,0,2,5,1,2016
4,!Action_Pact!,1,0,2,5,1,2016
5,!Arriba!_La_Pachanga,1,0,2,5,1,2016
6,!Hero,1,0,2,5,1,2016
7,!Kung,1,0,2,5,1,2016
8,!Kung_people,1,0,2,5,1,2016
9,!Oka_Tokat,1,0,2,5,1,2016


It seems that we should exclude these pages with no names, and NaN counts. (With big data sets, you will find all
types of messed up data.)

In [10]:
top_10_pg_views_hourly = en_pageviews.sort_by((en_pageviews.n_views, False)).limit(10)
ibis_conn.execute(top_10_pg_views_hourly)

Unnamed: 0,page_name,n_views,n_bytes,day,hour,month,year
0,,,,31,1,12,2015
1,http://example.com/?,,,31,1,12,2015
2,,,,31,1,12,2015
3,,,,31,1,12,2015
4,,,,31,1,12,2015
5,Special:Export/,1936066.0,0.0,31,8,12,2015
6,Special:Export/,1121921.0,0.0,31,9,12,2015
7,Main_Page,724796.0,0.0,2,9,1,2016
8,Main_Page,707561.0,0.0,2,7,1,2016
9,Main_Page,655542.0,0.0,2,8,1,2016


In [11]:
null_pg_views = en_pageviews[en_pageviews.n_views.isnull()]

In [12]:
ibis_conn.execute(null_pg_views)

Unnamed: 0,page_name,n_views,n_bytes,day,hour,month,year
0,,,,31,1,12,2015
1,,,,31,1,12,2015
2,,,,31,1,12,2015
3,,,,31,1,12,2015
4,http://example.com/?,,,31,1,12,2015


In [13]:
nn_pg_views = en_pageviews[en_pageviews.n_views.notnull()]

What are the top ten page in this series that 

In [14]:
ibis_conn.execute(nn_pg_views.sort_by((nn_pg_views.n_views, False)).limit(10))

Unnamed: 0,page_name,n_views,n_bytes,day,hour,month,year
0,Special:Export/,1936066,0,31,8,12,2015
1,Special:Export/,1121921,0,31,9,12,2015
2,Main_Page,724796,0,2,9,1,2016
3,Main_Page,707561,0,2,7,1,2016
4,Main_Page,655542,0,2,8,1,2016
5,Main_Page,650881,0,3,7,1,2016
6,Main_Page,649290,0,3,8,1,2016
7,Main_Page,647347,0,2,6,1,2016
8,Main_Page,645168,0,30,9,12,2015
9,Main_Page,641259,0,30,8,12,2015


hangover, brands of champagne, mew years traditions, time differences, international datetime,

In [15]:
champagne_df = ibis_conn.execute(nn_pg_views[nn_pg_views.page_name.lower() == 'champagne'])

In [16]:
champagne_df.sort(['day', 'hour'])

  if __name__ == '__main__':


Unnamed: 0,page_name,n_views,n_bytes,day,hour,month,year
21,Champagne,223,0,01,00,01,2016
52,Champagne,176,0,01,01,01,2016
35,Champagne,186,0,01,02,01,2016
25,Champagne,162,0,01,03,01,2016
1,Champagne,147,0,01,04,01,2016
7,Champagne,150,0,01,05,01,2016
3,Champagne,226,0,01,06,01,2016
24,Champagne,210,0,01,07,01,2016
5,Champagne,113,0,01,08,01,2016
28,Champagne,117,0,01,09,01,2016


In [17]:
champagne_df['time'] = pd.to_datetime(champagne_df[['year', 'month', 'day', 'hour']])

In [18]:
champagne_df[['n_views', 'time']].plot()

ImportError: No module named matplotlib.pyplot

In [None]:
w_daily_views = nn_pg_views.group_by(['page_name', 'month', 'day']).aggregate(daily_views=nn_pg_views.n_views.sum())

ibis_conn.execute(w_daily_views.sort_by((w_daily_views.daily_views, False)).limit(10))

In [None]:
tot_view = nn_pg_views.group_by('page_name').aggregate(all_views=nn_pg_views.n_views.sum())
ibis_conn.execute(tot_view.sort_by((tot_view.all_views, False)).limit(30))