In [25]:
import ibis
import pandas as pd

## Default Row Limits
It is possible to turn on interactive mode, which automatically executes ibis expressions. By default, ibis limits result sets returned to the local process to 10,000 rows. If you know you need more than 10,000 rows returned, be sure to change the default limit.

## Interactive Mode
Ibis also allows an interactive mode that automatically executes all expressions. This can be useful in a notebook or REPL. I personally prefer to explicitly execute expressions, but this is a personal preference. If you use interactive mode, I recommend setting the default limit low to prevent accidentally trying to return an unreasonable number of rows to your local process.

In [26]:
# Set the default row limit to None so that executed expressions are not
# capped at ibis's default number of rows.
ibis.options.sql.default_limit = None

# HDFS client that is handed to the Impala connection below.
hdfs_conn = ibis.hdfs_connect(host='bottou03.sjc.cloudera.com')

ibis_conn = ibis.impala.connect(
    host='bottou01.sjc.cloudera.com',
    port=21050,
    hdfs_client=hdfs_conn,
)

In [5]:
# Table expression for `wiki_pageviews` in the `u_juliet` database.
pageviews_tbl = ibis_conn.table(
    'wiki_pageviews',
    database='u_juliet',
)

What is in a project name? What does this data look like?

In [27]:
# Expression for the distinct values of the project_name column; executing it
# returns the list of names shown below.
project_names = pageviews_tbl['project_name'].distinct()
ibis_conn.execute(project_names)

0          iu.m.d
1       he.zero.q
2          br.m.q
3          co.m.d
4          zero.f
5          ru.m.b
6             ltg
7          lv.m.b
8              am
9          vi.m.d
10           ay.d
11           tk.d
12           cr.q
13           is.d
14      iu.zero.d
15           uk.s
16        dv.zero
17        en.zero
18      tl.zero.d
19           kl.d
20           ln.b
21           wa.b
22           da.q
23         sv.m.n
24           la.s
25           sk.s
26             tt
27           ko.q
28             es
29             kw
          ...    
2288           an
2289           pi
2290       fo.m.s
2291      ha.zero
2292       uz.m.b
2293    kn.zero.s
2294    es.zero.v
2295        sco.m
2296         ks.m
2297       pt.m.q
2298         cy.m
2299       az.m.b
2300       Amanhã
2301          21º
2302          pnb
2303        wg-en
2304         pt.b
2305           be
2306    vi.zero.q
2307       qu.m.q
2308         wo.q
2309         ar.d
2310       uk.m.b
2311    wa.zero.d
2312      

Maybe we can understand this by finding the projects with the most pages. Let's group by project name and then count the size of the groups.

In [28]:
# Count rows per project, then order the groups by that count
# (False = descending, so the largest projects come first).
project_page_counts = (
    pageviews_tbl
    .group_by(pageviews_tbl.project_name)
    .size()
    .sort_by(('count', False))
)
ibis_conn.execute(project_page_counts)

Unnamed: 0,project_name,count
0,en,57774129
1,en.m,38093008
2,fr,11234880
3,de,10624957
4,ja.m,9061891
5,ru,8835232
6,ja,8238480
7,es,7545728
8,de.m,6931540
9,es.m,6561233


To find something interesting, it'll help to understand the language. 

In [13]:
# `project_names` is an unexecuted ibis expression, so run it first and
# iterate over the returned project-name strings rather than the expression
# object itself. The substring test intentionally matches any name that
# contains 'en' (e.g. 'ten' as well as 'en.m').
[name for name in ibis_conn.execute(project_names) if 'en' in name]

['en.zero',
 'en.m.v',
 'en.m.q',
 'en.n',
 'en.zero.v',
 'ten.m',
 'en.zero.b',
 'en.zero.s',
 'en.m.b',
 'en.m.s',
 'en.zero.n',
 'en.d',
 'en.voy',
 'en.zero.q',
 'en.m.n',
 'en.zero.d',
 'en.s',
 'en.m.voy',
 'en.m.d',
 'ten',
 'en.zero.voy',
 'en.m',
 'en.q',
 'en.v',
 'en',
 'en.b',
 'wg-en']

The part of the project name after the '.' specifies a special type of wiki. Let's just look at the standard wiki pages (i.e., not media-wiki) that are also written in English.

In [15]:
# Peek at ten rows whose project_name is exactly 'en'.
ibis_conn.execute(
    pageviews_tbl[pageviews_tbl.project_name == 'en'].limit(10)
)

Unnamed: 0,project_name,page_name,monthly_total,hourly_total,day,hour,month,year
0,en,!,1,0,30,7,12,2015
1,en,!!!,2,0,30,7,12,2015
2,en,!!!Fuck_You!!!_and_Then_Some,2,0,30,7,12,2015
3,en,!!Destroy-Oh-Boy!!,1,0,30,7,12,2015
4,en,!Kung_people,2,0,30,7,12,2015
5,en,!Xoo,1,0,30,7,12,2015
6,en,!_(album),1,0,30,7,12,2015
7,en,"""",1,0,30,7,12,2015
8,en,"""...And_Ladies_of_the_Club""",3,0,30,7,12,2015
9,en,"""Babbacombe""_Lee",1,0,30,7,12,2015


`project_name` is homogeneous in this filtered dataset, so let's just take the projection of all the other columns.

In [30]:
# Keep only the 'en' rows and project every column except project_name,
# which is constant after the filter.
en_pageviews = (
    pageviews_tbl[pageviews_tbl.project_name == 'en']
    .projection([
        'page_name',
        'monthly_total',
        'hourly_total',
        'day',
        'hour',
        'month',
        'year',
    ])
)

In [32]:
# Preview ten rows of the projected English-only table.
ibis_conn.execute(
    en_pageviews.limit(10)
)

Unnamed: 0,page_name,monthly_total,hourly_total,day,hour,month,year
0,!!!,5,0,30,3,12,2015
1,!!!Fuck_You!!!,2,0,30,3,12,2015
2,!!!Fuck_You!!!_And_Then_Some,1,0,30,3,12,2015
3,!!!Fuck_You!!!_and_Then_Some,1,0,30,3,12,2015
4,!!!_(album),1,0,30,3,12,2015
5,!337$P34K,1,0,30,3,12,2015
6,!=,1,0,30,3,12,2015
7,!?!,1,0,30,3,12,2015
8,!Action_Pact!,1,0,30,3,12,2015
9,!Kung_language,4,0,30,3,12,2015


It seems that we should exclude the pages with no names and NaN counts. (With big data sets, you will find all
types of messed-up data.)

In [29]:
# Ten rows with the largest monthly_total (False = descending sort).
top_10_pgviews_hourly = en_pageviews.sort_by((en_pageviews.monthly_total, False)).limit(10)
# Execute the expression defined on the line above; the call previously used
# the misspelled name `top_10_pg_views_hourly`, which is never defined.
ibis_conn.execute(top_10_pgviews_hourly)

Unnamed: 0,page_name,monthly_total,hourly_total,day,hour,month,year
0,,,,31,1,12,2015
1,http://example.com/?,,,31,1,12,2015
2,,,,31,1,12,2015
3,,,,31,1,12,2015
4,,,,31,1,12,2015
5,Special:Export/,1936066.0,0.0,31,8,12,2015
6,Special:Export/,1121921.0,0.0,31,9,12,2015
7,Main_Page,645168.0,0.0,30,9,12,2015
8,Main_Page,641259.0,0.0,30,8,12,2015
9,Main_Page,611956.0,0.0,31,9,12,2015


In [41]:
# Rows whose hourly_total is missing. Built from `en_pageviews`, the table
# defined earlier in the notebook; `en_page_views` is never defined, so the
# original line raised NameError on a fresh kernel.
null_pg_views = en_pageviews[en_pageviews.hourly_total.isnull()]

In [42]:
# Materialize the rows that have no hourly_total so they can be inspected.
ibis_conn.execute(null_pg_views)

Unnamed: 0,page_name,monthly_total,hourly_total,day,hour,month,year
0,,,,31,1,12,2015
1,,,,31,1,12,2015
2,,,,31,1,12,2015
3,,,,31,1,12,2015
4,http://example.com/?,,,31,1,12,2015


In [43]:
# Rows that do have an hourly_total. Built from `en_pageviews`, the table
# defined earlier in the notebook; `en_page_views` is never defined, so the
# original line raised NameError on a fresh kernel.
nn_pg_views = en_pageviews[en_pageviews.hourly_total.notnull()]

What are the top ten pages in this series? As a first pass, look at the pages whose monthly total exceeds 100,000.

In [52]:
# Up to 20 rows whose monthly_total is above 100,000.
ibis_conn.execute(
    nn_pg_views[nn_pg_views.monthly_total > 100000].limit(20)
)

Unnamed: 0,page_name,monthly_total,hourly_total,day,hour,month,year
0,Main_Page,403827,0,30,5,12,2015
1,Main_Page,221548,0,31,4,12,2015
2,Main_Page,236697,0,30,4,12,2015
3,Main_Page,505361,0,31,6,12,2015
4,Main_Page,598570,0,30,7,12,2015
5,Main_Page,567263,0,30,6,12,2015
6,Main_Page,278845,0,31,23,12,2015
7,Main_Page,210538,0,30,3,12,2015
8,Main_Page,208819,0,30,2,12,2015
9,Main_Page,342662,0,31,5,12,2015


In [None]:
# NOTE(review): mutate() is called with no arguments and its result is not
# assigned; this cell has no execution count, so it looks unfinished —
# TODO complete or remove.
nn_pg_views.mutate()

In [46]:
# Ten rows ordered by hourly_total, largest first (False = descending).
ibis_conn.execute(
    nn_pg_views
    .sort_by((nn_pg_views.hourly_total, False))
    .limit(10)
)

Unnamed: 0,page_name,monthly_total,hourly_total,day,hour,month,year
0,!!!Fuck_You!!!,2,0,31,3,12,2015
1,!!,1,0,31,3,12,2015
2,!!!Fuck_You!!!_and_Then_Some,1,0,31,3,12,2015
3,!Kung_people,1,0,31,3,12,2015
4,!!!_(album),1,0,31,3,12,2015
5,!Oka_Tokat,2,0,31,3,12,2015
6,"""",1,0,31,3,12,2015
7,!BANG!,1,0,31,3,12,2015
8,!!!,3,0,31,3,12,2015
9,!,1,0,31,3,12,2015


Topics to explore: hangover, brands of champagne, New Year's traditions, time differences, international datetime.

In [54]:
# Fetch every row whose lower-cased page name equals 'champagne'.
champagne_df = ibis_conn.execute(
    nn_pg_views[nn_pg_views.page_name.lower() == 'champagne']
)

In [55]:
# DataFrame.sort() is deprecated (and removed in later pandas releases);
# sort_values() is its replacement for ordering rows by column values.
# Order the rows chronologically by day, then hour.
champagne_df.sort_values(['day', 'hour'])

  if __name__ == '__main__':


Unnamed: 0,page_name,monthly_total,hourly_total,day,hour,month,year
22,Champagne,52,0,30,0,12,2015
30,Champagne,52,0,30,1,12,2015
23,Champagne,48,0,30,2,12,2015
3,Champagne,60,0,30,3,12,2015
1,Champagne,55,0,30,4,12,2015
0,Champagne,47,0,30,5,12,2015
11,Champagne,50,0,30,6,12,2015
4,Champagne,43,0,30,7,12,2015
15,Champagne,44,0,30,8,12,2015
41,Champagne,53,0,30,9,12,2015
