Programación Basica
===

* 60 min | Última modificación: Noviembre 07, 2019

Cell magic `%%pig`
---

In [1]:
from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
from pexpect import spawn

TIMEOUT = 60
PROG = "pig"
PROMPT = ["\r\n>> ", "\r\ngrunt> "]
DISCARD = ["INFO  org.apache", "WARN  org.apache"]
QUIT = "quit"


@magics_class
class Magic(Magics):
    def __init__(self, shell):
        super().__init__(shell)
        self.app = spawn(PROG, timeout=60)
        self.app.expect(PROMPT)

    @cell_magic
    def pig(self, line, cell):
        cell_lines = [cell_line.strip() for cell_line in cell.split("\n")]
        cell_lines = [cell_line for cell_line in cell_lines if cell_line != ""]
        for cell_line in cell_lines:
            self.app.sendline(cell_line)
            self.app.expect(PROMPT, timeout=TIMEOUT)
            output = self.app.before.decode()
            output = output.replace("\r\n", "\n")
            output = output.split("\n")
            output = [output_line.strip() for output_line in output]
            for output_line in output:
                if output_line not in cell_lines:
                    if not any(word in output_line for word in DISCARD):
                        print(output_line)
        return None

    @line_magic
    def quit(self, line):
        self.app.sendline(QUIT)


def load_ipython_extension(ip):
    ip.register_magics(Magic(ip))


load_ipython_extension(ip=get_ipython())

### LOAD

El nombre de archivo soporta wilcards; por ejemplo:

    x = LOAD 'files/*.txt';
    
leerá todos los archivos terminados en `.txt`. Por defecto, se espera que los campos en los archivos estén separados por comas.

In [2]:
%%writefile /tmp/persons.csv
1,Vivian,Hamilton,1971-07-08,green,1
2,Karen,Holcomb,1974-05-23,green,4
3,Cody,Garrett,1973-04-22,orange,1
4,Roth,Fry,1975-01-29,black,1
5,Zoe,Conway,1974-07-03,blue,2
6,Gretchen,Kinney,1974-10-18,viole,1
7,Driscoll,Klein,1970-10-05,blue,5
8,Karyn,Diaz,1969-02-24,red,1
9,Merritt,Guy,1974-10-17,indigo,4
10,Kylan,Sexton,1975-02-28,black,4
11,Jordan,Estes,1969-12-07,indigo,4
12,Hope,Coffey,1973-12-24,green,5
13,Vivian,Crane,1970-08-27,gray,5
14,Clio,Noel,1972-12-12,red,5
15,Hope,Silva,1970-07-01,blue,5
16,Ayanna,Jarvis,1974-02-11,orange,5
17,Chanda,Boyer,1973-04-01,green,4
18,Chadwick,Knight,1973-04-29,yellow,1

Writing /tmp/persons.csv


In [3]:
%%pig
fs -put /tmp/persons.csv 

In [4]:
%%pig
-- 
-- Carga el archivo desde el disco duro
--
u = LOAD 'persons.csv' USING PigStorage(',') 
    AS (id:INT, 
        firstname:CHARARRAY, 
        surname:CHARARRAY, 
        birtday:CHARARRAY, 
        color:CHARARRAY, 
        quantity:INT);
DUMP u;

(1,Vivian,Hamilton,1971-07-08,green,1)
(2,Karen,Holcomb,1974-05-23,green,4)
(3,Cody,Garrett,1973-04-22,orange,1)
(4,Roth,Fry,1975-01-29,black,1)
(5,Zoe,Conway,1974-07-03,blue,2)
(6,Gretchen,Kinney,1974-10-18,viole,1)
(7,Driscoll,Klein,1970-10-05,blue,5)
(8,Karyn,Diaz,1969-02-24,red,1)
(9,Merritt,Guy,1974-10-17,indigo,4)
(10,Kylan,Sexton,1975-02-28,black,4)
(11,Jordan,Estes,1969-12-07,indigo,4)
(12,Hope,Coffey,1973-12-24,green,5)
(13,Vivian,Crane,1970-08-27,gray,5)
(14,Clio,Noel,1972-12-12,red,5)
(15,Hope,Silva,1970-07-01,blue,5)
(16,Ayanna,Jarvis,1974-02-11,orange,5)
(17,Chanda,Boyer,1973-04-01,green,4)
(18,Chadwick,Knight,1973-04-29,yellow,1)


### STORE

In [5]:
%%pig
STORE u INTO 'output';

In [6]:
!hadoop fs -ls output/*

-rw-r--r--   1 root supergroup          0 2022-05-17 02:08 output/_SUCCESS
-rw-r--r--   1 root supergroup        616 2022-05-17 02:08 output/part-m-00000


In [7]:
!hadoop fs -cat output/part-m-00000 | head

1	Vivian	Hamilton	1971-07-08	green	1
2	Karen	Holcomb	1974-05-23	green	4
3	Cody	Garrett	1973-04-22	orange	1
4	Roth	Fry	1975-01-29	black	1
5	Zoe	Conway	1974-07-03	blue	2
6	Gretchen	Kinney	1974-10-18	viole	1
7	Driscoll	Klein	1970-10-05	blue	5
8	Karyn	Diaz	1969-02-24	red	1
9	Merritt	Guy	1974-10-17	indigo	4
10	Kylan	Sexton	1975-02-28	black	4


In [8]:
!hadoop fs -rm output/*  
!hadoop fs -rmdir  output

Deleted output/_SUCCESS
Deleted output/part-m-00000


In [9]:
%%pig
STORE u INTO 'output' USING PigStorage(';');

In [10]:
!hadoop fs -ls output/*

-rw-r--r--   1 root supergroup          0 2022-05-17 02:09 output/_SUCCESS
-rw-r--r--   1 root supergroup        616 2022-05-17 02:09 output/part-m-00000


In [11]:
!hadoop fs -cat output/part-m-00000 | head

1;Vivian;Hamilton;1971-07-08;green;1
2;Karen;Holcomb;1974-05-23;green;4
3;Cody;Garrett;1973-04-22;orange;1
4;Roth;Fry;1975-01-29;black;1
5;Zoe;Conway;1974-07-03;blue;2
6;Gretchen;Kinney;1974-10-18;viole;1
7;Driscoll;Klein;1970-10-05;blue;5
8;Karyn;Diaz;1969-02-24;red;1
9;Merritt;Guy;1974-10-17;indigo;4
10;Kylan;Sexton;1975-02-28;black;4


In [12]:
!hadoop fs -rm output/*  
!hadoop fs -rmdir  output

Deleted output/_SUCCESS
Deleted output/part-m-00000


### FOREACH

In [13]:
%%pig
--
-- Los campos del archivo puden ser indicados por nombre 
-- o por posición iniciando en 0
--
v = FOREACH u GENERATE firstname, $2, $3..$5;
DUMP v;

(Vivian,Hamilton,1971-07-08,green,1)
(Karen,Holcomb,1974-05-23,green,4)
(Cody,Garrett,1973-04-22,orange,1)
(Roth,Fry,1975-01-29,black,1)
(Zoe,Conway,1974-07-03,blue,2)
(Gretchen,Kinney,1974-10-18,viole,1)
(Driscoll,Klein,1970-10-05,blue,5)
(Karyn,Diaz,1969-02-24,red,1)
(Merritt,Guy,1974-10-17,indigo,4)
(Kylan,Sexton,1975-02-28,black,4)
(Jordan,Estes,1969-12-07,indigo,4)
(Hope,Coffey,1973-12-24,green,5)
(Vivian,Crane,1970-08-27,gray,5)
(Clio,Noel,1972-12-12,red,5)
(Hope,Silva,1970-07-01,blue,5)
(Ayanna,Jarvis,1974-02-11,orange,5)
(Chanda,Boyer,1973-04-01,green,4)
(Chadwick,Knight,1973-04-29,yellow,1)


In [14]:
%%pig
--
-- uso de condicionales 
--
v = FOREACH u GENERATE ($0 > $5 ? 10 : 20) ;
DUMP v;

(20)
(20)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)
(10)


In [15]:
%%pig
y = FOREACH u GENERATE $1, $3;
DUMP y;

(Vivian,1971-07-08)
(Karen,1974-05-23)
(Cody,1973-04-22)
(Roth,1975-01-29)
(Zoe,1974-07-03)
(Gretchen,1974-10-18)
(Driscoll,1970-10-05)
(Karyn,1969-02-24)
(Merritt,1974-10-17)
(Kylan,1975-02-28)
(Jordan,1969-12-07)
(Hope,1973-12-24)
(Vivian,1970-08-27)
(Clio,1972-12-12)
(Hope,1970-07-01)
(Ayanna,1974-02-11)
(Chanda,1973-04-01)
(Chadwick,1973-04-29)


In [16]:
%%pig
y = FOREACH u GENERATE UPPER($1);
DUMP y;

(VIVIAN)
(KAREN)
(CODY)
(ROTH)
(ZOE)
(GRETCHEN)
(DRISCOLL)
(KARYN)
(MERRITT)
(KYLAN)
(JORDAN)
(HOPE)
(VIVIAN)
(CLIO)
(HOPE)
(AYANNA)
(CHANDA)
(CHADWICK)


In [17]:
%%pig
y = FOREACH u GENERATE [$1, $3];
DUMP y;

([Vivian#1971-07-08])
([Karen#1974-05-23])
([Cody#1973-04-22])
([Roth#1975-01-29])
([Zoe#1974-07-03])
([Gretchen#1974-10-18])
([Driscoll#1970-10-05])
([Karyn#1969-02-24])
([Merritt#1974-10-17])
([Kylan#1975-02-28])
([Jordan#1969-12-07])
([Hope#1973-12-24])
([Vivian#1970-08-27])
([Clio#1972-12-12])
([Hope#1970-07-01])
([Ayanna#1974-02-11])
([Chanda#1973-04-01])
([Chadwick#1973-04-29])


### FILTER

In [18]:
%%pig
y = FILTER u BY $4 MATCHES 'blue';
DUMP y;

(5,Zoe,Conway,1974-07-03,blue,2)
(7,Driscoll,Klein,1970-10-05,blue,5)
(15,Hope,Silva,1970-07-01,blue,5)


### GROUP

In [19]:
%%pig
y = GROUP u BY $4;
DUMP y;

(red,{(8,Karyn,Diaz,1969-02-24,red,1),(14,Clio,Noel,1972-12-12,red,5)})
(blue,{(15,Hope,Silva,1970-07-01,blue,5),(7,Driscoll,Klein,1970-10-05,blue,5),(5,Zoe,Conway,1974-07-03,blue,2)})
(gray,{(13,Vivian,Crane,1970-08-27,gray,5)})
(black,{(10,Kylan,Sexton,1975-02-28,black,4),(4,Roth,Fry,1975-01-29,black,1)})
(green,{(12,Hope,Coffey,1973-12-24,green,5),(2,Karen,Holcomb,1974-05-23,green,4),(17,Chanda,Boyer,1973-04-01,green,4),(1,Vivian,Hamilton,1971-07-08,green,1)})
(viole,{(6,Gretchen,Kinney,1974-10-18,viole,1)})
(indigo,{(9,Merritt,Guy,1974-10-17,indigo,4),(11,Jordan,Estes,1969-12-07,indigo,4)})
(orange,{(3,Cody,Garrett,1973-04-22,orange,1),(16,Ayanna,Jarvis,1974-02-11,orange,5)})
(yellow,{(18,Chadwick,Knight,1973-04-29,yellow,1)})


In [20]:
%%pig
z = FOREACH y GENERATE u, COUNT(u);
DUMP z;

({(8,Karyn,Diaz,1969-02-24,red,1),(14,Clio,Noel,1972-12-12,red,5)},2)
({(15,Hope,Silva,1970-07-01,blue,5),(7,Driscoll,Klein,1970-10-05,blue,5),(5,Zoe,Conway,1974-07-03,blue,2)},3)
({(13,Vivian,Crane,1970-08-27,gray,5)},1)
({(10,Kylan,Sexton,1975-02-28,black,4),(4,Roth,Fry,1975-01-29,black,1)},2)
({(12,Hope,Coffey,1973-12-24,green,5),(2,Karen,Holcomb,1974-05-23,green,4),(17,Chanda,Boyer,1973-04-01,green,4),(1,Vivian,Hamilton,1971-07-08,green,1)},4)
({(6,Gretchen,Kinney,1974-10-18,viole,1)},1)
({(9,Merritt,Guy,1974-10-17,indigo,4),(11,Jordan,Estes,1969-12-07,indigo,4)},2)
({(3,Cody,Garrett,1973-04-22,orange,1),(16,Ayanna,Jarvis,1974-02-11,orange,5)},2)
({(18,Chadwick,Knight,1973-04-29,yellow,1)},1)


### DESCRIBE


In [21]:
%%pig
DESCRIBE u;

u: {
id: int,
firstname: chararray,
surname: chararray,
birtday: chararray,
color: chararray,
quantity: int
}


### ORDER .. BY ..

In [22]:
%%pig
y = ORDER u BY $4;
DUMP y;

(4,Roth,Fry,1975-01-29,black,1)
(10,Kylan,Sexton,1975-02-28,black,4)
(5,Zoe,Conway,1974-07-03,blue,2)
(7,Driscoll,Klein,1970-10-05,blue,5)
(15,Hope,Silva,1970-07-01,blue,5)
(13,Vivian,Crane,1970-08-27,gray,5)
(2,Karen,Holcomb,1974-05-23,green,4)
(12,Hope,Coffey,1973-12-24,green,5)
(17,Chanda,Boyer,1973-04-01,green,4)
(1,Vivian,Hamilton,1971-07-08,green,1)
(9,Merritt,Guy,1974-10-17,indigo,4)
(11,Jordan,Estes,1969-12-07,indigo,4)
(3,Cody,Garrett,1973-04-22,orange,1)
(16,Ayanna,Jarvis,1974-02-11,orange,5)
(8,Karyn,Diaz,1969-02-24,red,1)
(14,Clio,Noel,1972-12-12,red,5)
(6,Gretchen,Kinney,1974-10-18,viole,1)
(18,Chadwick,Knight,1973-04-29,yellow,1)


### DISCTINCT

In [23]:
%%pig
-- 
-- opera únicamente sobre registros completos
--
y = FOREACH u GENERATE $4;
z = DISCTINCT y;
DUMP z;

2022-05-17 02:12:01,202 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1200: <line 20, column 0>  Syntax error, unexpected symbol at or near 'z'
Details at logfile: /workspace/pig/pig_1652753296015.log
({(8,Karyn,Diaz,1969-02-24,red,1),(14,Clio,Noel,1972-12-12,red,5)},2)
({(15,Hope,Silva,1970-07-01,blue,5),(7,Driscoll,Klein,1970-10-05,blue,5),(5,Zoe,Conway,1974-07-03,blue,2)},3)
({(13,Vivian,Crane,1970-08-27,gray,5)},1)
({(10,Kylan,Sexton,1975-02-28,black,4),(4,Roth,Fry,1975-01-29,black,1)},2)
({(12,Hope,Coffey,1973-12-24,green,5),(2,Karen,Holcomb,1974-05-23,green,4),(17,Chanda,Boyer,1973-04-01,green,4),(1,Vivian,Hamilton,1971-07-08,green,1)},4)
({(6,Gretchen,Kinney,1974-10-18,viole,1)},1)
({(9,Merritt,Guy,1974-10-17,indigo,4),(11,Jordan,Estes,1969-12-07,indigo,4)},2)
({(3,Cody,Garrett,1973-04-22,orange,1),(16,Ayanna,Jarvis,1974-02-11,orange,5)},2)
({(18,Chadwick,Knight,1973-04-29,yellow,1)},1)


### JOIN

In [24]:
%%writefile /tmp/jointable.csv
1,A
2,B
3,C
24,X
25,Y
26,Z

Writing /tmp/jointable.csv


In [25]:
!hadoop fs -put /tmp/jointable.csv

In [26]:
%%pig
w = LOAD 'jointable.csv' USING PigStorage(',') 
    AS (id:INT, 
        letter:CHARARRAY);
DUMP w;

(1,A)
(2,B)
(3,C)
(24,X)
(25,Y)
(26,Z)


In [27]:
%%pig
w = JOIN u BY id, w BY id;
DUMP w;

(1,Vivian,Hamilton,1971-07-08,green,1,1,A)
(2,Karen,Holcomb,1974-05-23,green,4,2,B)
(3,Cody,Garrett,1973-04-22,orange,1,3,C)


### LIMIT

In [28]:
%%pig
z = LIMIT u 10;
DUMP z;

(1,Vivian,Hamilton,1971-07-08,green,1)
(2,Karen,Holcomb,1974-05-23,green,4)
(3,Cody,Garrett,1973-04-22,orange,1)
(4,Roth,Fry,1975-01-29,black,1)
(5,Zoe,Conway,1974-07-03,blue,2)
(6,Gretchen,Kinney,1974-10-18,viole,1)
(7,Driscoll,Klein,1970-10-05,blue,5)
(8,Karyn,Diaz,1969-02-24,red,1)
(9,Merritt,Guy,1974-10-17,indigo,4)
(10,Kylan,Sexton,1975-02-28,black,4)


### SAMPLE

In [29]:
%%pig
w = SAMPLE u 0.2;
DUMP w;

(2,Karen,Holcomb,1974-05-23,green,4)
(6,Gretchen,Kinney,1974-10-18,viole,1)


### CASE

In [30]:
%%pig
r = FOREACH u GENERATE (
  CASE $0
    WHEN 1 THEN 10
    WHEN 2 THEN 20
    ELSE 3
  END
);
DUMP r;

(10)
(20)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)
(3)


In [31]:
%quit