# Hadoop MapReduce

## Código `mapper.py`

In [6]:
%%writefile example/mapper.py
#!/usr/bin/env python3
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()
    for word in words:
        print(f"{word}\t1")

Overwriting example/mapper.py


## Código `reducer.py`

In [7]:
%%writefile example/reducer.py
#!/usr/bin/env python3
import sys

current_word = None
current_count = 0

for line in sys.stdin:
    word, count = line.strip().split("\t", 1)
    count = int(count)

    if current_word == word:
        current_count += count
    else:
        if current_word:
            print(f"{current_word}\t{current_count}")
        current_word = word
        current_count = count
    
if current_word:
    print(f"{current_word}\t{current_count}")

Overwriting example/reducer.py


## Obtención archivo quijote.txt

In [8]:
!wget https://aitor-medrano.github.io/iabd/spark/resources/el_quijote.txt
!mv el_quijote.txt ./example

--2025-11-18 09:39:43--  https://aitor-medrano.github.io/iabd/spark/resources/el_quijote.txt
Resolving aitor-medrano.github.io (aitor-medrano.github.io)... 185.199.111.153, 185.199.108.153, 185.199.109.153, ...
Connecting to aitor-medrano.github.io (aitor-medrano.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1060259 (1.0M) [text/plain]
Saving to: ‘el_quijote.txt’


2025-11-18 09:39:45 (806 KB/s) - ‘el_quijote.txt’ saved [1060259/1060259]



## Subida del archivo a `hdfs`

In [4]:
!hdfs dfs -mkdir /contar

In [None]:
!hdfs dfs -put example/el_quijote.txt /contar

In [6]:
!hdfs dfs -ls /contar/

Found 1 items
-rw-r--r--   3 root supergroup    1060259 2025-11-18 08:43 /contar/el_quijote.txt


## Testeo código

In [None]:
!cat example/el_quijote.txt | python3 example/mapper.py | sort | python3 example/reducer.py

"Apenas	1
"Caballero	4
"Conde	1
"Donde	1
"Más	1
"Miau",	1
"No	1
"Rastrea	1
"Ricamonte",	1
"Tablante",	1
"dichosa	1
"el	7
"y	1
"¡Oh,	1
''Éste	2
''¡Ea,	1
(Y	1
(a	1
(al	1
(como	3
(con	1
(cosas	1
(creyendo	1
(de	4
(en	1
(lo	2
(los	1
(para	1
(por	1
(porque	10
(pues	1
(que	26
(que,	2
(quizá	1
(si	3
(sin	1
(tal	1
(y	7
,	1
-	4
-A	8
-Acaba	1
-Acudid,	1
-Agora	1
-Agora,	1
-Ahí	1
-Ahora	11
-Amejí,	1
-Amigo	1
-Ansí	1
-Antes	1
-Añadió	2
-Aquí	3
-Aquí,	1
-Así	33
-Aun	1
-Aunque	1
-Aún	2
-Basta	1
-Bien	12
-Buena	1
-Caballeros,	1
-Calla,	2
-Cardenio,	1
-Castillo	1
-Cesen,	1
-Cierto	1
-Cismáticos	1
-Como	2
-Con	8
-Contra	1
-Corrida	1
-Cualquiera	2
-Cuando	1
-Cuatro	1
-Dadme	1
-De	3
-Debe	2
-Debes	1
-Decidme,	1
-Decilda,	1
-Dejadme,	1
-Del	1
-Desa	1
-Deso	1
-Después	2
-Deteneos,	1
-Deténgome	1
-Déjeme	2
-Déjeseme	1
-Di	1
-Dice	1
-Dichosa	1
-Digalos,	1
-Digo	7
-Digo,	5
-Dilas	1
-Dígame,	3
-Dígolo	2
-Dígote,	1
-Don	2
-Dulce	1
-Ea,	1
-Echemos,	2
-El	3
-Ella	1
-Ellos,	1
-En	13
-Es	2
-Esa	

In [None]:
!hadoop jar \
/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
-file example/mapper.py \
-mapper example/mapper.py \
-file example/reducer.py \
-reducer example/reducer.py \
-input /contar/el_quijote.txt \
-output /contar/output

2025-11-18 09:13:22,142 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [./mapper.py, ./reducer.py, /tmp/hadoop-unjar7690245356784757461/] [] /tmp/streamjob7047902108027858550.jar tmpDir=null
2025-11-18 09:13:22,979 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-11-18 09:13:23,093 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-11-18 09:13:23,320 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1763455359622_0005
2025-11-18 09:13:23,752 INFO mapred.FileInputFormat: Total input files to process : 1
2025-11-18 09:13:23,847 INFO mapreduce.JobSubmitter: number of splits:2
2025-11-18 09:13:23,948 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1763455359622_0005
2025-11-18 09:13:23,948 INFO mapreduce.JobSubmitter: Executin

In [33]:
!hdfs dfs -cat /contar/output/part-00000

"Apenas	1
"Caballero	4
"Conde	1
"Donde	1
"Más	1
"Miau",	1
"No	1
"Rastrea	1
"Ricamonte",	1
"Tablante",	1
"dichosa	1
"el	7
"y	1
"¡Oh,	1
''Éste	2
''¡Ea,	1
(Y	1
(a	1
(al	1
(como	3
(con	1
(cosas	1
(creyendo	1
(de	4
(en	1
(lo	2
(los	1
(para	1
(por	1
(porque	10
(pues	1
(que	26
(que,	2
(quizá	1
(si	3
(sin	1
(tal	1
(y	7
,	1
-	4
-A	8
-Acaba	1
-Acudid,	1
-Agora	1
-Agora,	1
-Ahí	1
-Ahora	11
-Amejí,	1
-Amigo	1
-Ansí	1
-Antes	1
-Añadió	2
-Aquí	3
-Aquí,	1
-Así	33
-Aun	1
-Aunque	1
-Aún	2
-Basta	1
-Bien	12
-Buena	1
-Caballeros,	1
-Calla,	2
-Cardenio,	1
-Castillo	1
-Cesen,	1
-Cierto	1
-Cismáticos	1
-Como	2
-Con	8
-Contra	1
-Corrida	1
-Cualquiera	2
-Cuando	1
-Cuatro	1
-Dadme	1
-De	3
-Debe	2
-Debes	1
-Decidme,	1
-Decilda,	1
-Dejadme,	1
-Del	1
-Desa	1
-Deso	1
-Después	2
-Deteneos,	1
-Deténgome	1
-Déjeme	2
-Déjeseme	1
-Di	1
-Dice	1
-Dichosa	1
-Digalos,	1
-Digo	7
-Digo,	5
-Dilas	1
-Dígame,	3
-Dígolo	2
-Dígote,	1
-Don	2
-Dulce	1
-Ea,	1
-Echemos,	2
-El	3
-Ella	1
-Ellos,	1
-En	13
-Es	2
-Esa	

## Reverse index

In [9]:
%%writefile example/mapper_indice.py
#!/usr/bin/env python3
import sys
import os

filename = os.environ.get("map_input_file", "desconocido")

for line in sys.stdin:
    line = line.strip().split()
    for word in line:
        print(f"{word}\t{filename}")

Overwriting example/mapper_indice.py


In [10]:
%%writefile example/reducer_indice.py
#!/usr/bin/env python3

import sys

current_word = None
docs = set()

for line in sys.stdin:
    word, doc = line.strip().split("\t", 1)

    if current_word == word:
        docs.add(doc)
    else:
        if current_word:
            print(f"{current_word}\t{','.join(sorted(docs))}")
        current_word = word
        docs = {doc}

if current_word:
    print(f"{current_word}\t{','.join(sorted(docs))}")

Overwriting example/reducer_indice.py


In [None]:
%%writefile example/doc1.txt
el cielo es azul

Writing doc1.txt


In [None]:
%%writefile example/doc2.txt
el sol es amarillo

Writing doc2.txt


In [None]:
%%writefile example/doc3.txt
el cielo es grande

Writing doc3.txt


In [22]:
!hdfs dfs -ls /

Found 4 items
drwxr-xr-x   - root supergroup          0 2025-11-18 08:52 /contar
drwxr-xr-x   - root supergroup          0 2025-11-18 09:05 /indice_invertido
drwxrwx---   - root supergroup          0 2025-11-18 08:42 /tmp
drwxrwxrwt   - root root                0 2025-11-18 08:52 /yarn


In [None]:
!hdfs dfs -mkdir /indice_invertido

!hdfs dfs -put /media/notebooks/example/doc*.txt /indice_invertido

In [None]:
!hdfs dfs -rm /salida_indice/*
!hdfs dfs -rmdir /salida_indice
!hadoop jar \
/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
-file example/mapper_indice.py \
-mapper example/mapper_indice.py \
-file example/reducer_indice.py \
-reducer example/reducer_indice.py \
-input /indice_invertido/ \
-output /salida_indice

Deleted /salida_indice/_SUCCESS
Deleted /salida_indice/part-00000
2025-11-18 09:27:57,991 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [mapper_indice.py, reducer_indice.py, /tmp/hadoop-unjar5807269031198460539/] [] /tmp/streamjob8820789166848595483.jar tmpDir=null
2025-11-18 09:27:58,965 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-11-18 09:27:59,128 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-11-18 09:27:59,482 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1763455359622_0009
2025-11-18 09:28:00,119 INFO mapred.FileInputFormat: Total input files to process : 3
2025-11-18 09:28:00,238 INFO mapreduce.JobSubmitter: number of splits:3
2025-11-18 09:28:00,448 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1763

In [45]:
!hdfs dfs -cat /salida_indice/part-00000

amarillo	hdfs://namenode:9000/indice_invertido/doc2.txt
azul	hdfs://namenode:9000/indice_invertido/doc1.txt
cielo	hdfs://namenode:9000/indice_invertido/doc1.txt,hdfs://namenode:9000/indice_invertido/doc3.txt
el	hdfs://namenode:9000/indice_invertido/doc1.txt,hdfs://namenode:9000/indice_invertido/doc2.txt,hdfs://namenode:9000/indice_invertido/doc3.txt
es	hdfs://namenode:9000/indice_invertido/doc1.txt,hdfs://namenode:9000/indice_invertido/doc2.txt,hdfs://namenode:9000/indice_invertido/doc3.txt
grande	hdfs://namenode:9000/indice_invertido/doc3.txt
sol	hdfs://namenode:9000/indice_invertido/doc2.txt
