# PR0404: Aplicación de patrones MapReduce

In [10]:
!hdfs dfs -mkdir /gdp
!hdfs dfs -put ./pr0404/countries_gdp_hist.csv /gdp

mkdir: `/gdp': File exists


In [13]:
!hdfs dfs -ls /gdp

Found 1 items
-rw-r--r--   3 root supergroup    1658937 2025-12-18 09:18 /gdp/countries_gdp_hist.csv


## Ejercicio 1: Limpieza y transformación

In [46]:
%%writefile pr0404/ejercicio1/mapper-gdp.py
#!/usr/bin/env python3
import sys

first = True
buffer = []
MAX_BUFFER = 50000
for line in sys.stdin.buffer:
    if first:
        first = False
        continue
    values = line.strip().split(b';')
    try:
        year = int(values[6].decode('utf-8'))
        if year < 2000:
            continue
        gdp = float(values[7].decode('utf-8'))
        if gdp <= 0:
            continue
        country = values[4].decode('utf-8')
        buffer.append(f"{country}\t{year}\t{gdp}")

        if len(buffer) > MAX_BUFFER:
            print("\n".join(buffer))
            buffer = []
    except ValueError:
        continue

if len(buffer) > 0:
    print("\n".join(buffer))


Overwriting pr0404/ejercicio1/mapper-gdp.py


In [16]:
%%writefile pr0404/ejercicio1/reducer-gdp.py
#!/usr/bin/env python3
import sys

for line in sys.stdin:
    print(line)

Writing pr0404/ejercicio1/reducer-gdp.py


In [47]:
!cat pr0404/countries_gdp_hist.csv | python3 pr0404/ejercicio1/mapper-gdp.py | sort | python3 pr0404/ejercicio1/reducer-gdp.py

AFGHANISTAN	2000	3521418059.92345

AFGHANISTAN	2001	2813571753.87253

AFGHANISTAN	2002	3825701438.99963

AFGHANISTAN	2003	4520946818.54581

AFGHANISTAN	2004	5224896718.67782

AFGHANISTAN	2005	6203256538.70967

AFGHANISTAN	2006	6971758282.29351

AFGHANISTAN	2007	9747886187.39393

AFGHANISTAN	2008	10109297047.5432

AFGHANISTAN	2009	12416152732.0567

AFGHANISTAN	2010	15856668555.8336

AFGHANISTAN	2011	17805098206.3141

AFGHANISTAN	2012	19907329777.5872

AFGHANISTAN	2013	20146416757.5987

AFGHANISTAN	2014	20497128555.6972

AFGHANISTAN	2015	19134221644.7325

AFGHANISTAN	2016	18116572395.0772

AFGHANISTAN	2017	18753456497.8159

AFGHANISTAN	2018	18053222687.4126

AFGHANISTAN	2019	18799444490.1128

AFGHANISTAN	2020	19955929052.1496

AFGHANISTAN	2021	14259995441.0759

AFGHANISTAN	2022	14497243872.1337

AFGHANISTAN	2023	17233051620.1117

ALBANIA	2000	3480355258.04122

ALBANIA	2001	3922100793.5403

ALBANIA	2002	4348068242.19512

ALBANIA	2003	5611496257.14231

ALBANIA	2004	7184685781.51876

ALBANI

In [48]:
!hdfs dfs -rm /processed/country_gdp/*
!hdfs dfs -rmdir /processed/country_gdp/
!hadoop jar \
/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
-file pr0404/ejercicio1/mapper-gdp.py \
-mapper pr0404/ejercicio1/mapper-gdp.py \
-file pr0404/ejercicio1/reducer-gdp.py \
-reducer pr0404/ejercicio1/reducer-gdp.py \
-input /gdp/countries_gdp_hist.csv \
-output /processed/country_gdp
!hdfs dfs -cat /processed/country_gdp/part-00000

rm: `/processed/country_gdp/*': No such file or directory
2025-12-18 09:28:30,250 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [pr0404/ejercicio1/mapper-gdp.py, pr0404/ejercicio1/reducer-gdp.py, /tmp/hadoop-unjar7435796241443121094/] [] /tmp/streamjob6649821022205143726.jar tmpDir=null
2025-12-18 09:28:31,543 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-12-18 09:28:31,731 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-12-18 09:28:32,070 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1766048350482_0003
2025-12-18 09:28:32,768 INFO mapred.FileInputFormat: Total input files to process : 1
2025-12-18 09:28:32,951 INFO mapreduce.JobSubmitter: number of splits:2
2025-12-18 09:28:33,227 INFO mapreduce.JobSubmitter: Submitting to

## Ejercicio 2: Agregación por clave

In [52]:
%%writefile pr0404/ejercicio2/mapper.py
#!/usr/bin/env python3
import sys
buffer = []
MAX_BUFFER = 50000 
first = True
for line in sys.stdin.buffer:
    if first:
        first = False
        continue
    values = line.strip().split(b';')
    region = values[1].decode('utf-8')
    gdp = values[-3].decode('utf-8')
    buffer.append(f"{region}\t{gdp}")
    if len(buffer) > MAX_BUFFER:
        print("\n".join(buffer))
if buffer:
    print("\n".join(buffer))

Overwriting pr0404/ejercicio2/mapper.py


In [50]:
%%writefile pr0404/ejercicio2/reducer.py
#!/usr/bin/env python3
import sys
cur_region = None
curr_gdp = 0
buffer = []
for line in sys.stdin:
    region, gdp = line.split("\t")
    gdp = float(gdp)
    if cur_region == region:
        curr_gdp += gdp
    else:
        if cur_region:
            buffer.append(f"{cur_region}\t{curr_gdp}")
        cur_region = region
        curr_gdp = gdp
print("\n".join(buffer))

    

Writing pr0404/ejercicio2/reducer.py


In [53]:
!cat pr0404/countries_gdp_hist.csv | python3 pr0404/ejercicio2/mapper.py | sort | python3 pr0404/ejercicio2/reducer.py

AFRICA	60187281517661.51
AMERICAS	736355543214066.8
ASIA	658922540034302.8
EUROPE	627599261193299.4


In [54]:
!hdfs dfs -rm /processed/region_gdp/*
!hdfs dfs -rmdir /processed/region_gdp/
!hadoop jar \
/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
-file pr0404/ejercicio2/mapper.py \
-mapper pr0404/ejercicio2/mapper.py \
-file pr0404/ejercicio2/reducer.py \
-reducer pr0404/ejercicio2/reducer.py \
-input /gdp/countries_gdp_hist.csv \
-output /processed/region_gdp
!hdfs dfs -cat /processed/region_gdp/part-00000

rm: `/processed/region_gdp/*': No such file or directory
rmdir: `/processed/region_gdp/': No such file or directory
2025-12-18 09:40:23,588 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [pr0404/ejercicio2/mapper.py, pr0404/ejercicio2/reducer.py, /tmp/hadoop-unjar5004364127867558141/] [] /tmp/streamjob6736221488560556774.jar tmpDir=null
2025-12-18 09:40:25,312 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-12-18 09:40:25,580 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-12-18 09:40:26,080 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1766048350482_0004
2025-12-18 09:40:26,807 INFO mapred.FileInputFormat: Total input files to process : 1
2025-12-18 09:40:27,042 INFO mapreduce.JobSubmitter: number of splits:2
2025-12-18 09:40

## Ejercicio 3: Máximos por grupo (Filtering/Top-K)

In [60]:
%%writefile pr0404/ejercicio3/mapper.py
#!/usr/bin/env python3
import sys

first = True
buffer = []
MAX_BUFFER = 50000
for line in sys.stdin.buffer:
    if first:
        first = False
        continue
    values = line.strip().split(b';')
    year = values[6].decode('utf-8')
    country = values[4].decode('utf-8')
    variation = values[-1].decode('utf-8')
    buffer.append(f"{country}\t{year}\t{variation}")
    if len(buffer) > MAX_BUFFER:
        print("\n".join(buffer))
if buffer:
    print("\n".join(buffer))

Overwriting pr0404/ejercicio3/mapper.py


In [58]:
%%writefile pr0404/ejercicio3/reducer.py
#!/usr/bin/env python3
import sys

current_country = None
max_variation = 0
max_year = None
buffer = []
for line in sys.stdin:
    country, year, variation = line.strip().split('\t')
    variation = float(variation)
    if country == current_country:
        if variation > max_variation:
            max_variation = variation
            max_year = year
    else:
        if current_country:
            buffer.append(f"{current_country}\t{max_year}")
        current_country = country
        max_variation = variation
        max_year = year
print("\n".join(buffer))


Overwriting pr0404/ejercicio3/reducer.py


In [61]:
!cat pr0404/countries_gdp_hist.csv | python3 pr0404/ejercicio3/mapper.py | sort | python3 pr0404/ejercicio3/reducer.py

AFGHANISTAN	2002
ALBANIA	1995
ALGERIA	1963
AMERICAN SAMOA	2020
ANDORRA	2022
ANGOLA	2005
ANTIGUA AND BARBUDA	2006
ARGENTINA	1965
ARMENIA	2003
ARUBA	2021
AUSTRALIA	1970
AUSTRIA	1970
AZERBAIJAN	2006
BAHAMAS	1979
BAHRAIN	1976
BANGLADESH	1964
BARBADOS	2022
BELARUS	2004
BELGIUM	1964
BELIZE	2021
BENIN	1981
BERMUDA	1966
BHUTAN	1987
BOLIVIA (PLURINATIONAL STATE OF)	1968
BOSNIA AND HERZEGOVINA	1996
BOTSWANA	1972
BRAZIL	1973
BRUNEI DARUSSALAM	1979
BULGARIA	1988
BURKINA FASO	1996
BURUNDI	1970
CABO VERDE	1994
CAMBODIA	1987
CAMEROON	1978
CANADA	1962
CAYMAN ISLANDS	2022
CENTRAL AFRICAN REPUBLIC	1984
CHAD	2004
CHILE	2021
CHINA	1970
COLOMBIA	2021
COMOROS	2000
CONGO	1982
CONGO, DEMOCRATIC REPUBLIC OF THE	1962
COSTA RICA	1992
CROATIA	2021
CUBA	1981
CURAÇAO	2022
CYPRUS	1976
CZECHIA	2006
CÔTE D'IVOIRE	1964
DENMARK	1964
DJIBOUTI	2015
DOMINICA	1980
DOMINICAN REPUBLIC	1970
ECUADOR	1973
EGYPT	1976
EL SALVADOR	2021
EQUATORIAL GUINEA	1997
ERITREA	1994
ESTONIA	1997
ESWATINI	1990
ETHIOPIA	1987
FAROE ISLANDS	2012
F

In [62]:
!hdfs dfs -rm /processed/country_max/*
!hdfs dfs -rmdir /processed/country_max/
!hadoop jar \
/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
-file pr0404/ejercicio3/mapper.py \
-mapper pr0404/ejercicio3/mapper.py \
-file pr0404/ejercicio3/reducer.py \
-reducer pr0404/ejercicio3/reducer.py \
-input /gdp/countries_gdp_hist.csv \
-output /processed/country_max
!hdfs dfs -cat /processed/country_max/part-00000

rm: `/processed/country_max/*': No such file or directory
rmdir: `/processed/country_max/': No such file or directory
2025-12-18 09:51:48,752 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [pr0404/ejercicio3/mapper.py, pr0404/ejercicio3/reducer.py, /tmp/hadoop-unjar6743312656658264312/] [] /tmp/streamjob4924808858893607978.jar tmpDir=null
2025-12-18 09:51:49,887 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-12-18 09:51:50,062 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at yarnmanager/172.19.0.5:8032
2025-12-18 09:51:50,398 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1766048350482_0005
2025-12-18 09:51:51,009 INFO mapred.FileInputFormat: Total input files to process : 1
2025-12-18 09:51:51,161 INFO mapreduce.JobSubmitter: number of splits:2
2025-12-18 09: