## 1. Installing Hadoop


In [None]:
# Install a headless OpenJDK 8 JDK quietly; stdout is discarded to keep the cell output clean.
!apt-get -qq install openjdk-8-jdk-headless > /dev/null

In [None]:
# Download the pinned Hadoop 3.3.2 release tarball (about 600 MB).
# archive.apache.org keeps every past release, whereas downloads.apache.org
# only carries the current ones, so the archive URL keeps working after newer
# versions ship. -nc skips the download when the tarball is already present,
# which makes re-running this cell cheap and avoids a duplicate ".1" copy.
!wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-3.3.2/hadoop-3.3.2.tar.gz


--2022-05-07 21:10:35--  https://downloads.apache.org/hadoop/common/hadoop-3.3.2/hadoop-3.3.2.tar.gz
Resolving downloads.apache.org (downloads.apache.org)... 135.181.214.104, 88.99.95.219, 2a01:4f8:10a:201a::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|135.181.214.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 638660563 (609M) [application/x-gzip]
Saving to: ‘hadoop-3.3.2.tar.gz’


2022-05-07 21:11:11 (17.4 MB/s) - ‘hadoop-3.3.2.tar.gz’ saved [638660563/638660563]



In [None]:
# Unpack the gzip-compressed release tarball into the current directory
!tar --extract --gzip --file hadoop-3.3.2.tar.gz

In [None]:
# Move (not copy) the unpacked Hadoop tree into /usr/local
!mv --target-directory=/usr/local hadoop-3.3.2/

## 2. Setting up the Environment


In [None]:
# Resolve the /usr/bin/java symlink chain to the real binary and strip the
# first "bin/java" from that path to show where the default Java lives.
!readlink -f /usr/bin/java | sed 's|bin/java||'


/usr/lib/jvm/java-11-openjdk-amd64/


In [None]:
# Export the JDK and Hadoop install locations so that shell commands started
# from later cells inherit them.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "HADOOP_HOME": "/usr/local/hadoop-3.3.2/",
})

In [None]:
# Show the PATH that shell commands currently see, before the Hadoop bin directory is added.
!echo $PATH

/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin


In [None]:
#add hadoop bin to path
current_path = '/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin'
new_path = current_path+':/usr/local/hadoop-3.3.2/bin/'
os.environ["PATH"] = new_path
!echo $PATH

/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin:/usr/local/hadoop-3.3.2/bin/


## 3. Testing Hadoop Installation



In [None]:
# Smoke test: with no arguments the `hadoop` launcher prints its usage text,
# which confirms that it is found on PATH.
!hadoop

Usage: hadoop [OPTIONS] SUBCOMMAND [SUBCOMMAND OPTIONS]
 or    hadoop [OPTIONS] CLASSNAME [CLASSNAME OPTIONS]
  where CLASSNAME is a user-provided Java class

  OPTIONS is none or any of:

buildpaths                       attempt to add class files from build tree
--config dir                     Hadoop config directory
--debug                          turn on shell script debug mode
--help                           usage information
hostnames list[,of,host,names]   hosts to use in slave mode
hosts filename                   list of hosts to use in slave mode
loglevel level                   set the log4j level for this command
workers                          turn on worker mode

  SUBCOMMAND is one of:


    Admin Commands:

daemonlog     get/set the log level for each daemon

    Client Commands:

archive       create a Hadoop archive
checknative   check native Hadoop and compression libraries availability
classpath     prints the class path needed to get the Hadoop jar and the
    

## 4. Word Count with Hadoop


Creating mapper and reducer files 

In [None]:
%%writefile mapper.py
import sys

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print (word, 1)

Writing mapper.py


In [None]:
%%writefile reducer.py
#!/usr/bin/env python

from operator import itemgetter
import sys

current_word = None
current_count = 0
word = None

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    

    # parse the input we got from mapper.py
    word, count = line.split(' ')
    # convert count (currently a string) to int
    count = int(count)

    # this IF-switch only works because Hadoop sorts map output
    # by key (here: word) before it is passed to the reducer
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # write result to STDOUT
            print (current_word, current_count)
        current_count = count
        current_word = word

# do not forget to output the last word if needed!
if current_word == word:
    print (current_word, current_count)

Writing reducer.py


In [None]:
# Give the owner read/write/execute permission on both streaming scripts
!chmod u+rwx /content/mapper.py /content/reducer.py

In [None]:
# Fetch the text file that is later used as the word-count job input.
# NOTE(review): the argument has no scheme or host, so it does not look like a
# URL wget can fetch; restore the full source URL of word_count_hadoop.txt.
!wget -q /word_count_hadoop.txt

In [None]:
# Print the input text so the words being counted are visible in the notebook.
!cat /word_count_hadoop.txt

My name is harendra sai nath
I have opted for the data science course at the praxis business school
the curriculum of the course was very interesting
It covers all the aspects of a data scientist role such as 
python, sql, machine learning, statistics, data visualisation and some other topics such as mathematics, marketing research and other programming languages as well.
This course is so intensive and exhaustive in nature.


In [None]:
# Locate the Hadoop Streaming jar. Searching only the Hadoop install tree
# instead of / is much faster and avoids the "Invalid argument" errors that
# find reports while walking /proc.
!find /usr/local/hadoop-3.3.2 -name 'hadoop-streaming*.jar'

/usr/local/hadoop-3.3.2/share/hadoop/tools/lib/hadoop-streaming-3.3.2.jar
/usr/local/hadoop-3.3.2/share/hadoop/tools/sources/hadoop-streaming-3.3.2-test-sources.jar
/usr/local/hadoop-3.3.2/share/hadoop/tools/sources/hadoop-streaming-3.3.2-sources.jar
find: ‘/proc/26/task/26/net’: Invalid argument
find: ‘/proc/26/net’: Invalid argument


In [None]:
# remove output directories left over from a previous run;
# -f keeps rm quiet and successful when they do not exist yet (fresh runtime)
!rm -rf wc_out wc2_out

rm: cannot remove 'wc_out': No such file or directory
rm: cannot remove 'wc2_out': No such file or directory


In [None]:
# Run the word count as a Hadoop Streaming job: mapper.py and reducer.py are
# launched as external `python` processes, the input is /word_count_hadoop.txt
# and the results are written under /content/wc_out.
# NOTE(review): Hadoop normally refuses to write into an existing output
# directory; confirm the cleanup cell removes this same path before a re-run.
!hadoop jar /usr/local/hadoop-3.3.2/share/hadoop/tools/lib/hadoop-streaming-3.3.2.jar -input /word_count_hadoop.txt -output /content/wc_out  -mapper 'python mapper.py'  -reducer 'python reducer.py'

2022-05-07 21:17:07,283 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2022-05-07 21:17:07,413 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2022-05-07 21:17:07,413 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2022-05-07 21:17:07,433 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2022-05-07 21:17:07,671 INFO mapred.FileInputFormat: Total input files to process : 1
2022-05-07 21:17:07,691 INFO mapreduce.JobSubmitter: number of splits:1
2022-05-07 21:17:08,036 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1769816372_0001
2022-05-07 21:17:08,036 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-05-07 21:17:08,251 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2022-05-07 21:17:08,252 INFO mapreduce.Job: Running job: job_local1769816372_0001
2022-05-07 21:17:08,260 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2022-05

In [None]:
# check output directory: list the files the job wrote into wc_out
!ls wc_out

part-00000  _SUCCESS


In [None]:
# Preview the first lines of the job's output file (one word/count record per line).
!head wc_out/part-00000

I 1	
It 1	
My 1	
This 1	
a 1	
all 1	
and 3	
as 3	
aspects 1	
at 1	


In [None]:
# Rank words by count, highest first, with ties broken alphabetically.
# The default blank field separator is used instead of a tab-only -t so the
# count is recognised as field 2 whether a space or a tab separates it from
# the word. With a tab separator and space-delimited records field 2 is empty,
# and the numeric sort silently degrades to reverse-alphabetical order.
!sort -k2,2nr -k1,1 wc_out/part-00000 > sorted.txt

In [None]:
# Re-display the input text next to the sorted counts shown in the following cells.
!cat /word_count_hadoop.txt

My name is harendra sai nath
I have opted for the data science course at the praxis business school
the curriculum of the course was very interesting
It covers all the aspects of a data scientist role such as 
python, sql, machine learning, statistics, data visualisation and some other topics such as mathematics, marketing research and other programming languages as well.
This course is so intensive and exhaustive in nature.


In [None]:
# Show up to the first 60 lines of the sorted word counts.
!head -60 sorted.txt


well. 1	
was 1	
visualisation 1	
very 1	
topics 1	
This 1	
the 5	
such 2	
statistics, 1	
sql, 1	
some 1	
so 1	
scientist 1	
science 1	
school 1	
sai 1	
role 1	
research 1	
python, 1	
programming 1	
praxis 1	
other 2	
opted 1	
of 2	
nature. 1	
nath 1	
name 1	
My 1	
mathematics, 1	
marketing 1	
machine 1	
learning, 1	
languages 1	
It 1	
is 2	
interesting 1	
intensive 1	
in 1	
I 1	
have 1	
harendra 1	
for 1	
exhaustive 1	
data 3	
curriculum 1	
covers 1	
course 3	
business 1	
at 1	
aspects 1	
as 3	
and 3	
all 1	
a 1	


In [None]:
# Show the last 30 lines of the sorted word counts.
!tail -30 sorted.txt


nature. 1	
nath 1	
name 1	
My 1	
mathematics, 1	
marketing 1	
machine 1	
learning, 1	
languages 1	
It 1	
is 2	
interesting 1	
intensive 1	
in 1	
I 1	
have 1	
harendra 1	
for 1	
exhaustive 1	
data 3	
curriculum 1	
covers 1	
course 3	
business 1	
at 1	
aspects 1	
as 3	
and 3	
all 1	
a 1	
