<a href="https://colab.research.google.com/github/harnalashok/hadoop/blob/main/hadoop_install_on_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 29th March, 2021
# Myfolder: github/hadoop
# Objective:
#            i)  Install hadoop on colab
#                (current version is 3.2.2)
#            ii) Experiments with hadoop
#
#
# Java 8 install: https://stackoverflow.com/a/58191107
# Hadoop install: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
# Spark install:  https://stackoverflow.com/a/64183749

In [1]:
# 1.0 Import libraries: os (to set environment variables) and time (to time the installation)
import os  
import time    

In [2]:
# 2.0 Function to install ssh client and sshd (Server)
def ssh_install():
  print("\n--1. Download and install ssh server----\n")
  ! sudo apt-get remove openssh-client openssh-server
  ! sudo apt install openssh-client openssh-server
  
  print("\n--2. Restart ssh server----\n")
  ! service ssh restart

In [3]:
# 3.0 Function to download and install java 8
def install_java():
  """Install OpenJDK 8, make it the default java/javac and expose it as /usr/java/latest.

  Side effects visible in this function:
    * /usr/java is deleted and recreated,
    * the apt package openjdk-8-jdk-headless is installed (apt output is discarded),
    * JAVA_HOME is set in os.environ for this kernel and the shells it starts,
    * update-alternatives points java and javac at the Java 8 binaries.

  Takes no arguments and returns None.
  """
  # Start from a clean /usr/java so the ln/mv pair below also works on a re-run.
  ! rm -rf /usr/java

  print("\n--Download and install Java 8----\n")
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null        # install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     # set environment variable

  # Select Java 8 explicitly in case another JDK is registered as the default.
  !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  !update-alternatives --set javac /usr/lib/jvm/java-8-openjdk-amd64/bin/javac
  
  # Because /usr/java already exists as a directory, ln -s creates the link
  # inside it (/usr/java/java-8-openjdk-amd64); mv then renames that link to
  # /usr/java/latest.
  !mkdir -p /usr/java
  ! ln -s "/usr/lib/jvm/java-8-openjdk-amd64"  "/usr/java"
  ! mv "/usr/java/java-8-openjdk-amd64"  "/usr/java/latest"
  
  # Print the versions that are now active as a visual check.
  !java -version       #check java version
  !javac -version

In [4]:
# 4.0 Function to download and install hadoop
def hadoop_install():
  print("\n--5. Download hadoop tar.gz----\n")
  ! wget -c https://mirrors.estointernet.in/apache/hadoop/common/hadoop-3.2.2/hadoop-3.2.2.tar.gz

  print("\n--6. Transfer downloaded content and unzip tar.gz----\n")
  !  mv /content/hadoop*   /opt/
  ! tar -xzvf /opt/hadoop-3.2.2.tar.gz  --directory /opt/

  print("\n--7. Create hadoop folder----\n")
  ! rm -r /app/hadoop/tmp
  ! mkdir  -p   /app/hadoop/tmp
  
  print("\n--8. Check folder for files----\n")
  ! ls -la /opt

In [5]:
# 5.0 Function for setting hadoop configuration
# Folder holding the *-site.xml files and hadoop-env.sh of the unpacked Hadoop 3.2.2 tree.
_HADOOP_CONF_DIR = "/opt/hadoop-3.2.2/etc/hadoop"

# One entry per site file: (file name, (label before, label while amending or
# None, label after), ((property name, value, description or None), ...)).
# The labels are the progress messages printed around each file.
_SITE_FILES = (
    ("core-site.xml",
     ("\n--9. core-site.xml----\n",
      "\n--10. Amend core-site.xml----\n",
      "\n--11. Amended core-site.xml----\n"),
     (("fs.defaultFS", "hdfs://localhost:9000", None),
      ("hadoop.tmp.dir", "/app/hadoop/tmp", "A base for other temporary directories."))),
    ("yarn-site.xml",
     ("\n--12. yarn-site.xml----\n",
      None,
      "\n--13. Amended yarn-site.xml----\n"),
     (("yarn.nodemanager.aux-services", "mapreduce_shuffle", None),
      ("yarn.nodemanager.vmem-check-enabled", "false", None))),
    ("mapred-site.xml",
     ("\n--14. mapred-site.xml----\n",
      "\n--15. Amend mapred-site.xml----\n",
      "\n--16, Amended mapred-site.xml----\n"),
     (("mapreduce.framework.name", "yarn", None),
      # ${HADOOP_HOME} is written literally; it is not expanded here.
      ("yarn.app.mapreduce.am.env", "HADOOP_MAPRED_HOME=${HADOOP_HOME}", None),
      ("mapreduce.map.env", "HADOOP_MAPRED_HOME=${HADOOP_HOME}", None),
      ("mapreduce.reduce.env", "HADOOP_MAPRED_HOME=${HADOOP_HOME}", None))),
    ("hdfs-site.xml",
     ("\n---17. hdfs-site.xml----\n",
      "\n---18. Amend hdfs-site.xml----\n",
      "\n---19. Amended hdfs-site.xml----\n"),
     (("dfs.replication", "1", None),
      ("dfs.block.size", "16777216", "Block size"))),   # 16777216 bytes = 16 MiB
)

# Lines that must be present in hadoop-env.sh: the JDK location and the user
# each daemon is started as (see https://stackoverflow.com/a/53140448).
_HADOOP_ENV_EXPORTS = (
    'export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"',
    'export HDFS_NAMENODE_USER="root"',
    'export HDFS_DATANODE_USER="root"',
    'export HDFS_SECONDARYNAMENODE_USER="root"',
    'export YARN_RESOURCEMANAGER_USER="root"',
    'export YARN_NODEMANAGER_USER="root"',
)


def _site_xml(properties):
  """Return the text of a Hadoop site file holding the given (name, value, description) properties."""
  lines = [
      '<?xml version="1.0" encoding="UTF-8"?>',
      '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>',
      '<configuration>',
  ]
  for name, value, description in properties:
    lines.append('    <property>')
    lines.append('        <name>%s</name>' % name)
    lines.append('        <value>%s</value>' % value)
    if description:
      lines.append('        <description>%s</description>' % description)
    lines.append('    </property>')
  lines.append('</configuration>')
  return "\n".join(lines) + "\n"


def _show_file(path):
  """Print the contents of path, or a short note when the file does not exist."""
  if os.path.isfile(path):
    with open(path, encoding="utf-8") as handle:
      print(handle.read())
  else:
    print("(%s does not exist yet)" % path)


def _append_missing_lines(path, wanted):
  """Append to path every line of wanted that the file does not already contain."""
  existing = ""
  if os.path.isfile(path):
    with open(path, encoding="utf-8") as handle:
      existing = handle.read()
  present = set(line.strip() for line in existing.splitlines())
  missing = [line for line in wanted if line not in present]
  if not missing:
    return
  with open(path, "a", encoding="utf-8") as handle:
    # Keep the first appended line from being glued to an unterminated last line.
    if existing and not existing.endswith("\n"):
      handle.write("\n")
    handle.write("\n".join(missing) + "\n")


def hadoop_config(conf_dir=_HADOOP_CONF_DIR):
  """Write the single-node Hadoop configuration into conf_dir.

  core-site.xml, yarn-site.xml, mapred-site.xml and hdfs-site.xml are
  overwritten with the properties listed in _SITE_FILES; hadoop-env.sh gets
  the exports in _HADOOP_ENV_EXPORTS appended. Each file is printed before and
  after the change. Running the function again yields the same files: the
  site files are rewritten and exports already present are not added twice.

  Parameters
  ----------
  conf_dir : str
      Folder containing the Hadoop configuration files. Defaults to the
      etc/hadoop folder of the Hadoop 3.2.2 tree under /opt.

  Raises
  ------
  FileNotFoundError
      If conf_dir is not an existing directory.
  """
  if not os.path.isdir(conf_dir):
    raise FileNotFoundError(
        "Hadoop configuration folder not found: %s (run hadoop_install() first)" % conf_dir)

  print("\n--Begin Configuring hadoop---\n")
  print("\n=============================\n")

  for file_name, (before, amending, after), properties in _SITE_FILES:
    path = os.path.join(conf_dir, file_name)
    print(before)
    _show_file(path)
    if amending:
      print(amending)
    with open(path, "w", encoding="utf-8") as handle:
      handle.write(_site_xml(properties))
    print(after)
    _show_file(path)

  env_path = os.path.join(conf_dir, "hadoop-env.sh")
  print("\n---20. hadoop-env.sh----\n")
  _show_file(env_path)
  _append_missing_lines(env_path, _HADOOP_ENV_EXPORTS)

  print("\n---21. Amended hadoop-env.sh----\n")
  _show_file(env_path)


In [6]:
# 6.0 Function to set up passphrase-less ssh keys
def set_keys():
  print("\n---22. Generate SSH keys----\n")
  ! cd ~ ; pwd 
  ! cd ~ ; ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
  ! cd ~ ; cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
  ! cd ~ ; chmod 0600 ~/.ssh/authorized_keys


In [7]:
# 7.0 Function to set up environmental variables
def _extend_search_path(variable, *entries):
  """Append entries to the colon-separated environment variable, skipping ones already listed.

  An unset or empty variable is treated as an empty list, so the result never
  starts with a stray ':'.
  """
  current = os.environ.get(variable, "")
  parts = current.split(":") if current else []
  for entry in entries:
    if entry not in parts:
      parts.append(entry)
  os.environ[variable] = ":".join(parts)


def set_env():
  """Set the Java and Hadoop environment variables for this kernel.

  Sets JAVA_HOME, JRE_HOME, HADOOP_HOME and HADOOP_CONF_DIR, and adds the
  Hadoop native-library folder to LD_LIBRARY_PATH and the Hadoop bin/sbin
  folders to PATH. Works when LD_LIBRARY_PATH is not set beforehand, and
  calling it again does not add the same folders a second time.

  Takes no arguments and returns None.
  """
  print("\n---23. Set Environment variables----\n")
  # 'export' command does not work in colab
  # https://stackoverflow.com/a/57240319
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"
  os.environ["HADOOP_HOME"] = "/opt/hadoop-3.2.2"
  os.environ["HADOOP_CONF_DIR"] = "/opt/hadoop-3.2.2/etc/hadoop"
  _extend_search_path("LD_LIBRARY_PATH", "/opt/hadoop-3.2.2/lib/native")
  _extend_search_path("PATH", "/opt/hadoop-3.2.2/bin", "/opt/hadoop-3.2.2/sbin")

In [8]:
# 8.0 Function to call all functions
def install_hadoop():
  """Run the whole installation by calling each setup function once, in order.

  Order: ssh_install, install_java, hadoop_install, hadoop_config, set_keys,
  set_env. Takes no arguments and returns None.
  """
  print("\n--Install java----\n")
  steps = (
      ssh_install,
      install_java,
      hadoop_install,
      hadoop_config,
      set_keys,
      set_env,
  )
  for step in steps:
    step()


In [None]:
# 9.0 Run the complete installation and report the wall-clock time in minutes
start = time.time()
install_hadoop()
end = time.time()

elapsed_minutes = (end - start) / 60
print("\n---Time taken----\n")
print(elapsed_minutes)

In [10]:
# 10.0 Format hadoop
print("\n---24. Format namenode----\n")
!hdfs namenode  -format


---24. Format namenode----

2021-03-29 13:35:09,057 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = f28419ec4842/172.28.0.2
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.2.2
STARTUP_MSG:   classpath = /opt/hadoop-3.2.2/etc/hadoop:/opt/hadoop-3.2.2/share/hadoop/common/lib/curator-recipes-2.13.0.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/commons-io-2.5.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/httpclient-4.5.13.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/commons-net-3.6.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/jsp-api-2.1.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/commons-cli-1.2.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/kerb-admin-1.0.1.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/jackson-core-2.9.10.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/kerb-server-1.0.1.jar:/opt/hadoop-3.2.2/share/hadoop/common/lib/jackson-xc-1.9.13.jar:/opt/hadoop-3.2.2/

In [11]:
# 11.0 Start the HDFS daemons with start-dfs.sh. According to the output
#      recorded for this cell, it starts the namenode, the datanodes and the
#      secondary namenode.
#      If this fails, run ssh_install() (the next cell) and then run this
#      cell again.

print("\n---25. Start namenode----\n")
! start-dfs.sh


---25. Start namenode----

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [f28419ec4842]


In [None]:
# ssh_install()

If `start-dfs.sh` fails, issue the following three commands, one after another:<br>  
`! sudo apt-get remove openssh-client openssh-server`<br>
`! sudo apt-get install openssh-client openssh-server`<br>
`! service ssh restart`<br>

And then try to start hadoop again, as: `start-dfs.sh`

In [12]:
print("\n---26. Make folders in hadoop----\n")
! hdfs dfs -mkdir /user
! hdfs dfs -mkdir /user/ashok


---26. Make folders in hadoop----



In [13]:
# Run hadoop commands: list the HDFS root and /user to check that the
# folders made in step 26 (/user and /user/ashok) are there.
! hdfs dfs -ls /
! hdfs dfs -ls /user

Found 1 items
drwxr-xr-x   - root supergroup          0 2021-03-29 13:35 /user
Found 1 items
drwxr-xr-x   - root supergroup          0 2021-03-29 13:35 /user/ashok


In [None]:
#!stop-dfs.sh

If Hadoop fails to start with `start-dfs.sh`, run the three ssh commands listed above (or call `ssh_install()`), then try `start-dfs.sh` again.

In [None]:
################# I am done #############