From 420fc498284045d0683f94cf56223512d33c0c74 Mon Sep 17 00:00:00 2001
From: "D. Kasi Pavan Kumar" <44864604+kasipavankumar@users.noreply.github.com>
Date: Tue, 3 Aug 2021 03:35:14 +0000
Subject: [PATCH] v1.0.0

---
 .github/workflows/deploy.yml | 49 ++++++++++++++++++++++++++++
 Dockerfile                   | 62 ++++++++++++++++++++++++++++++++++++
 LICENSE                      |  2 +-
 README.md                    | 43 +++++++++++++++++++++++--
 bootstrap.sh                 | 25 +++++++++++++++
 etc/core-site.xml            | 10 ++++++
 etc/hdfs-site.xml            | 14 ++++++++
 etc/mapred-site.xml          |  6 ++++
 etc/yarn-site.xml            | 22 +++++++++++++
 9 files changed, 230 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/deploy.yml
 create mode 100644 Dockerfile
 create mode 100644 bootstrap.sh
 create mode 100644 etc/core-site.xml
 create mode 100644 etc/hdfs-site.xml
 create mode 100644 etc/mapred-site.xml
 create mode 100644 etc/yarn-site.xml

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..78799e0
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,49 @@
+name: Deploy Docker image
+
+on:
+  release:
+    types: [published]
+
+  # Do not run when README gets updated
+  paths-ignore:
+    - '**/README.md'
+
+  # Can trigger action manually
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-deploy-image:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Log in to the container registry
+        uses: docker/login-action@v1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for Docker
+        id: meta
+        uses: docker/metadata-action@v3
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build & push Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..4db4622
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,62 @@
+# Ubuntu as the base image
+FROM ubuntu:20.04
+
+# Set the working directory to the filesystem root
+WORKDIR /
+
+# Install required dependencies
+RUN apt-get update && apt-get install -y \
+    openjdk-8-jdk \
+    openssh-server \
+    openssh-client \
+    wget \
+    nano \
+    && rm -rf /var/lib/apt/lists/*
+
+# Generate an SSH key pair for passwordless login
+RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa \
+    && cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys \
+    && chmod 0600 ~/.ssh/authorized_keys
+
+# Download Hadoop 3.3.1
+RUN wget https://mirrors.estointernet.in/apache/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz
+
+# Extract the .tar.gz
+RUN tar xzf hadoop-3.3.1.tar.gz
+
+# Remove the .tar.gz file
+RUN rm ./hadoop-3.3.1.tar.gz
+
+# Hadoop home
+ENV HADOOP_HOME=/hadoop-3.3.1
+
+# Other Hadoop environment variables
+ENV HADOOP_INSTALL=${HADOOP_HOME} \
+    HADOOP_MAPRED_HOME=${HADOOP_HOME} \
+    HADOOP_COMMON_HOME=${HADOOP_HOME} \
+    HADOOP_HDFS_HOME=${HADOOP_HOME} \
+    YARN_HOME=${HADOOP_HOME} \
+    HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_HOME}/lib/native \
+    PATH=$PATH:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin \
+    HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native" \
+    # Java home
+    JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ \
+    # For start-all.sh
+    HDFS_NAMENODE_USER="root" \
+    HDFS_DATANODE_USER="root" \
+    HDFS_SECONDARYNAMENODE_USER="root" \
+    YARN_RESOURCEMANAGER_USER="root" \
+    YARN_NODEMANAGER_USER="root"
+
+# Dump environment variables since connecting
+# to localhost via SSH wipes them out
+RUN env | grep _ >> /etc/environment
+
+# Copy the Hadoop configuration files into ${HADOOP_HOME}/etc/hadoop
+COPY ./etc/* ${HADOOP_HOME}/etc/hadoop/
+
+# Copy bootstrap.sh
+COPY ./bootstrap.sh /
+
+CMD [ "bash", "./bootstrap.sh" ]
diff --git a/LICENSE b/LICENSE
index 261eeb9..dd61706 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2021 D. Kasi Pavan Kumar
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
index f9981e1..f71007f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,41 @@
-# hadoop-docker
-Apache Hadoop's Pseudo Distributed Mode using Docker. 🐳
+# Apache Hadoop using Docker 🐳
+
+A Docker image to play around with [Apache Hadoop](https://hadoop.apache.org) in [Pseudo-Distributed Mode](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html) (a single-node cluster).
+
+### Steps to play around with this image using [Play with Docker](https://labs.play-with-docker.com)
+
+1. First of all, create an account on [Docker Hub](https://hub.docker.com/signup).
+2. Log in to [Play with Docker](https://labs.play-with-docker.com) using the Docker Hub account you just created.
+3. You should see a green "Start" button; click it to start a session.
+4. Create an instance (a VM) by clicking "+ Add new instance" in the left pane.
+5. A new terminal should show up in the right pane. Here, we need to pull the Docker image from the _GitHub Container Registry (GHCR)_.
+To do so, execute:
+
+```bash
+docker pull ghcr.io/max-rocco/hadoop-docker:main
+```
+
+6. After the image has been pulled into the VM, we need to start a new container and switch into its terminal (bash).
+To do so, execute:
+
+```bash
+docker run -it ghcr.io/max-rocco/hadoop-docker:main
+```
+
+_At this stage, the container boots up and runs all the steps required to start Hadoop._
+
+**From now on, you will be inside the container's bash shell and can start using Hadoop's filesystem commands.** 🚀
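+
+For example (assuming the HDFS daemons came up cleanly; the directory and file names below are only illustrative), you can try a few filesystem commands:
+
+```bash
+# List the Java daemons that should be running (NameNode, DataNode, ResourceManager, ...)
+jps
+
+# Create a directory in HDFS, copy a local file into it, and list it
+hdfs dfs -mkdir -p /user/root
+hdfs dfs -put /etc/hosts /user/root/
+hdfs dfs -ls /user/root
+```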
+
+[![Deploy Docker image](https://github.com/max-rocco/hadoop-docker/actions/workflows/deploy.yml/badge.svg)](https://github.com/max-rocco/hadoop-docker/actions/workflows/deploy.yml)
+
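+If you would rather build the image yourself than pull it from GHCR, a minimal sketch looks like the following (the tag `hadoop-docker:local` is just an example):
+
+```bash
+# Build the image from the repository root, then start a container from it
+docker build -t hadoop-docker:local .
+docker run -it hadoop-docker:local
+```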
+
+```txt
+D. Kasi Pavan Kumar (c) 2021
+```
\ No newline at end of file
diff --git a/bootstrap.sh b/bootstrap.sh
new file mode 100644
index 0000000..c9777a2
--- /dev/null
+++ b/bootstrap.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# 1. Start the SSH server
+/etc/init.d/ssh start
+
+# 2. Connect to "localhost" via SSH
+# https://askubuntu.com/a/123080
+ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -t localhost << EOF
+
+  # 3. Source the environment variables dumped in /etc/environment
+  source /etc/environment
+
+  # 4. Set Hadoop's "sbin" & "bin" in PATH
+  export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
+
+EOF
+
+# 5. Format the NameNode
+hdfs namenode -format
+
+# 6. Start all Hadoop services
+$HADOOP_HOME/sbin/start-all.sh
+
+# 7. Leave the user with a shell
+/bin/bash
diff --git a/etc/core-site.xml b/etc/core-site.xml
new file mode 100644
index 0000000..0ce8fcd
--- /dev/null
+++ b/etc/core-site.xml
@@ -0,0 +1,10 @@
+<configuration>
+    <property>
+        <name>hadoop.tmp.dir</name>
+        <value>/home/tmpdata</value>
+    </property>
+    <property>
+        <name>fs.default.name</name>
+        <value>hdfs://127.0.0.1:9000</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/etc/hdfs-site.xml b/etc/hdfs-site.xml
new file mode 100644
index 0000000..0d508e4
--- /dev/null
+++ b/etc/hdfs-site.xml
@@ -0,0 +1,14 @@
+<configuration>
+    <property>
+        <name>dfs.name.dir</name>
+        <value>/home/dfsdata/namenode</value>
+    </property>
+    <property>
+        <name>dfs.data.dir</name>
+        <value>/home/dfsdata/datanode</value>
+    </property>
+    <property>
+        <name>dfs.replication</name>
+        <value>1</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/etc/mapred-site.xml b/etc/mapred-site.xml
new file mode 100644
index 0000000..c660d8b
--- /dev/null
+++ b/etc/mapred-site.xml
@@ -0,0 +1,6 @@
+<configuration>
+    <property>
+        <name>mapreduce.framework.name</name>
+        <value>yarn</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/etc/yarn-site.xml b/etc/yarn-site.xml
new file mode 100644
index 0000000..d1e45f6
--- /dev/null
+++ b/etc/yarn-site.xml
@@ -0,0 +1,22 @@
+<configuration>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
+        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>127.0.0.1</value>
+    </property>
+    <property>
+        <name>yarn.acl.enable</name>
+        <value>0</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.env-whitelist</name>
+        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
+    </property>
+</configuration>
\ No newline at end of file
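
With the configuration above in place, a quick way to confirm that HDFS and YARN are wired together is to run one of the bundled example jobs from inside the container. This is only a sketch: it assumes the daemons started by bootstrap.sh are healthy and that the examples jar sits at the standard Hadoop 3.3.1 location under $HADOOP_HOME.

```bash
# Check that the DataNode has registered with the NameNode
hdfs dfsadmin -report

# Submit the bundled "pi" estimator to YARN (mapreduce.framework.name=yarn)
yarn jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar pi 2 10
```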