<a href="https://colab.research.google.com/github/harnalashok/hadoop/blob/main/hadoop_spark_install_on_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 30th March, 2021
# Myfolder: github/hadoop
# Objective:
#            i)  Install hadoop on colab
#                (current version is 3.3.5)
#            ii) Experiments with hadoop
#           iii) Install spark on colab
#            iv) Access hadoop file from spark
#             v) Install koalas on colab
#
#
# Java 8 install: https://stackoverflow.com/a/58191107
# Hadoop install: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
# Spark install:  https://stackoverflow.com/a/64183749
#                 https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/

## Install hadoop
If the installation seems to hang, it is most likely waiting for your input on whether to overwrite existing ssh keys.

### Define functions
No downloads. Just function definitions

In [1]:
# 1.0 How to set environment variable
import os  
import time  

#### ssh_install()

In [2]:
# 2.0 Function to install ssh client and sshd (Server)
def ssh_install():
  print("\n--1. Download and install ssh server----\n")
  ! sudo apt-get remove openssh-client openssh-server
  ! sudo apt install openssh-client openssh-server
  
  print("\n--2. Restart ssh server----\n")
  ! service ssh restart

#### Java install

In [5]:
# 3.0 Function to download and install java 8
def install_java():
  ! rm -rf /usr/java

  print("\n--Download and install Java 8----\n")
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null        # install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     # set environment variable

  !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  !update-alternatives --set javac /usr/lib/jvm/java-8-openjdk-amd64/bin/javac
  
  !mkdir -p /usr/java
  ! ln -s "/usr/lib/jvm/java-8-openjdk-amd64"  "/usr/java"
  ! mv "/usr/java/java-8-openjdk-amd64"  "/usr/java/latest"
  
  !java -version       #check java version
  !javac -version

#### hadoop install

In [14]:
# 4.0 Function to download and install hadoop
def hadoop_install():
  print("\n--5. Download hadoop tar.gz----\n")
  ! wget -c https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz

  print("\n--6. Transfer downloaded content and unzip tar.gz----\n")
  !  mv /content/hadoop*   /opt/
  ! tar -xzf /opt/hadoop-3.3.5.tar.gz  --directory /opt/

  print("\n--7. Create hadoop folder----\n")
  ! rm -r /app/hadoop/tmp
  ! mkdir  -p   /app/hadoop/tmp
  
  print("\n--8. Check folder for files----\n")
  ! ls -la /opt

#### hadoop config

In [7]:
# 5.0 Function for setting hadoop configuration
def _show_file(path):
  """Print the text of `path`, or a short note when the file does not exist."""
  if os.path.isfile(path):
    with open(path, encoding="utf-8", errors="replace") as handle:
      print(handle.read())
  else:
    print("(file not found: {})".format(path))


def _render_site_xml(properties):
  """Return the text of a Hadoop *-site.xml file.

  `properties` is an iterable of (name, value, description) triples;
  `description` may be None.  Values are written verbatim (no XML escaping),
  which is sufficient for the plain values used in this notebook.
  """
  lines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>',
    '<configuration>',
  ]
  for name, value, description in properties:
    lines.append('    <property>')
    lines.append('        <name>{}</name>'.format(name))
    lines.append('        <value>{}</value>'.format(value))
    if description:
      lines.append('        <description>{}</description>'.format(description))
    lines.append('    </property>')
  lines.append('</configuration>')
  return "\n".join(lines) + "\n"


def hadoop_config(conf_dir="/opt/hadoop-3.3.5/etc/hadoop"):
  """Write the single-node Hadoop configuration files into `conf_dir`.

  Overwrites core-site.xml, yarn-site.xml, mapred-site.xml and hdfs-site.xml,
  and appends JAVA_HOME plus the HDFS/YARN '*_USER=root' exports to
  hadoop-env.sh.  Each file is printed before and after it is changed.

  The files are written from Python rather than through chains of
  `!echo ... >>` lines, so the content is not subject to IPython's `{...}` /
  `$name` expansion (relevant for the literal `${HADOOP_HOME}` values), and
  exports already present in hadoop-env.sh are not appended a second time
  when the function is re-run.

  Args:
    conf_dir: directory holding the Hadoop configuration files.  Defaults to
      the location used by the rest of this notebook.

  Raises:
    OSError: if `conf_dir` does not exist or is not writable.
  """
  print("\n--Begin Configuring hadoop---\n")
  print("\n=============================\n")

  mapred_home = "HADOOP_MAPRED_HOME=${HADOOP_HOME}"   # literal text, not expanded here

  # (file name, banner before, banner while amending or None, banner after, properties)
  site_files = [
    ("core-site.xml",
     "\n--9. core-site.xml----\n",
     "\n--10. Amend core-site.xml----\n",
     "\n--11. Amended core-site.xml----\n",
     [("fs.defaultFS", "hdfs://localhost:9000", None),
      ("hadoop.tmp.dir", "/app/hadoop/tmp", "A base for other temporary directories."),
      # Safemode setting added following https://stackoverflow.com/a/33800253
      # NOTE(review): this looks like an HDFS key with a newer name
      # (dfs.namenode.safemode.threshold-pct); kept unchanged here, confirm.
      ("dfs.safemode.threshold.pct", "0", None)]),
    ("yarn-site.xml",
     "\n--12. yarn-site.xml----\n",
     None,
     "\n--13. Amended yarn-site.xml----\n",
     [("yarn.nodemanager.aux-services", "mapreduce_shuffle", None),
      ("yarn.nodemanager.vmem-check-enabled", "false", None)]),
    ("mapred-site.xml",
     "\n--14. mapred-site.xml----\n",
     "\n--15. Amend mapred-site.xml----\n",
     "\n--16, Amended mapred-site.xml----\n",
     [("mapreduce.framework.name", "yarn", None),
      ("yarn.app.mapreduce.am.env", mapred_home, None),
      ("mapreduce.map.env", mapred_home, None),
      ("mapreduce.reduce.env", mapred_home, None)]),
    ("hdfs-site.xml",
     "\n---17. hdfs-site.xml----\n",
     "\n---18. Amend hdfs-site.xml----\n",
     "\n---19. Amended hdfs-site.xml----\n",
     [("dfs.replication", "1", None),
      ("dfs.block.size", "16777216", "Block size")]),
  ]

  for file_name, before, amending, after, properties in site_files:
    path = os.path.join(conf_dir, file_name)
    print(before)
    _show_file(path)
    if amending is not None:
      print(amending)
    with open(path, "w", encoding="utf-8") as handle:
      handle.write(_render_site_xml(properties))
    print(after)
    _show_file(path)

  # hadoop-env.sh: see https://stackoverflow.com/a/53140448
  env_path = os.path.join(conf_dir, "hadoop-env.sh")
  print("\n---20. hadoop-env.sh----\n")
  _show_file(env_path)

  exports = [
    'export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"',
    'export HDFS_NAMENODE_USER="root"',
    'export HDFS_DATANODE_USER="root"',
    'export HDFS_SECONDARYNAMENODE_USER="root"',
    'export YARN_RESOURCEMANAGER_USER="root"',
    'export YARN_NODEMANAGER_USER="root"',
  ]
  existing = ""
  if os.path.isfile(env_path):
    with open(env_path, encoding="utf-8", errors="replace") as handle:
      existing = handle.read()
  already_there = set(existing.splitlines())
  missing = [line for line in exports if line not in already_there]
  if missing:
    with open(env_path, "a", encoding="utf-8") as handle:
      # Keep the first appended export on its own line.
      if existing and not existing.endswith("\n"):
        handle.write("\n")
      handle.write("\n".join(missing) + "\n")

  print("\n---21. Amended hadoop-env.sh----\n")
  _show_file(env_path)


#### ssh keys

In [8]:
# 6.0 Function to set up passphrase-less ssh keys
def set_keys():
  print("\n---22. Generate SSH keys----\n")
  ! cd ~ ; pwd 
  ! cd ~ ; ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
  ! cd ~ ; cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
  ! cd ~ ; chmod 0600 ~/.ssh/authorized_keys


#### Set environment

In [9]:
# 7.0 Function to set up environmental variables
def set_env():
  """Set the Java and Hadoop environment variables for this kernel.

  Sets JAVA_HOME, JRE_HOME, HADOOP_HOME and HADOOP_CONF_DIR, and adds the
  Hadoop native-library folder to LD_LIBRARY_PATH and the Hadoop bin/sbin
  folders to PATH.  A missing LD_LIBRARY_PATH or PATH no longer raises
  KeyError, and entries that are already present are not added again, so the
  function can be re-run safely.
  """
  print("\n---23. Set Environment variables----\n")
  # 'export' command does not work in colab
  # https://stackoverflow.com/a/57240319
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"
  os.environ["HADOOP_HOME"] = "/opt/hadoop-3.3.5"
  os.environ["HADOOP_CONF_DIR"] = "/opt/hadoop-3.3.5/etc/hadoop"

  def _append_entries(variable, entries):
    # Append each entry to the ':'-separated variable unless it is already listed.
    value = os.environ.get(variable, "")
    for entry in entries:
      if entry not in value.split(":"):
        value = value + ":" + entry if value else entry
    os.environ[variable] = value

  _append_entries("LD_LIBRARY_PATH", ["/opt/hadoop-3.3.5/lib/native"])
  _append_entries("PATH", ["/opt/hadoop-3.3.5/bin", "/opt/hadoop-3.3.5/sbin"])

#### Install all function

In [10]:
# 8.0 Function to call all functions
def install_hadoop():
  """Run every Hadoop setup step in order: ssh, Java, Hadoop download,
  Hadoop configuration, ssh keys and environment variables.
  """
  print("\n--Install java----\n")
  steps = (
    ssh_install,
    install_java,
    hadoop_install,
    hadoop_config,
    set_keys,
    set_env,
  )
  for step in steps:
    step()


### Begin install
Download, install and configure everything. This takes around 2 minutes.

In [15]:
# 9.0 Start installation
#     Runs all install/configure steps and prints the elapsed time in minutes.
#     (A stray leading 'y' on the first line used to raise NameError
#      on a fresh kernel; it has been removed.)
start = time.time()
install_hadoop()
end = time.time()
print("\n---Time taken----\n")
print((end- start)/60)


--Install java----


--1. Download and install ssh server----

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following packages were automatically installed and are no longer required:
  libboost-atomic-dev libboost-atomic1.71-dev libboost-atomic1.71.0
  libboost-chrono-dev libboost-chrono1.71-dev libboost-chrono1.71.0
  libboost-container-dev libboost-container1.71-dev libboost-container1.71.0
  libboost-context-dev libboost-context1.71-dev libboost-context1.71.0
  libboost-coroutine-dev libboost-coroutine1.71-dev libboost-coroutine1.71.0
  libboost-date-time-dev libboost-date-time1.71-dev libboost-date-time1.71.0
  libboost-exception-dev libboost-exception1.71-dev libboost-fiber-dev
  libboost-fiber1.71-dev libboost-fiber1.71.0 libboost-filesystem-dev
  libboost-filesystem1.71-dev libboost-filesystem1.71.0 libboost-graph-dev
  libboost-graph-parallel-dev libboost-graph-parallel1.71-dev
  libboost-graph-parallel1.71.0 libboost-gra

### Format hadoop

In [16]:
# 10.0 Format hadoop
print("\n---24. Format namenode----\n")
!hdfs namenode  -format


---24. Format namenode----

2023-03-27 05:59:11,305 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = d89213ccffe7/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.3.5
STARTUP_MSG:   classpath = /opt/hadoop-3.3.5/etc/hadoop:/opt/hadoop-3.3.5/share/hadoop/common/lib/token-provider-1.0.1.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/jetty-webapp-9.4.48.v20220622.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/jakarta.activation-api-1.2.1.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/netty-codec-stomp-4.1.77.Final.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/kerby-pkix-1.0.1.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/curator-framework-4.2.0.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/jackson-databind-2.12.7.1.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/httpcore-4.4.13.jar:/opt/hadoop-3.3.5/share/hadoop/common/lib/netty-resolver-dns-4.1.77.Final.jar:/opt/hadoo

## Start and test hadoop
If namenode is in safemode, use the command:   
`!hdfs dfsadmin -safemode leave`

#### Start hadoop
If start fails with 'Connection refused', run `ssh_install()` once again

In [17]:
# 11.0 Start HDFS with start-dfs.sh
#      (namenode, datanodes and secondary namenode).
#      If this fails, run ssh_install() (see the next cell)
#      and then run this cell again.

print("\n---25. Start namenode----\n")
! start-dfs.sh


---25. Start namenode----

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [d89213ccffe7]


In [None]:
#ssh_install()

#### Start yarn

In [18]:
# 11.1 Start YARN (resourcemanager and nodemanagers) with start-yarn.sh
! start-yarn.sh

Starting resourcemanager
Starting nodemanagers


If `start-dfs.sh` fails, issue the following three commands, one after another:<br>  
`! sudo apt-get remove openssh-client openssh-server`<br>
`! sudo apt-get install openssh-client openssh-server`<br>
`! service ssh restart`<br>

And then try to start hadoop again, as: `start-dfs.sh`

#### Test hadoop
If the namenode is in safe mode, leave safe mode with:<br>
`!hdfs dfsadmin -safemode leave`

In [19]:
# 11.1
print("\n---26. Make folders in hadoop----\n")
! hdfs dfs -mkdir /user
! hdfs dfs -mkdir /user/ashok


---26. Make folders in hadoop----



In [20]:
# 11.2 Run hadoop commands: list the HDFS root and /user
#      to confirm that the folders exist
! hdfs dfs -ls /
! hdfs dfs -ls /user

Found 1 items
drwxr-xr-x   - root supergroup          0 2023-03-27 06:00 /user
Found 1 items
drwxr-xr-x   - root supergroup          0 2023-03-27 06:00 /user/ashok


In [None]:
# 11.3 Stopping hadoop
#      Gives some errors
#      But hadoop stops
#!stop-dfs.sh

Run the `ssh_install()` again if hadoop fails to start with `start-dfs.sh` and then try to start hadoop again.

## Install spark

### Define functions

`findspark`: PySpark isn't on `sys.path` by default, but that doesn't mean it can't be used as a regular library. You can address this by either symlinking pyspark into your site-packages, or adding `pyspark` to `sys.path` at runtime. `findspark` does the latter.

In [None]:
# 1.0 Function to download and unzip spark
def spark_koalas_install():
  print("\n--1.1 Install findspark----\n")
  !pip install -q findspark

  print("\n--1.2 Install databricks Koalas----\n")
  !pip install koalas

  print("\n--1.3 Download Apache tar.gz----\n")
  ! wget -c https://mirrors.estointernet.in/apache/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz

  print("\n--1.4 Transfer downloaded content and unzip tar.gz----\n")
  !  mv /content/spark*   /opt/
  ! tar -xzf /opt/spark-3.1.1-bin-hadoop3.2.tgz  --directory /opt/

  print("\n--1.5 Check folder for files----\n")
  ! ls -la /opt


In [None]:
# 1.1 Function to set environment
def set_spark_env():
  """Set the Java and Spark environment variables for this kernel and print
  the resulting PATH and LD_LIBRARY_PATH.

  A missing LD_LIBRARY_PATH or PATH no longer raises KeyError, and entries
  that are already present are not added again, so the function can be
  re-run safely.
  """
  print("\n---2. Set Environment variables----\n")
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"
  os.environ["SPARK_HOME"] = "/opt/spark-3.1.1-bin-hadoop3.2"

  def _append_entries(variable, entries):
    # Append each entry to the ':'-separated variable unless it is already listed.
    value = os.environ.get(variable, "")
    for entry in entries:
      if entry not in value.split(":"):
        value = value + ":" + entry if value else entry
    os.environ[variable] = value

  _append_entries("LD_LIBRARY_PATH", ["/opt/spark-3.1.1-bin-hadoop3.2/lib/native"])
  _append_entries("PATH", ["/opt/spark-3.1.1-bin-hadoop3.2/bin",
                           "/opt/spark-3.1.1-bin-hadoop3.2/sbin"])

  print("\n---2.1. Check Environment variables----\n")
  # Check
  print(os.environ["PATH"])
  print(os.environ["LD_LIBRARY_PATH"])

In [None]:
# 1.2 Function to configure spark 
def spark_conf():
  print("\n---3. Configure spark to access hadoop----\n")
  !mv /opt/spark-3.1.1-bin-hadoop3.2/conf/spark-env.sh.template  /opt/spark-3.1.1-bin-hadoop3.2/conf/spark-env.sh
  !echo "HADOOP_CONF_DIR=/opt/hadoop-3.3.5/etc/hadoop/" >> /opt/spark-3.1.1-bin-hadoop3.2/conf/spark-env.sh
  print("\n---3.1 Check ----\n")
  #!cat /opt/spark-3.1.1-bin-hadoop3.2/conf/spark-env.sh

### Install spark

In [None]:
# 2.0 Call all the three functions
def install_spark():
  """Run the three Spark setup steps in order: install, environment, configuration."""
  for step in (spark_koalas_install, set_spark_env, spark_conf):
    step()


In [None]:
# 2.1 Run the Spark/Koalas install, environment setup and configuration
install_spark()


--1.1 Install findspark----


--1.2 Install databricks Koalas----

Collecting koalas
[?25l  Downloading https://files.pythonhosted.org/packages/40/de/87c016a3e5055251ed117c86eb3b0de2381518c7acae54e115711ff30ceb/koalas-1.7.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.6MB/s 
Installing collected packages: koalas
Successfully installed koalas-1.7.0

--1.3 Download Apache tar.gz----

--2021-03-30 11:29:04--  https://mirrors.estointernet.in/apache/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
Resolving mirrors.estointernet.in (mirrors.estointernet.in)... 43.255.166.254, 2403:8940:3:1::f
Connecting to mirrors.estointernet.in (mirrors.estointernet.in)|43.255.166.254|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228721937 (218M) [application/octet-stream]
Saving to: ‘spark-3.1.1-bin-hadoop3.2.tgz’


2021-03-30 11:29:27 (9.91 MB/s) - ‘spark-3.1.1-bin-hadoop3.2.tgz’ saved [228721937/228721937]


--1.4 Transfer downloaded content an

## Test spark
Hadoop should have been started

Call some libraries

In [None]:
# 3.0 Just call some libraries to test
import pandas as pd
import numpy as np

# 3.1 Get spark in sys.path
import findspark
findspark.init()

# 3.2 Call other spark libraries
#     Just to test
from pyspark.sql import SparkSession
import databricks.koalas as ks
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression



In [None]:
# 3.1 Build spark session
#     (master is local[*]; an already running session is reused)
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)


In [None]:
# 4.0 Pandas DataFrame: 8 rows, two string columns (x1, x2),
#     two numeric columns (x3, x4) and two label columns (y1, y2)
pdf = pd.DataFrame(
    dict(
        x1=['a', 'a', 'b', 'b', 'b', 'c', 'd', 'd'],
        x2=['apple', 'orange', 'orange', 'orange', 'peach', 'peach', 'apple', 'orange'],
        x3=[1, 1, 2, 2, 2, 4, 1, 2],
        x4=[2.4, 2.5, 3.5, 1.4, 2.1, 1.5, 3.0, 2.0],
        y1=[1, 0, 1, 0, 0, 1, 1, 0],
        y2=['yes', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes'],
    )
)

# 4.1
pdf

In [None]:
# 4.2 Transform the pandas DataFrame `pdf` to a Spark DataFrame
#     and display it with show()
df = spark.createDataFrame(pdf)
df.show()

In [None]:
# 4.3 Create a csv file 
#     and tranfer it to hdfs
!echo "a,b,c,d"   > /content/airports.csv
!echo "5,4,6,7"   >> /content/airports.csv
!echo "2,3,4,5"   >> /content/airports.csv
!echo "8,9,0,1"   >> /content/airports.csv
!echo "2,3,4,1"   >> /content/airports.csv
!echo "1,2,2,1"   >> /content/airports.csv
!echo "0,1,2,6"   >> /content/airports.csv
!echo "9,3,1,8"   >> /content/airports.csv
!ls -la /content

# 4.4
!hdfs dfs -rm -f /user/ashok/airports.csv
!hdfs dfs -put /content/airports.csv  /user/ashok/
!hdfs dfs -ls /user/ashok

In [None]:
# 5.0 Read file directly from hadoop
#     NOTE(review): the path has no scheme; presumably it resolves to hdfs
#     because HADOOP_CONF_DIR points at the Hadoop configuration - confirm.
airports_df = spark.read.csv("/user/ashok/airports.csv", inferSchema=True, header=True)

# 5.1 Show file
airports_df.show()

## Test Koalas
Hadoop should have been started

Create a koalas dataframe

In [None]:
# 6.0 Koalas DataFrame with three columns and an explicit index
# If namenode is in safemode, first use:
# hdfs dfsadmin -safemode leave
kdf = ks.DataFrame(
    dict(
        a=[1, 2, 3, 4, 5, 6],
        b=[100, 200, 300, 400, 500, 600],
        c=["one", "two", "three", "four", "five", "six"],
    ),
    index=[10, 20, 30, 40, 50, 60],
)

# 6.1 And show
kdf

In [None]:
# 6.2 Pandas DataFrame
#     NOTE: this rebinds `pdf`, which an earlier cell used for a different frame
pdf = pd.DataFrame({'x':range(3), 'y':['a','b','b'], 'z':['a','b','b']})

# 6.2.1 Transform to koalas DataFrame
#       NOTE: this rebinds `df`, which earlier held a Spark DataFrame
df = ks.from_pandas(pdf)

In [None]:
# 6.3 Rename koalas dataframe columns (third column 'z' becomes 'z1')
df.columns = ['x', 'y', 'z1']

# 6.4 Add a column 'x2' holding the square of 'x'; this modifies `df` in place,
#     so re-run the cell that creates `df` before re-running this one
df['x2'] = df.x * df.x

# 6.6 Finally show koalas df
df


In [None]:
# 6.7 Read the csv file uploaded to hadoop earlier (/user/ashok/airports.csv),
#     create a koalas df from it and show up to 10 rows
ks.read_csv("/user/ashok/airports.csv").head(10)

In [None]:
###################