## 1) Install Hadoop

In [10]:
!apt-get update -qq
!apt-get install -y openjdk-11-jdk-headless

# Download and extract Hadoop 3.3.6
!wget -q https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
!tar -xzf hadoop-3.3.6.tar.gz
!mv hadoop-3.3.6 /usr/local/hadoop

# Set environment variables for this session
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"
os.environ["PATH"] += ":/usr/local/hadoop/bin:/usr/local/hadoop/sbin"

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-11-jdk-headless is already the newest version (11.0.28+6-1ubuntu1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 43 not upgraded.
mv: cannot move 'hadoop-3.3.6' to '/usr/local/hadoop/hadoop-3.3.6': Directory not empty


## 2) Configure HDFS (single-node), format & start

In [11]:
# === Format NameNode and start HDFS (NameNode + DataNode) ===
import os, pathlib, subprocess

HADOOP_HOME = "/usr/local/hadoop"
JAVA_HOME = "/usr/lib/jvm/java-11-openjdk-amd64"

# Ensure env vars are available
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["HADOOP_HOME"] = HADOOP_HOME
os.environ["PATH"] = f"{HADOOP_HOME}/bin:{HADOOP_HOME}/sbin:" + os.environ["PATH"]

# Create HDFS directories for NameNode and DataNode
nn_dir = "/content/hdfs/namenode"
dn_dir = "/content/hdfs/datanode"
pathlib.Path(nn_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(dn_dir).mkdir(parents=True, exist_ok=True)

# Append JAVA_HOME to hadoop-env.sh
with open(f"{HADOOP_HOME}/etc/hadoop/hadoop-env.sh", "a") as f:
    f.write(f"\nexport JAVA_HOME={JAVA_HOME}\n")

# Write minimal configs
core_site = """\
<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/content/hdfs/tmp</value>
  </property>
</configuration>
"""
with open(f"{HADOOP_HOME}/etc/hadoop/core-site.xml", "w") as f:
    f.write(core_site)

hdfs_site = f"""\
<?xml version="1.0"?>
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:{nn_dir}</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:{dn_dir}</value>
  </property>
  <property>
    <name>dfs.namenode.http-address</name>
    <value>0.0.0.0:9870</value>
  </property>
</configuration>
"""
with open(f"{HADOOP_HOME}/etc/hadoop/hdfs-site.xml", "w") as f:
    f.write(hdfs_site)

# Format the NameNode (safe to rerun only on fresh setup)
print("Formatting NameNode…")
!$HADOOP_HOME/bin/hdfs namenode -format -nonInteractive

# Start NameNode and DataNode
print("Starting NameNode and DataNode…")
!$HADOOP_HOME/bin/hdfs --daemon start namenode
!$HADOOP_HOME/bin/hdfs --daemon start datanode

# Show running Java processes
print("\nJava processes (via jps):")
!jps || true

# Create base HDFS dirs
!$HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/root || true
!$HADOOP_HOME/bin/hdfs dfs -mkdir -p /data || true

print("\nCheck HDFS root directory:")
!$HADOOP_HOME/bin/hdfs dfs -ls /

Formatting NameNode…
2025-09-19 12:52:39,521 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = 9c5f6c881f56/172.28.0.12
STARTUP_MSG:   args = [-format, -nonInteractive]
STARTUP_MSG:   version = 3.3.6
STARTUP_MSG:   classpath = /usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/jetty-servlet-9.4.51.v20230217.jar:/usr/local/hadoop/share/hadoop/common/lib/netty-transport-classes-epoll-4.1.89.Final.jar:/usr/local/hadoop/share/hadoop/common/lib/nimbus-jose-jwt-9.8.1.jar:/usr/local/hadoop/share/hadoop/common/lib/gson-2.9.0.jar:/usr/local/hadoop/share/hadoop/common/lib/kerby-asn1-1.0.1.jar:/usr/local/hadoop/share/hadoop/common/lib/netty-resolver-4.1.89.Final.jar:/usr/local/hadoop/share/hadoop/common/lib/kerb-simplekdc-1.0.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jetty-util-ajax-9.4.51.v20230217.jar:/usr/local/hadoop/share/hadoop/common/lib/stax2-api-4.2.1.jar:/usr/

## 3) Start HDFS and prep dirs

In [12]:
HADOOP_HOME = "/usr/local/hadoop"

# Start daemons
!$HADOOP_HOME/bin/hdfs --daemon start namenode
!$HADOOP_HOME/bin/hdfs --daemon start datanode

# Show Java processes
!jps || true

# Prep common HDFS dirs
!$HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/root || true
!$HADOOP_HOME/bin/hdfs dfs -mkdir -p /data || true

namenode is running as process 11796.  Stop it first and ensure /tmp/hadoop-root-namenode.pid file is empty before retry.
datanode is running as process 11858.  Stop it first and ensure /tmp/hadoop-root-datanode.pid file is empty before retry.
12528 Jps
11858 DataNode
11796 NameNode


## 4) Install KaggleHub & load dataset

In [13]:
!pip -q install kagglehub[pandas-datasets]

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Dataset slug and file name (adjust the file name if needed)
DATASET = "maharshipandya/-spotify-tracks-dataset"
DATAFILE = "dataset.csv"

# Load directly to pandas
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    DATASET,
    DATAFILE
)

# Save locally for HDFS upload
LOCAL_CSV = "/content/spotify.csv"
df.to_csv(LOCAL_CSV, index=False)

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the '-spotify-tracks-dataset' dataset.


## 5) Upload to HDFS

In [14]:
HADOOP_HOME = "/usr/local/hadoop"
HDFS_TARGET_DIR = "/data/spotify"

!$HADOOP_HOME/bin/hdfs dfs -mkdir -p $HDFS_TARGET_DIR
!$HADOOP_HOME/bin/hdfs dfs -put -f /content/spotify.csv $HDFS_TARGET_DIR
!$HADOOP_HOME/bin/hdfs dfs -ls -h $HDFS_TARGET_DIR

Found 1 items
-rw-r--r--   1 root supergroup     19.2 M 2025-09-19 12:59 /data/spotify/spotify.csv


## 6) Verify HDFS upload + PySpark sanity check

In [17]:
# Inspect the file in HDFS
HADOOP_HOME = "/usr/local/hadoop"
HDFS_PATH = "hdfs://localhost:9000/data/spotify/spotify.csv"

print("Listing HDFS target directory:")
!$HADOOP_HOME/bin/hdfs dfs -ls -h /data/spotify || true

print("\nFirst few lines of the CSV in HDFS:")
!$HADOOP_HOME/bin/hdfs dfs -cat /data/spotify/spotify.csv | head -n 5 || true

# Install PySpark
!pip -q install pyspark

# Start Spark and point it at HDFS
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

spark = SparkSession.builder \
    .appName("SpotifyHDFSCheck") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .getOrCreate()

# Read the CSV from HDFS
df = spark.read.option("header", "true").option("inferSchema", "true").csv(HDFS_PATH)

print("\n=== DataFrame Schema ===")
df.printSchema()

print("\n=== Row Count ===")
print(df.count())

print("\n=== Sample Rows ===")
df.show(5, truncate=False)

Listing HDFS target directory:
Found 1 items
-rw-r--r--   1 root supergroup     19.2 M 2025-09-19 12:59 /data/spotify/spotify.csv

First few lines of the CSV in HDFS:
Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1.01e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,5.56e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Soundtrack),Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18

## 7) Write a persistent Parquet copy of the dataset to Drive

In [19]:
# Persist the HDFS-hosted CSV as Parquet on Google Drive so the data outlives
# the Colab runtime.
from google.colab import drive
# Imported here so this cell does not depend on an earlier cell's import.
from pyspark.sql import SparkSession

drive.mount('/content/drive')

# getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder \
    .appName("PersistToDrive") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .getOrCreate()

# Write Parquet to Drive (persistent). The explicit file:// scheme is required
# because the session's default filesystem is HDFS.
BRONZE = "file:///content/drive/MyDrive/data/spotify/bronze_parquet"

try:
    sdf = spark.read.option("header", "true").option("inferSchema", "true") \
        .csv("hdfs://localhost:9000/data/spotify/spotify.csv")
    sdf.write.mode("overwrite").parquet(BRONZE)
finally:
    # Always release the session, even if the read or write fails.
    spark.stop()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
