# Hadoop HDFS Basics

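This notebook demonstrates basic HDFS operations using PySpark: writing a DataFrame to HDFS, reading it back, and listing and creating directories through the Hadoop FileSystem API exposed by Spark's JVM gateway.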


In [None]:
# Import required libraries
# findspark.init() has to run before the pyspark imports below: it locates the
# Spark installation and makes the pyspark package importable.
import findspark
findspark.init()

from pyspark.sql import SparkSession
# NOTE(review): SparkContext, SparkConf, pandas and numpy are not referenced by
# any cell visible in this notebook; confirm they are still needed.
from pyspark import SparkContext, SparkConf
import pandas as pd
import numpy as np

print("Libraries imported successfully!")

In [None]:
# Build (or reuse) a Hive-enabled Spark session whose default file system
# points at the HDFS namenode.
spark = (
    SparkSession.builder
    .appName("HDFS-Basics")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext

# Report what the session is actually connected to.
print(f"Spark Version: {spark.version}")
print(f"Spark Master: {sc.master}")
default_fs = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
print(f"Default File System: {default_fs}")

In [None]:
# Build a small in-memory DataFrame to use as sample data.
# Nothing here touches HDFS yet; the write step that follows is the first
# operation that actually exercises the HDFS connection.

# Namespaced import instead of `from pyspark.sql.functions import *`: the star
# import is not used by any cell and silently shadows Python builtins such as
# sum, min, max, abs and round for the rest of the notebook.
from pyspark.sql import functions as F

# Create a simple DataFrame
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

print("Sample DataFrame created:")
df.show()

In [None]:
# Persist the sample DataFrame to HDFS as CSV with a header row,
# replacing any output left at the same location by a previous run.
hdfs_path = "hdfs://namenode:9000/user/data/sample_users"

csv_writer = df.write.mode("overwrite").option("header", "true")
csv_writer.csv(hdfs_path)

print(f"Data written to HDFS: {hdfs_path}")

In [None]:
# Read data back from HDFS
# header=true uses the first CSV line as column names; inferSchema=true makes
# Spark scan the data to pick column types instead of reading everything as
# strings.
df_read = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(hdfs_path)

print("Data read from HDFS:")
df_read.show()

# printSchema() writes the schema tree to stdout itself and returns None, so it
# must not be interpolated into an f-string (that printed "Schema: None").
print("Schema:")
df_read.printSchema()

In [None]:
# Reach the Hadoop FileSystem API through Spark's JVM gateway.
# `hadoop_fs` is the FileSystem for the session's Hadoop configuration and
# `hadoop_path` is the Java Path class used to build path arguments.
hadoop_fs_pkg = sc._jvm.org.apache.hadoop.fs
hadoop_fs = hadoop_fs_pkg.FileSystem.get(sc._jsc.hadoopConfiguration())
hadoop_path = hadoop_fs_pkg.Path

# Show every entry directly under the HDFS root, tagged as directory or file.
print("HDFS Root Directory Contents:")
for entry in hadoop_fs.listStatus(hadoop_path("/")):
    entry_location = entry.getPath().toString()
    kind_tag = "[DIR]" if entry.isDirectory() else "[FILE]"
    print(f"{kind_tag} {entry_location} ({entry.getLen()} bytes)")

In [None]:
# Create directories in HDFS
# Each directory is created only if it is missing, so the cell can be re-run
# safely.
test_dir = "/user/test"
data_dir = "/user/data"
logs_dir = "/user/logs"

for directory in [test_dir, data_dir, logs_dir]:
    # Distinct name so the string `path` from the listing cell is not reused
    # for a Java Path object.
    dir_path = hadoop_path(directory)
    if hadoop_fs.exists(dir_path):
        print(f"Directory already exists: {directory}")
    elif hadoop_fs.mkdirs(dir_path):
        # mkdirs() returns a boolean; only report success when it says so.
        print(f"Created directory: {directory}")
    else:
        print(f"Failed to create directory: {directory}")

In [None]:
# Clean up
# Stopping the session also shuts down the SparkContext behind `sc`, so the
# session-creation cell must be re-run before any earlier cell is executed again.
spark.stop()
print("Spark session stopped successfully!")