In [1]:

# Import necessary modules
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rank, udf
from pyspark.sql.window import Window
from pyspark.sql.types import StringType


In [2]:
# Build the SparkSession shared by every cell below
# (getOrCreate() hands back an existing session when one is already running)
spark = (
    SparkSession.builder
    .appName("DataFrameExample")
    .getOrCreate()
)



In [3]:
# -------------------------------
# 🟢 BASIC EXAMPLES
# -------------------------------


In [4]:

# 1. Build a DataFrame from an in-memory Python list
# Every tuple becomes one row; the column names are passed explicitly
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])
df.show()


+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [7]:
# 2. Read a CSV file into a DataFrame
# Assumes 'currency.csv' exists at this relative path and has a header row;
# inferSchema=True asks Spark to detect column types from the data
df_csv = spark.read.csv("currency.csv", header=True, inferSchema=True)
df_csv.show()


+----+------+--------------------+
|Code|Symbol|                Name|
+----+------+--------------------+
| AED|   د.إ|United Arab Emira...|
| AFN|     ؋|      Afghan afghani|
| ALL|     L|        Albanian lek|
| AMD|   AMD|       Armenian dram|
| ANG|     ƒ|Netherlands Antil...|
| AOA|    Kz|      Angolan kwanza|
| ARS|     $|      Argentine peso|
| AUD|     $|   Australian dollar|
| AWG|  Afl.|       Aruban florin|
| AZN|   AZN|   Azerbaijani manat|
| BAM|    KM|Bosnia and Herzeg...|
| BBD|     $|    Barbadian dollar|
| BDT|    ৳ |    Bangladeshi taka|
| BGN|   лв.|       Bulgarian lev|
| BHD|  .د.ب|      Bahraini dinar|
| BIF|    Fr|     Burundian franc|
| BMD|     $|    Bermudian dollar|
| BND|     $|       Brunei dollar|
| BOB|   Bs.|  Bolivian boliviano|
| BRL|    R$|      Brazilian real|
+----+------+--------------------+
only showing top 20 rows



In [8]:
# 3. Select specific columns and filter rows
# Project the DataFrame down to just the 'Name' column
df.select(col("Name")).show()

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
+-------+



In [9]:
# Keep only the rows whose Age exceeds 30
df.filter(col("Age") > 30).show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 35|
+-------+---+



In [10]:
# 4. Derive a new column from an existing one
# AgePlusTen holds each person's age increased by 10
df = df.withColumn("AgePlusTen", df["Age"] + 10)
df.show()


+-------+---+----------+
|   Name|Age|AgePlusTen|
+-------+---+----------+
|  Alice| 25|        35|
|    Bob| 30|        40|
|Charlie| 35|        45|
+-------+---+----------+



# -------------------------------
# 🟡 INTERMEDIATE EXAMPLES
# -------------------------------


In [11]:
# 5. Group rows by a column and aggregate
# Computes the mean salary for each department
data_group = [
    ("Alice", "HR", 5000),
    ("Bob", "IT", 6000),
    ("Charlie", "HR", 5500),
]
df_group = spark.createDataFrame(data_group, schema=["Name", "Department", "Salary"])
df_group.groupBy("Department").avg("Salary").show()


+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|        HR|     5250.0|
|        IT|     6000.0|
+----------+-----------+



In [12]:
# 6. Join two DataFrames on a shared column
# Pairs each employee with the salary recorded for their department
data1 = [("Alice", "HR"), ("Bob", "IT")]
df1 = spark.createDataFrame(data1, schema=["Name", "Dept"])

data2 = [("HR", 5000), ("IT", 6000)]
df2 = spark.createDataFrame(data2, schema=["Dept", "Salary"])

joined = df1.join(df2, "Dept", "inner")
joined.show()

+----+-----+------+
|Dept| Name|Salary|
+----+-----+------+
|  HR|Alice|  5000|
|  IT|  Bob|  6000|
+----+-----+------+



In [13]:
# 7. Query a DataFrame with SQL
# Registers df as the temporary view "people", then selects from that view
df.createOrReplaceTempView("people")
spark.sql(
    "SELECT Name, Age FROM people WHERE Age > 30"
).show()


+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 35|
+-------+---+



In [14]:
# 8. Handle missing data
# Replace null Age values with 0
df_missing = spark.createDataFrame(
    [("Alice", None), ("Bob", 30)],
    schema=["Name", "Age"],
)
df_missing.na.fill(0, subset=["Age"]).show()


+-----+---+
| Name|Age|
+-----+---+
|Alice|  0|
|  Bob| 30|
+-----+---+



In [15]:
# Discard every row that contains at least one null value
df_missing.na.drop(how="any").show()


+----+---+
|Name|Age|
+----+---+
| Bob| 30|
+----+---+



In [None]:
# -------------------------------
# 🔵 ADVANCED EXAMPLES
# -------------------------------

In [16]:
# 9. Rank rows with a window function
# Within each department, the highest salary receives rank 1
data_window = [
    ("Alice", "HR", 5000),
    ("Bob", "HR", 5500),
    ("Charlie", "IT", 6000),
]
df_window = spark.createDataFrame(data_window, schema=["Name", "Dept", "Salary"])
windowSpec = (
    Window
    .partitionBy("Dept")
    .orderBy(col("Salary").desc())
)
df_window.withColumn("Rank", rank().over(windowSpec)).show()


+-------+----+------+----+
|   Name|Dept|Salary|Rank|
+-------+----+------+----+
|    Bob|  HR|  5500|   1|
|  Alice|  HR|  5000|   2|
|Charlie|  IT|  6000|   1|
+-------+----+------+----+



In [19]:
# 10. Create and use a User Defined Function (UDF)
# Categorizes people as 'Young' or 'Old' based on age
def age_category(age):
    """Classify an age as 'Young' (below 30) or 'Old' (30 and above).

    Args:
        age: Numeric age, or None when the value is missing.

    Returns:
        "Young" if age < 30, "Old" otherwise, or None when age is None.
    """
    # A missing age arrives as None, and `None < 30` raises TypeError in
    # Python 3, so pass the missing value through instead of failing.
    if age is None:
        return None
    return "Young" if age < 30 else "Old"

# Wrap the Python function as a Spark UDF that yields string values
age_udf = udf(age_category, returnType=StringType())
# Apply the UDF to the Age column and store the label in a new column
df = df.withColumn("Category", age_udf(df["Age"]))
df.show()

+-------+---+----------+--------+
|   Name|Age|AgePlusTen|Category|
+-------+---+----------+--------+
|  Alice| 25|        35|   Young|
|    Bob| 30|        40|     Old|
|Charlie| 35|        45|     Old|
+-------+---+----------+--------+

