In [1]:
import os
import sys

# Make Spark's worker and driver processes use the same interpreter that is
# running this notebook (e.g. the active conda environment's Python).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,         # Worker Python
    PYSPARK_DRIVER_PYTHON=sys.executable,  # Driver Python
)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit,count, sum, avg

In [3]:
# Create a Spark session

In [4]:
# Build the session for this notebook under the app name "Basic_DataFrame"
# (getOrCreate hands back an existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Basic_DataFrame")
    .getOrCreate()
)

In [5]:
# Create a DataFrame from a list of tuples (the data) and a list of column names (the schema)

In [6]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["name", "age", "gender", "salary"]

# One tuple per person: (name, age, gender, salary).
data = [
    ("John Smith", 32, "Male", 75000.50),
    ("Emily Davis", 28, "Female", 82000.75),
    ("Michael Johnson", 45, "Male", 95000.00),
    ("Sarah Wilson", 36, "Female", 68000.25),
]

In [7]:
# Build the DataFrame `df`, taking the rows from the list of tuples (`data`) and the column names from `columns`

In [8]:
# Rows come from `data`; column names come from `columns`.
df = spark.createDataFrame(data=data, schema=columns)

In [9]:
# show the dataframe

In [10]:
# Print the DataFrame as a text table, spelling out show()'s default arguments.
df.show(n=20, truncate=True)

+---------------+---+------+--------+
|           name|age|gender|  salary|
+---------------+---+------+--------+
|     John Smith| 32|  Male| 75000.5|
|    Emily Davis| 28|Female|82000.75|
|Michael Johnson| 45|  Male| 95000.0|
|   Sarah Wilson| 36|Female|68000.25|
+---------------+---+------+--------+



In [None]:
# Show specific columns by selecting them

In [12]:
# Keep only the name and age columns, then print the result.
(
    df
    .select("name", "age")
    .show()
)

+---------------+---+
|           name|age|
+---------------+---+
|     John Smith| 32|
|    Emily Davis| 28|
|Michael Johnson| 45|
|   Sarah Wilson| 36|
+---------------+---+



In [None]:
# Filter rows based on a given condition

In [13]:
# Keep only the rows whose age is strictly greater than 35, then print them.
df.filter(col("age") > 35).show()

+---------------+---+------+--------+
|           name|age|gender|  salary|
+---------------+---+------+--------+
|Michael Johnson| 45|  Male| 95000.0|
|   Sarah Wilson| 36|Female|68000.25|
+---------------+---+------+--------+



In [15]:
# Same age > 35 filter as above, but print only the name and age columns.
(
    df
    .filter(col("age") > 35)
    .select("name", "age")
    .show()
)

+---------------+---+
|           name|age|
+---------------+---+
|Michael Johnson| 45|
|   Sarah Wilson| 36|
+---------------+---+



In [17]:
# Per-gender summary: row count, total salary and average age.
# Note: `sum` here is pyspark.sql.functions.sum from the import cell, which
# shadows Python's builtin sum for the rest of the notebook.
(
    df
    .groupBy("gender")
    .agg(
        count("*").alias("count"),
        sum("salary").alias("total salary"),
        avg("age").alias("average age"),
    )
    .show()
)

+------+-----+------------+-----------+
|gender|count|total salary|average age|
+------+-----+------------+-----------+
|  Male|    2|    170000.5|       38.5|
|Female|    2|    150001.0|       32.0|
+------+-----+------------+-----------+



In [18]:
# Order the rows from youngest to oldest and print them.
(
    df
    .sort("age", ascending=True)
    .show()
)

+---------------+---+------+--------+
|           name|age|gender|  salary|
+---------------+---+------+--------+
|    Emily Davis| 28|Female|82000.75|
|     John Smith| 32|  Male| 75000.5|
|   Sarah Wilson| 36|Female|68000.25|
|Michael Johnson| 45|  Male| 95000.0|
+---------------+---+------+--------+



In [19]:
# Add a constant `bonus` column of 1000 and leave `salary` out of the printed result.
# The result is not assigned to anything, so `df` itself keeps its original columns.
(
    df
    .withColumn("bonus", lit(1000))
    .drop("salary")
    .show()
)

+---------------+---+------+-----+
|           name|age|gender|bonus|
+---------------+---+------+-----+
|     John Smith| 32|  Male| 1000|
|    Emily Davis| 28|Female| 1000|
|Michael Johnson| 45|  Male| 1000|
|   Sarah Wilson| 36|Female| 1000|
+---------------+---+------+-----+



In [20]:
# Print `df` again: it still has its original columns, because the result of
# the withColumn/drop chain in the previous cell was never assigned back to `df`.
df.show(n=20, truncate=True)

+---------------+---+------+--------+
|           name|age|gender|  salary|
+---------------+---+------+--------+
|     John Smith| 32|  Male| 75000.5|
|    Emily Davis| 28|Female|82000.75|
|Michael Johnson| 45|  Male| 95000.0|
|   Sarah Wilson| 36|Female|68000.25|
+---------------+---+------+--------+



In [21]:
# Shut down the Spark session created near the top of the notebook.
spark.stop()