In [3]:
pip install pyspark



In [6]:
from pyspark.sql import SparkSession

# Creatinga spark session
spark = SparkSession.builder \
     .appName('Data ingestion') \
     .getOrCreate()

###  **READING CSV FILE USING PYSPARK**

In [9]:
csv_file_path = "/content/sample_data/people.txt"

# Reading the csv file with pyspark
df_csv =  spark.read.format("csv").option("header","true").load(csv_file_path)
df_csv.show()


+----+----+-------+
|Name| Age| Gender|
+----+----+-------+
|John|  28|   Male|
|Jane|  32| Female|
+----+----+-------+



# **READING JSON FILE USING PYSPARK**

In [16]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define the schema for the JSON file
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True)
    ]), True)
])

# Loading the complex JSON file with schema
json_file_path = "/content/sample_data/JSON.txt"

# Read the json file with schema
df_json_complex = spark.read.schema(schema).json(json_file_path)

# Read the file as tesx to inspect its contents
with open(json_file_path,"r") as f:
  data = f.read()
  print(data)


[
  {
    "name": "John",
    "age": 28,
    "gender": "Male",
    "address": {
      "street": "123 Main St",
      "city": "New York"
    }
  },
  {
    "name": "Jane",
    "age": 32,
    "gender": "Female",
    "address": {
      "street": "456 Elm St",
      "city": "San Francisco"
    }
  }
]


# **VIEW**

In [18]:
import pandas as pd

# create a sample dataframe
data = {
    "name":["John","Jane","Mike","Emily"],
    "age":[28,32,45,23],
    "gender":["Mlae","Female","Male","Female"],
    "city":["New York","San Fransico","Los Angeles","Chicago"]
}

df = pd.DataFrame(data)

# save the dataframe to a csv file in a colab environment
csv_file_path = "/content/sample_data/sample_people_data.csv"
df.to_csv(csv_file_path,index=False)

# confirm the file is create
print(f"CSV file created at {csv_file_path}")


from pyspark.sql import SparkSession

# initialize sparksession
spark = SparkSession.builder.appName("Create View Example").getOrCreate()

# Load the csv file into a pyspark environment
df_people = spark.read.format("csv").option("header","true").load(csv_file_path)

# show the DataFrame
df_people.show()



CSV file created /content/sample_data/sample_people_data.csv
+-----+---+------+------------+
| name|age|gender|        city|
+-----+---+------+------------+
| John| 28|  Mlae|    New York|
| Jane| 32|Female|San Fransico|
| Mike| 45|  Male| Los Angeles|
|Emily| 23|Female|     Chicago|
+-----+---+------+------------+



# **TEMPORARY AND GLOBAL VIEW**

In [19]:
# CREATE a temporary view
df_people.createOrReplaceTempView("people_temp_view")

# Run a sql query on the VIEW
result_temp_view = spark.sql("SELECT name,age,gender,city FROM people_temp_view WHERE age > 30")

#show the temp view result
print("Temp view result:")
result_temp_view.show()

# Create a global view
df_people.createOrReplaceGlobalTempView("people_global_view")


# sql query for gloabl temp view
result_global_view = spark.sql("SELECT name,age FROM global_temp.people_global_view WHERE age > 25")

# show the global view result
print("Global view result:")
result_global_view.show()

# List all temporary views and tables
print("Listing all temporary views and tables:")
spark.catalog.listTables()

# Drop the local temporary view
print("Dropping temp view:")
spark.catalog.dropTempView("people_temp_view")

# Drop the global temporary view
print("Dropping gloabl temp view:")
spark.catalog.dropGlobalTempView("people_global_view")



Temp view result:
+----+---+------+------------+
|name|age|gender|        city|
+----+---+------+------------+
|Jane| 32|Female|San Fransico|
|Mike| 45|  Male| Los Angeles|
+----+---+------+------------+

Global view result:
+----+---+
|name|age|
+----+---+
|John| 28|
|Jane| 32|
|Mike| 45|
+----+---+

Listing all temporary views and tables:
Dropping temp view:
Dropping gloabl temp view:


True