In [3]:
# Run findspark's initialisation first; pyspark itself is imported in the next cell.
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

# Create (or reuse) a local session named "MyApp" that uses all local cores.
# The spark-avro package is requested via spark.jars.packages, presumably for
# the avro read further down in this notebook.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# Extraction from CSV

In [3]:
# NOTE: absolute, machine-specific location of the sample CSV file.
path = (
    '/Users/kiranchinta/JupyterProjects/sparkLearning/data/'
    'output.csv'
)

In [4]:
# Read the CSV with no options: the header line is kept as a data row and the
# columns get generated names (_c0, _c1, ...), as the output below shows.
spark_df = spark.read.format('csv').load(path)
spark_df.show()

+----+-------+------+---+
| _c0|    _c1|   _c2|_c3|
+----+-------+------+---+
|NULL|user_id|  name|age|
|   0|      1| kiran| 10|
|   1|      2| kumar| 20|
|   2|      3|chinta| 30|
+----+-------+------+---+



In [5]:
# header=True uses the first line as column names. The first header field is
# empty in this file, so that column is shown as _c0 (see the warning below).
spark_df = spark.read.format('csv').option('header', True).load(path)
spark_df.show()

+---+-------+------+---+
|_c0|user_id|  name|age|
+---+-------+------+---+
|  0|      1| kiran| 10|
|  1|      2| kumar| 20|
|  2|      3|chinta| 30|
+---+-------+------+---+



25/06/14 14:52:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , user_id, name, age
 Schema: _c0, user_id, name, age
Expected: _c0 but found: 
CSV file: file:///Users/kiranchinta/JupyterProjects/sparkLearning/data/output.csv


# Extraction from TEXT

In [6]:
# NOTE: absolute, machine-specific location of the sample text file.
path = (
    '/Users/kiranchinta/JupyterProjects/sparkLearning/data/'
    'output.txt'
)

In [7]:
# The text format yields a single column named "value" holding each line
# (see the output below); truncate=False prints the lines in full.
spark_df = spark.read.format('text').load(path)
spark_df.show(truncate=False)

+-----------------------+
|value                  |
+-----------------------+
|   user_id    name  age|
|0        1   kiran   10|
|1        2   kumar   20|
|2        3  chinta   30|
+-----------------------+



In [8]:
# header=True is passed here as well, but the output is unchanged from the
# previous cell: the header line is still returned as an ordinary row.
spark_df = spark.read.format('text').option('header', True).load(path)
spark_df.show(truncate=False)

+-----------------------+
|value                  |
+-----------------------+
|   user_id    name  age|
|0        1   kiran   10|
|1        2   kumar   20|
|2        3  chinta   30|
+-----------------------+



# Extraction from PARQUET

In [9]:
# NOTE: absolute, machine-specific location of the sample Parquet data.
path = (
    '/Users/kiranchinta/JupyterProjects/sparkLearning/data/'
    'output.parquet'
)

In [10]:
# Load the Parquet data; no header option is passed and the output below
# already shows the named columns user_id, name and age.
spark_df = spark.read.format('parquet').load(path)
spark_df.show(truncate=False)

+-------+------+---+
|user_id|name  |age|
+-------+------+---+
|1      |kiran |10 |
|2      |kumar |20 |
|3      |chinta|30 |
+-------+------+---+



# Extraction from AVRO

In [11]:
# NOTE: absolute, machine-specific location of the sample Avro data.
path = '/Users/kiranchinta/JupyterProjects/sparkLearning/data/output.avro'

# Load the Avro data through the generic reader using the 'avro' format name.
spark_df = spark.read.format('avro').load(path)
spark_df.show(truncate=False)

+-------+------+---+
|user_id|name  |age|
+-------+------+---+
|1      |kiran |10 |
|2      |kumar |20 |
|3      |chinta|30 |
+-------+------+---+



# Extraction from ORC

In [15]:
# NOTE: absolute, machine-specific location of the sample ORC data.
path = '/Users/kiranchinta/JupyterProjects/sparkLearning/data/output.orc'

# Load the ORC data through the generic reader using the 'orc' format name.
spark_df = spark.read.format('orc').load(path)
spark_df.show(truncate=False)

+-------+------+---+
|user_id|name  |age|
+-------+------+---+
|1      |kiran |10 |
|2      |kumar |20 |
|3      |chinta|30 |
+-------+------+---+



# Reading CSV data -> inferSchema

In [7]:
# NOTE: absolute, machine-specific location of the taxi fares CSV.
path = '/Users/kiranchinta/JupyterProjects/sparkLearning/data/taxi-fares.csv'

# Only the header option is set here; no schema is supplied or inferred.
spark_df = spark.read.format('csv').option('header', True).load(path)
spark_df.show()

+--------------------+-----------+--------------------+-------------------+------------------+-------------------+------------------+---------------+
|                 key|fare_amount|     pickup_datetime|   pickup_longitude|   pickup_latitude|  dropoff_longitude|  dropoff_latitude|passenger_count|
+--------------------+-----------+--------------------+-------------------+------------------+-------------------+------------------+---------------+
|2014-06-15 17:11:...|          7|2014-06-15 17:11:...|          -73.99542|         40.759662|         -73.987607|         40.751247|              1|
|2011-03-14 22:43:...|        4.9|2011-03-14 22:43:...|         -73.993552|          40.73111|         -73.998497|           40.7372|              5|
|2011-02-14 15:14:...|        6.1|2011-02-14 15:14:...|          -73.97238|         40.749527|         -73.990638|         40.745328|              1|
|2009-10-29 11:29:...|        6.9|2009-10-29 11:29:...|         -73.973703|         40.763542|      

In [8]:
# Show the schema of spark_df: with only header=True every column is reported
# as string (see the output below).
spark_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- passenger_count: string (nullable = true)



In [9]:
# Re-read the same file, this time asking Spark to infer column types.
# NOTE(review): in the output pickup_datetime is displayed shifted relative to
# key (e.g. 17:11 vs 12:11) — presumably a time-zone conversion of the raw
# value into the session time zone; confirm against the raw file.
spark_df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load(path)
)
spark_df.show()

+-------------------+-----------+-------------------+------------------+-----------------+------------------+------------------+---------------+
|                key|fare_amount|    pickup_datetime|  pickup_longitude|  pickup_latitude| dropoff_longitude|  dropoff_latitude|passenger_count|
+-------------------+-----------+-------------------+------------------+-----------------+------------------+------------------+---------------+
|2014-06-15 17:11:00|        7.0|2014-06-15 12:11:00|         -73.99542|        40.759662|        -73.987607|         40.751247|              1|
|2011-03-14 22:43:00|        4.9|2011-03-14 17:43:00|        -73.993552|         40.73111|        -73.998497|           40.7372|              5|
|2011-02-14 15:14:00|        6.1|2011-02-14 09:14:00|         -73.97238|        40.749527|        -73.990638|         40.745328|              1|
|2009-10-29 11:29:00|        6.9|2009-10-29 06:29:00|        -73.973703|        40.763542|        -73.984253|         40.758603|  

In [10]:
# Show the schema after inferSchema=True: the columns are now reported as
# timestamp, double and integer instead of string (see the output below).
spark_df.printSchema()

root
 |-- key: timestamp (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)



# Explicit schema

In [11]:
# Schema for the taxi fares CSV written as a DDL string, one column per line.
# NOTE: the FLOAT columns display fewer digits after loading than the doubles
# produced by inferSchema (compare the coordinate values in the two outputs).
taxiSchema = (
    "key TIMESTAMP, "
    "fare_amount FLOAT, "
    "pickup_datetime TIMESTAMP, "
    "pickup_longitude FLOAT, "
    "pickup_latitude FLOAT, "
    "dropoff_longitude FLOAT,"
    "dropoff_latitude FLOAT, "
    "passenger_count INT "
)

In [15]:
# Load the CSV with the explicit taxiSchema instead of inferring the types.
spark_df = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(taxiSchema)
    .load(path)
)
spark_df.show()

+-------------------+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|                key|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|
+-------------------+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|2014-06-15 17:11:00|        7.0|2014-06-15 12:11:00|       -73.99542|      40.759663|        -73.98761|       40.751247|              1|
|2011-03-14 22:43:00|        4.9|2011-03-14 17:43:00|       -73.99355|       40.73111|         -73.9985|         40.7372|              5|
|2011-02-14 15:14:00|        6.1|2011-02-14 09:14:00|       -73.97238|      40.749527|        -73.99064|       40.745327|              1|
|2009-10-29 11:29:00|        6.9|2009-10-29 06:29:00|        -73.9737|      40.763542|        -73.98425|       40.758602|              5|
|2011-07-02 10:38:00|       10.5|2

In [16]:
# Show the schema obtained from the DDL string: the declared timestamp, float
# and integer types are reported (see the output below).
spark_df.printSchema()

root
 |-- key: timestamp (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- passenger_count: integer (nullable = true)



In [19]:
# using spark datatypes
# Import only the type classes this schema uses rather than a wildcard import,
# so it is clear where every name comes from.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StructField,
    StructType,
    TimestampType,
)

# Same columns and types as the DDL-string version of taxiSchema, built from
# Spark type objects. The fields are passed as a list, which is the form
# StructType documents (a tuple would break a later StructType.add call).
taxiSchema = StructType([
    StructField('key', TimestampType()),
    StructField('fare_amount', FloatType()),
    StructField('pickup_datetime', TimestampType()),
    StructField('pickup_longitude', FloatType()),
    StructField('pickup_latitude', FloatType()),
    StructField('dropoff_longitude', FloatType()),
    StructField('dropoff_latitude', FloatType()),
    StructField('passenger_count', IntegerType()),
])

In [20]:
# Load the CSV again, this time with the StructType-based taxiSchema.
spark_df = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(taxiSchema)
    .load(path)
)
spark_df.show()

+-------------------+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|                key|fare_amount|    pickup_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|
+-------------------+-----------+-------------------+----------------+---------------+-----------------+----------------+---------------+
|2014-06-15 17:11:00|        7.0|2014-06-15 12:11:00|       -73.99542|      40.759663|        -73.98761|       40.751247|              1|
|2011-03-14 22:43:00|        4.9|2011-03-14 17:43:00|       -73.99355|       40.73111|         -73.9985|         40.7372|              5|
|2011-02-14 15:14:00|        6.1|2011-02-14 09:14:00|       -73.97238|      40.749527|        -73.99064|       40.745327|              1|
|2009-10-29 11:29:00|        6.9|2009-10-29 06:29:00|        -73.9737|      40.763542|        -73.98425|       40.758602|              5|
|2011-07-02 10:38:00|       10.5|2

In [21]:
# Show the schema obtained from the StructType definition; the output below
# lists the same column types as the DDL-string version.
spark_df.printSchema()

root
 |-- key: timestamp (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- passenger_count: integer (nullable = true)



In [22]:
# Stop the SparkSession that was created at the top of the notebook.
spark.stop()