# **Create DataFrame from CSV File in PySpark 3.0 on Google Colab | Part 3**

In [None]:
# Quick look at the runtime before installing anything:
# print the working directory, list what is in it, and report the Python version.
!pwd
!ls
!python --version

/content
sample_data
Python 3.6.9


In [None]:
#!wget https://mirrors.estointernet.in/apache/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
!wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
!tar -xvzf spark-3.0.0-bin-hadoop2.7.tgz
!pip install findspark

--2020-09-11 02:50:05--  https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
Resolving archive.apache.org (archive.apache.org)... 138.201.131.134, 2a01:4f8:172:2ec5::2
Connecting to archive.apache.org (archive.apache.org)|138.201.131.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220272364 (210M) [application/x-gzip]
Saving to: ‘spark-3.0.0-bin-hadoop2.7.tgz’


2020-09-11 02:50:14 (24.8 MB/s) - ‘spark-3.0.0-bin-hadoop2.7.tgz’ saved [220272364/220272364]

spark-3.0.0-bin-hadoop2.7/
spark-3.0.0-bin-hadoop2.7/NOTICE
spark-3.0.0-bin-hadoop2.7/kubernetes/
spark-3.0.0-bin-hadoop2.7/kubernetes/tests/
spark-3.0.0-bin-hadoop2.7/kubernetes/tests/worker_memory_check.py
spark-3.0.0-bin-hadoop2.7/kubernetes/tests/py_container_checks.py
spark-3.0.0-bin-hadoop2.7/kubernetes/tests/pyfiles.py
spark-3.0.0-bin-hadoop2.7/kubernetes/dockerfiles/
spark-3.0.0-bin-hadoop2.7/kubernetes/dockerfiles/spark/
spark-3.0.0-bin-hadoop2.7/kubernetes/dockerfiles

In [None]:
import os

import findspark

# Point SPARK_HOME at the directory the tarball was unpacked into, then let
# findspark use it to set up the pyspark import path.
os.environ.update({"SPARK_HOME": "/content/spark-3.0.0-bin-hadoop2.7"})
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Build the session (or reuse one that already exists) under a descriptive
# application name, then print that name back as a quick confirmation.
spark = (
    SparkSession.builder
    .appName("Create DataFrame from CSV File in PySpark 3.0")
    .getOrCreate()
)
print(spark.sparkContext.appName)

Create DataFrame from CSV File in PySpark 3.0


In [None]:
# Baseline: read the comma-delimited file with every reader option left at its default.
# As the output below shows, the header line is treated as a data row, the columns get
# generated names (_c0, _c1, _c2), and every column is typed as string.
csv_file_path = "/content/data/csv/user_detail_comma_delimiter.csv"

df = spark.read.csv(path=csv_file_path)
df.show()
df.printSchema()

+-------+---------+-----------+
|    _c0|      _c1|        _c2|
+-------+---------+-----------+
|user_id|user_name|  user_city|
|      1|     John|     London|
|      2|   Martin|   New York|
|      3|      Sam|     Sydney|
|      4|     Alan|Mexico City|
|      5|    Jacob|    Florida|
+-------+---------+-----------+

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [None]:
# Same file, now telling the reader that the first line holds the column names.
# Column names come from the header; the types are still all string.
csv_file_path = "/content/data/csv/user_detail_comma_delimiter.csv"

df = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .csv(csv_file_path)
)
df.show()
df.printSchema()

+-------+---------+-----------+
|user_id|user_name|  user_city|
+-------+---------+-----------+
|      1|     John|     London|
|      2|   Martin|   New York|
|      3|      Sam|     Sydney|
|      4|     Alan|Mexico City|
|      5|    Jacob|    Florida|
+-------+---------+-----------+

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_city: string (nullable = true)



In [None]:
# Header plus schema inference: Spark inspects the data to pick column types,
# so user_id comes back as integer instead of string.
csv_file_path = "/content/data/csv/user_detail_comma_delimiter.csv"

df = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)
df.show()
df.printSchema()

+-------+---------+-----------+
|user_id|user_name|  user_city|
+-------+---------+-----------+
|      1|     John|     London|
|      2|   Martin|   New York|
|      3|      Sam|     Sydney|
|      4|     Alan|Mexico City|
|      5|    Jacob|    Florida|
+-------+---------+-----------+

root
 |-- user_id: integer (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_city: string (nullable = true)



In [None]:
# Pipe-delimited variant of the same data: only the separator option changes.
csv_file_path = "/content/data/csv/user_detail_pipe_delimiter.csv"

df = (
    spark.read
    .option("sep", "|")
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)
df.show()
df.printSchema()

+-------+---------+-----------+
|user_id|user_name|  user_city|
+-------+---------+-----------+
|      1|     John|     London|
|      2|   Martin|   New York|
|      3|      Sam|     Sydney|
|      4|     Alan|Mexico City|
|      5|    Jacob|    Florida|
+-------+---------+-----------+

root
 |-- user_id: integer (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_city: string (nullable = true)



In [None]:
# Import only the type classes this cell uses. A wildcard import would pull every
# public name from pyspark.sql.types into the notebook namespace and hide where
# StructType and friends come from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

csv_file_path = "/content/data/csv/user_detail_pipe_delimiter.csv"

# Explicit schema instead of inferSchema: each StructField gives the column name,
# its Spark type, and whether the column is nullable (True here for all three).
user_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("user_name", StringType(), True),
    StructField("user_city", StringType(), True),
])

# header=True is still passed so the header line is not read as a data row.
df = spark.read.csv(path=csv_file_path, sep="|", header=True, schema=user_schema)

df.show()

df.printSchema()

+-------+---------+-----------+
|user_id|user_name|  user_city|
+-------+---------+-----------+
|      1|     John|     London|
|      2|   Martin|   New York|
|      3|      Sam|     Sydney|
|      4|     Alan|Mexico City|
|      5|    Jacob|    Florida|
+-------+---------+-----------+

root
 |-- user_id: integer (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_city: string (nullable = true)



In [None]:
# Stop the Spark session created earlier in the notebook now that the examples are done.
spark.stop()