In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the notebook's SparkSession.
# The "local[2]" master runs Spark in-process with two worker threads.
spark = (
    SparkSession.builder
    .appName("Read Data From File")
    .master("local[2]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/28 15:42:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
! curl -o /opt/examples/datasets/Mall_Customers.csv \
https://raw.githubusercontent.com/erkansirin78/datasets/master/Mall_Customers.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4365  100  4365    0     0   9273      0 --:--:-- --:--:-- --:--:--  9287


In [4]:
# Confirm the downloaded file is present in the datasets directory.
! ls -l /opt/examples/datasets | grep Mall

-rw-r--r--. 1 root root 4365 Sep 28 15:43 Mall_Customers.csv


In [5]:
# Read the CSV with the reader's default options (no header or schema options set).
# The file:// scheme points Spark at the local filesystem path.
df = spark.read.csv("file:///opt/examples/datasets/Mall_Customers.csv")

                                                                                

In [6]:
# Peek at the first five rows; with no header option the column names row
# shows up as ordinary data under the generated _c0.._c4 names.
df.show(5)

+----------+------+---+------------+-------------+
|       _c0|   _c1|_c2|         _c3|          _c4|
+----------+------+---+------------+-------------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|
|         1|  Male| 19|       15000|           39|
|         2|  Male| 21|       15000|           81|
|         3|Female| 20|       16000|            6|
|         4|Female| 23|       16000|           77|
+----------+------+---+------------+-------------+
only showing top 5 rows



In [7]:
# Row count of the default read; the header line was loaded as a data row,
# so it is included in this number.
df.count()

201

In [8]:
# Cap the DataFrame at five rows before converting to pandas, so only that
# small sample is collected to the driver.
df.limit(5).toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4
0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
1,1,Male,19,15000,39
2,2,Male,21,15000,81
3,3,Female,20,16000,6
4,4,Female,23,16000,77


In [9]:
# The first row holds the column names, so tell Spark to read it as a header

In [11]:
# Re-read the file, this time taking the column names from its first line.
df = (
    spark.read
    .option("header", True)
    .csv("file:///opt/examples/datasets/Mall_Customers.csv")
)

In [12]:
# Preview five rows as a pandas DataFrame; limit() keeps the transfer to the
# driver small.
df.limit(5).toPandas()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
0,1,Male,19,15000,39
1,2,Male,21,15000,81
2,3,Female,20,16000,6
3,4,Female,23,16000,77
4,5,Female,31,17000,40


In [13]:
# Inspect the column types Spark assigned when only the header option is set.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- AnnualIncome: string (nullable = true)
 |-- SpendingScore: string (nullable = true)



In [14]:
# All data types are string, which is not what we want.
# We can ask Spark to infer the data types.

In [15]:
# Read again with a header row and let Spark infer each column's type
# instead of leaving everything as string.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///opt/examples/datasets/Mall_Customers.csv")
)

In [16]:
# Check the schema again now that inferSchema is enabled.
df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- AnnualIncome: integer (nullable = true)
 |-- SpendingScore: integer (nullable = true)



In [17]:
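# What if the separator is not a comma (,)? Set it explicitly with the "sep" option
# (this file is comma-separated, so "," is passed below).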

In [18]:
# Same read as before, with the field separator spelled out via the "sep"
# option; swap in another character for files that are not comma-separated.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .csv("file:///opt/examples/datasets/Mall_Customers.csv")
)

In [19]:
# Preview five rows of the final read; limit() is applied before toPandas()
# so only the sample is brought to the driver.
df.limit(5).toPandas()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
0,1,Male,19,15000,39
1,2,Male,21,15000,81
2,3,Female,20,16000,6
3,4,Female,23,16000,77
4,5,Female,31,17000,40


<h1 style="color:red;"> Warning!!!! You have to use limit() before using toPandas(),</h1>
<h1 style="color:red;"> otherwise all of the data would rush to the driver.</h1>

In [20]:
# Shut down the SparkSession now that the notebook is finished with it.
spark.stop()