In [1]:
# Evaluate the SparkContext handle so the notebook displays it.
# NOTE(review): `sc` is not defined in this notebook; presumably the PySpark
# kernel injects it (along with `spark`, used below) — confirm.
sc

### Create DataFrame

In [2]:
# Column names for the DataFrame, in the same order as the fields of each record.
header = [
    "city",
    "type",
    "price",
]

In [3]:
# Parse each "city,type,price" record into a (city, type, price) tuple whose
# price is a float.
# Built as a list rather than a lazy map() iterator: a map object is exhausted
# after its first traversal, so `data` would be empty if it were inspected or
# reused after the DataFrame has been created from it.
data = [
    (city, kind, float(price))
    for city, kind, price in (
        record.split(",")  # split one CSV-style record into its three fields
        for record in [
            "Paris,Food,19.00", "Marseille,Clothing,12.00",
            "Paris,Food,8.00", "Paris,Clothing,15.00",
            "Marseille,Food,20.00", "Lyon,Book,10.00",
        ]
    )
]
# Build the DataFrame from the parsed rows; passing only column names as the
# schema leaves the column types to be inferred from the data.
df = spark.createDataFrame(data, schema=header)

In [4]:
# Render the DataFrame's rows as a text table.
df.show()

+---------+--------+-----+
|     city|    type|price|
+---------+--------+-----+
|    Paris|    Food| 19.0|
|Marseille|Clothing| 12.0|
|    Paris|    Food|  8.0|
|    Paris|Clothing| 15.0|
|Marseille|    Food| 20.0|
|     Lyon|    Book| 10.0|
+---------+--------+-----+



#### Take, Schema, and Dtypes

In [5]:
# Fetch the first two rows as a list of Row objects.
df.take(2)

[Row(city='Paris', type='Food', price=19.0),
 Row(city='Marseille', type='Clothing', price=12.0)]

In [6]:
# Print the column names, types and nullability as a tree.
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- type: string (nullable = true)
 |-- price: double (nullable = true)



In [7]:
# List every column as a (name, type-name) pair, one pair per line.
for column_name, type_name in df.dtypes:
    print((column_name, type_name))

('city', 'string')
('type', 'string')
('price', 'double')


#### Selecting Columns

In [8]:
from pyspark.sql.types import StringType, FloatType, StructType, StructField
# Parse each "city,type,price" record into a (city, type, price) tuple whose
# price is a float.
# Built as a list rather than a lazy map() iterator: a map object is exhausted
# after its first traversal, so `data` would be empty if it were inspected or
# reused after a DataFrame has been created from it.
data = [
    (city, kind, float(price))
    for city, kind, price in (
        record.split(",")  # split one CSV-style record into its three fields
        for record in [
            "Paris,Food,19.00", "Marseille,Clothing,12.00",
            "Paris,Food,8.00", "Paris,Clothing,15.00",
            "Marseille,Food,20.00", "Lyon,Book,10.00",
        ]
    )
]

# Explicit schema: two string columns and a FloatType price, all nullable.
column_types = [
    ("city", StringType()),
    ("type", StringType()),
    ("price", FloatType()),
]
schema = StructType(
    [StructField(name, dtype, nullable=True) for name, dtype in column_types]
)

# Rebuild the DataFrame against the explicit schema instead of inferred types,
# then render it.
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+---------+--------+-----+
|     city|    type|price|
+---------+--------+-----+
|    Paris|    Food| 19.0|
|Marseille|Clothing| 12.0|
|    Paris|    Food|  8.0|
|    Paris|Clothing| 15.0|
|Marseille|    Food| 20.0|
|     Lyon|    Book| 10.0|
+---------+--------+-----+



In [9]:
# Project a single column by name and display it.
(
    df
    .select("city")
    .show()
)

+---------+
|     city|
+---------+
|    Paris|
|Marseille|
|    Paris|
|    Paris|
|Marseille|
|     Lyon|
+---------+



In [11]:
# Project two columns; names are passed as separate arguments, matching the
# select("city", "type") call used in the filter examples.
df.select("city", "type").show()

+---------+--------+
|     city|    type|
+---------+--------+
|    Paris|    Food|
|Marseille|Clothing|
|    Paris|    Food|
|    Paris|Clothing|
|Marseille|    Food|
|     Lyon|    Book|
+---------+--------+



#### Filter Commands

In [12]:
# Keep only the rows whose city is Paris.
(
    df
    .filter(df["city"] == "Paris")
    .show()
)

+-----+--------+-----+
| city|    type|price|
+-----+--------+-----+
|Paris|    Food| 19.0|
|Paris|    Food|  8.0|
|Paris|Clothing| 15.0|
+-----+--------+-----+



In [13]:
# Filter to Paris first, then project the city and type columns.
(
    df
    .filter(df.city == "Paris")
    .select("city", "type")
    .show()
)

+-----+--------+
| city|    type|
+-----+--------+
|Paris|    Food|
|Paris|    Food|
|Paris|Clothing|
+-----+--------+



In [14]:
# Rows that are both in Paris and of type Food; the two conditions are joined
# into one predicate with `&` (each comparison needs its own parentheses).
df.filter((df.city == 'Paris') & (df.type == 'Food')).show()

+-----+----+-----+
| city|type|price|
+-----+----+-----+
|Paris|Food| 19.0|
|Paris|Food|  8.0|
+-----+----+-----+



In [17]:
# Paris food items priced below 18, expressed as a single combined predicate.
df.filter(
    (df.city == 'Paris')
    & (df.type == 'Food')
    & (df.price < 18)
).show()

+-----+----+-----+
| city|type|price|
+-----+----+-----+
|Paris|Food|  8.0|
+-----+----+-----+



In [26]:
# Paris rows priced above 18, showing only the city and price columns.
(
    df
    .filter(df.city == "Paris")
    .filter(df.price > 18)
    .select("city", "price")
    .show()
)

+-----+-----+
| city|price|
+-----+-----+
|Paris| 19.0|
+-----+-----+



In [31]:
# Paris food items ordered from cheapest to most expensive.
(
    df
    .filter((df.city == "Paris") & (df.type == "Food"))
    .sort(df.price.asc())
    .show()
)

+-----+----+-----+
| city|type|price|
+-----+----+-----+
|Paris|Food|  8.0|
|Paris|Food| 19.0|
+-----+----+-----+



#### Manipulating Columns 