In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession

In [3]:
sc = SparkContext()
spark = SparkSession(sc)

In [4]:
file_name = 'hdfs://bigdata.laptrinhpython.net:19000/people.csv'
people = spark.read.csv(file_name, inferSchema=True, header=True)
people.show(5)

+---+---------+--------------+------+-------------+
|_c0|person_id|          name|   sex|date of birth|
+---+---------+--------------+------+-------------+
|  0|      100|Penelope Lewis|female|   1990-08-31|
|  1|      101| David Anthony|  male|   1971-10-14|
|  2|      102|     Ida Shipp|female|   1962-05-24|
|  3|      103|  Joanna Moore|female|   2017-03-10|
|  4|      104|Lisandra Ortiz|female|   2020-08-05|
+---+---------+--------------+------+-------------+
only showing top 5 rows



In [5]:
people_view = people.createOrReplaceTempView("people")

In [6]:
print(people_view)

None


### Truy vấn có điều kiện với where: dùng spark.sql(query)

In [7]:
# Construct a query to select names of the people fron the temporary table "people"
query = 'SELECT name FROM people'
# Assigin the result of Spark's query to people_df_names
people_df_names = spark.sql(query)
# Print the top 10 names of the people
people_df_names.show(10)

+----------------+
|            name|
+----------------+
|  Penelope Lewis|
|   David Anthony|
|       Ida Shipp|
|    Joanna Moore|
|  Lisandra Ortiz|
|   David Simmons|
|   Edward Hudson|
|    Albert Jones|
|Leonard Cavender|
|  Everett Vadala|
+----------------+
only showing top 10 rows



In [8]:
# Filter the people table to select female sex
people_female_df = spark.sql('SELECT * FROM people WHERE sex=="female"')
# Filter the people table DataFrame to select male sex
people_male_df = spark.sql('SELECT * FROM people WHERE sex=="male"')
# Count the number of the rows in both DataFrames
print("There are", people_female_df.count(), "rows in the people_female_df and", people_male_df.count(), "rows in the people_male_df")

There are 49014 rows in the people_female_df and 49066 rows in the people_male_df


### Truy vấn vói LIKE

In [9]:
people_Vadala = spark.sql('SELECT * FROM people WHERE name LIKE "%Vadala"')
people_Vadala.show()

+-----+---------+--------------+------+-------------+
|  _c0|person_id|          name|   sex|date of birth|
+-----+---------+--------------+------+-------------+
|    9|      109|Everett Vadala|  male|   2005-05-24|
|77207|    77307| Marlyn Vadala|female|   1992-07-02|
+-----+---------+--------------+------+-------------+



In [10]:
people_Albert = spark.sql('SELECT * FROM people WHERE name LIKE "Albert%"')
people_Albert.show()

+-----+---------+------------------+------+-------------+
|  _c0|person_id|              name|   sex|date of birth|
+-----+---------+------------------+------+-------------+
|    7|      107|      Albert Jones|  male|   1990-09-13|
|  380|      480|   Albert Guillory|  male|   2001-02-10|
| 1577|     1677|     Albert Miller|  male|   1976-09-16|
| 1914|     2014|      Albert Goetz|  male|   1978-03-06|
| 2011|     2111|     Albert Clever|  male|   1985-12-22|
| 3193|     3293|    Albert Griffin|  male|   1988-11-03|
| 3419|     3519|   Albert Amundsen|  male|   2031-09-20|
| 3760|     3860|  Albert Pritchard|  male|   1973-01-02|
| 4714|     4814|     Albert Omeara|  male|   1989-06-07|
| 5777|     5877| Albert Villarreal|  male|   1946-04-11|
| 7081|     7181|     Albert Tshudy|  male|   1930-08-26|
| 7410|     7510|Alberta Abramowski|female|   1989-01-25|
| 7705|     7805|    Albert Painter|  male|   1961-12-16|
| 8894|     8994|   Alberto Gariepy|  male|   2001-11-03|
| 9407|     95

### Truy vấn với SUBSTRING(column_name, from, length)

In [11]:
name_sub = spark.sql('SELECT SUBSTRING(name, 8 , 10) FROM people WHERE name LIKE "Albert%"')
name_sub.show(10)

+----------------------+
|substring(name, 8, 10)|
+----------------------+
|                 Jones|
|              Guillory|
|                Miller|
|                 Goetz|
|                Clever|
|               Griffin|
|              Amundsen|
|             Pritchard|
|                Omeara|
|            Villarreal|
+----------------------+
only showing top 10 rows



### Startswith / Endswith

In [12]:
people_Albert_1 = people.where(people.name.startswith("Albert"))
people_Albert_1.count()

234