In [None]:
# Initialise findspark before the pyspark imports in the following cells.
import findspark
findspark.init()

In [None]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [None]:
# Reuse the active SparkContext when one already exists, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [None]:
# Obtain the session through the public builder API. getOrCreate() attaches
# to the SparkContext that is already running and is safe to re-run.
spark = SparkSession.builder.getOrCreate()

In [None]:
# 1. Load DallasCouncilVoters.csv: the first row supplies the column names
#    and the column types are inferred from the data.
df = spark.read.csv(
    'Cung cap du lieu buoi 4/voters_data/DallasCouncilVoters.csv',
    inferSchema=True,
    header=True,
)

In [None]:
# 2. Count the rows in the loaded DataFrame.
df.count()

44625

In [None]:
# Inspect the column names and the types inferred by the CSV reader.
df.printSchema()

root
 |-- DATE: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER_NAME: string (nullable = true)



In [None]:
# Preview the first five rows.
df.show(5)

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|
|02/08/2017|Councilmember| Philip T. Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|
|02/08/2017|Councilmember|       Casey Thomas|
+----------+-------------+-------------------+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col

In [None]:
# 3. Check the data for NaN and null values.
#    First the NaN entries: one count per column, transposed so that each
#    DataFrame column becomes a row of the displayed table.
nan_counts = [count(when(isnan(name), name)).alias(name) for name in df.columns]
df.select(nan_counts).toPandas().T

Unnamed: 0,0
DATE,0
TITLE,0
VOTER_NAME,0


In [None]:
# => No NaN values

In [None]:
# Count the null entries in each column (one row per column in the output).
null_counts = [count(when(col(name).isNull(), name)).alias(name) for name in df.columns]
df.select(null_counts).toPandas().T

Unnamed: 0,0
DATE,0
TITLE,195
VOTER_NAME,503


In [None]:
# => Some values are null. Drop the rows whose VOTER_NAME is null.

In [None]:
# Keep only the rows that have a non-null VOTER_NAME.
df = df.na.drop(subset=['VOTER_NAME'])

In [None]:
# Re-count the nulls per column to confirm that the drop worked.
remaining_nulls = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(remaining_nulls).toPandas().T

Unnamed: 0,0
DATE,0
TITLE,0
VOTER_NAME,0


In [None]:
# => No null values remain

In [None]:
# 4. Measure duplication: total rows, distinct rows, and their difference.
num_rows, num_dist_rows = df.count(), df.distinct().count()
dup_rows = num_rows - num_dist_rows

In [None]:
# Show the total, distinct and duplicate row counts, in that order.
display(num_rows, num_dist_rows, dup_rows)

44122

1273

42849

In [None]:
# Spot-check the duplicates by looking at a single voter's rows.
df.where(col('VOTER_NAME') == 'Philip T. Kingston').show(5)

+----------+-------------+------------------+
|      DATE|        TITLE|        VOTER_NAME|
+----------+-------------+------------------+
|02/08/2017|Councilmember|Philip T. Kingston|
|02/08/2017|Councilmember|Philip T. Kingston|
|01/11/2017|Councilmember|Philip T. Kingston|
|09/14/2016|Councilmember|Philip T. Kingston|
|01/04/2017|Councilmember|Philip T. Kingston|
+----------+-------------+------------------+
only showing top 5 rows



In [None]:
# Remove rows that are exact duplicates across all columns.
df = df.dropDuplicates()

In [None]:
# Row count after removing the duplicates.
df.count()

1273

In [None]:
# 5. List ten of the distinct VOTER_NAME values.
unique_names = df.select('VOTER_NAME').distinct()
unique_names.show(10)

+--------------------+
|          VOTER_NAME|
+--------------------+
|      Tennell Atkins|
|  the  final   20...|
|        Scott Griggs|
|       Scott  Griggs|
|       Sandy Greyson|
| Michael S. Rawlings|
| the final 2018 A...|
|        Kevin Felder|
|        Adam Medrano|
|       Casey  Thomas|
+--------------------+
only showing top 10 rows



In [None]:
from pyspark.sql.functions import *

In [None]:
# 6. Keep the rows whose VOTER_NAME length is greater than 0 and less than 20.
#    NOTE(review): the upper bound is exclusive, so 20-character names are
#    dropped - confirm whether "1-20 characters" was meant to include 20.
name_length = length(col('VOTER_NAME'))
df = df.where((name_length > 0) & (name_length < 20))

In [None]:
# Preview the rows that passed the length filter.
df.show(5)

+----------+--------------------+------------------+
|      DATE|               TITLE|        VOTER_NAME|
+----------+--------------------+------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|
|02/14/2018|       Councilmember|   Lee M. Kleinman|
|04/25/2018|       Councilmember|    Tennell Atkins|
|08/29/2018|       Councilmember|      Kevin Felder|
|10/18/2017|       Councilmember|Jennifer S.  Gates|
+----------+--------------------+------------------+
only showing top 5 rows



In [None]:
# 7. Discard the rows whose VOTER_NAME contains an underscore.
has_underscore = col('VOTER_NAME').contains('_')
df = df.where(~has_underscore)

In [None]:
# List the distinct VOTER_NAME values again, this time without truncating
# long names in the output.
cleaned_names = df.select(df['VOTER_NAME']).distinct()
cleaned_names.show(10, truncate=False)

+-------------------+
|VOTER_NAME         |
+-------------------+
|Tennell Atkins     |
|Scott Griggs       |
|Scott  Griggs      |
|Sandy Greyson      |
|Michael S. Rawlings|
|Kevin Felder       |
|Adam Medrano       |
|Casey  Thomas      |
|Mark  Clayton      |
|Casey Thomas       |
+-------------------+
only showing top 10 rows



## Modifying DataFrame

In [None]:
# 8. Add a 'splits' column holding VOTER_NAME split on runs of whitespace.
#    The pattern is a raw string so that '\s' reaches the regex engine
#    unchanged instead of being parsed as an invalid Python escape sequence.
df = df.withColumn('splits', split(df.VOTER_NAME, r'\s+'))

In [None]:
# 9. first_name is the first element of the splits array.
df = df.withColumn('first_name', df.splits[0])

In [None]:
# 10. last_name is the final element of the splits array (index size - 1).
last_index = size(df.splits) - 1
df = df.withColumn('last_name', df.splits[last_index])

In [None]:
# The intermediate splits column is no longer needed; remove it.
df = df.drop(df.splits)

In [None]:
# Show the first rows of df, now including first_name and last_name.
df.show(3)

+----------+--------------------+---------------+----------+---------+
|      DATE|               TITLE|     VOTER_NAME|first_name|last_name|
+----------+--------------------+---------------+----------+---------+
|04/11/2018|Deputy Mayor Pro Tem|   Adam Medrano|      Adam|  Medrano|
|02/14/2018|       Councilmember|Lee M. Kleinman|       Lee| Kleinman|
|04/25/2018|       Councilmember| Tennell Atkins|   Tennell|   Atkins|
+----------+--------------------+---------------+----------+---------+
only showing top 3 rows



In [None]:
# 11. Add random_val: a random number for rows titled 'Councilmember'.
#     No otherwise() is given, so every other title gets null.
is_councilmember = col('TITLE') == 'Councilmember'
df = df.withColumn('random_val', when(is_councilmember, rand()))

In [None]:
# Show some rows to check that the when() clause worked: random_val should
# be populated only where TITLE is 'Councilmember'.
df.show(5)

+----------+--------------------+------------------+----------+---------+-------------------+
|      DATE|               TITLE|        VOTER_NAME|first_name|last_name|         random_val|
+----------+--------------------+------------------+----------+---------+-------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|      Adam|  Medrano|               null|
|02/14/2018|       Councilmember|   Lee M. Kleinman|       Lee| Kleinman| 0.7381034495972109|
|04/25/2018|       Councilmember|    Tennell Atkins|   Tennell|   Atkins|0.14629434190114454|
|08/29/2018|       Councilmember|      Kevin Felder|     Kevin|   Felder|0.34711766608883377|
|10/18/2017|       Councilmember|Jennifer S.  Gates|  Jennifer|    Gates|0.37619221876031905|
+----------+--------------------+------------------+----------+---------+-------------------+
only showing top 5 rows



In [None]:
# Rebuild random_val from the voter's title: a random number for
# 'Councilmember', 2 for 'Mayor', and 0 for every other title.
title = col('TITLE')
random_val = (
    when(title == 'Councilmember', rand())
    .when(title == 'Mayor', 2)
    .otherwise(0)
)
df = df.withColumn('random_val', random_val)

In [None]:
# Show some rows to inspect the rebuilt random_val column.
df.show(5)

+----------+--------------------+------------------+----------+---------+------------------+
|      DATE|               TITLE|        VOTER_NAME|first_name|last_name|        random_val|
+----------+--------------------+------------------+----------+---------+------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|      Adam|  Medrano|               0.0|
|02/14/2018|       Councilmember|   Lee M. Kleinman|       Lee| Kleinman|0.3681123969623975|
|04/25/2018|       Councilmember|    Tennell Atkins|   Tennell|   Atkins|0.4396450616742833|
|08/29/2018|       Councilmember|      Kevin Felder|     Kevin|   Felder|0.3039368955876056|
|10/18/2017|       Councilmember|Jennifer S.  Gates|  Jennifer|    Gates|0.5820830753584342|
+----------+--------------------+------------------+----------+---------+------------------+
only showing top 5 rows



In [None]:
# 12. Show the rows whose random_val equals 0.
df.where(col('random_val') == 0).show(5)

+----------+--------------------+-----------------+----------+---------+----------+
|      DATE|               TITLE|       VOTER_NAME|first_name|last_name|random_val|
+----------+--------------------+-----------------+----------+---------+----------+
|04/11/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|04/12/2017|       Mayor Pro Tem| Monica R. Alonzo|    Monica|   Alonzo|       0.0|
|06/28/2017|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|01/03/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|01/17/2018|       Mayor Pro Tem|Dwaine R. Caraway|    Dwaine|  Caraway|       0.0|
+----------+--------------------+-----------------+----------+---------+----------+
only showing top 5 rows



## UDF

In [None]:
from pyspark.sql.types import *

In [None]:
def getFirstAndMiddle(names):
    """Join every name part except the last one into a single string.

    Args:
        names: Sequence of name parts (for example a full name split on
            whitespace), or None.

    Returns:
        The parts before the last one joined by single spaces, an empty
        string when there are fewer than two parts, or None when ``names``
        is None.
    """
    # A SQL NULL reaches a Python UDF as None; propagate it instead of
    # raising TypeError on the slice below.
    if names is None:
        return None
    return ' '.join(names[:-1])

In [None]:
# 13. Wrap getFirstAndMiddle as a Spark UDF that returns a string.
udfFirstAndMiddle = udf(f=getFirstAndMiddle, returnType=StringType())

In [None]:
# 14. Create first_and_middle_name with the UDF.
#     The 'splits' column was dropped right after last_name was derived, so
#     df.splits no longer exists at this point on a fresh top-to-bottom run.
#     The name parts are therefore rebuilt from VOTER_NAME here.
name_parts = split(df.VOTER_NAME, r'\s+')
df = df.withColumn('first_and_middle_name', udfFirstAndMiddle(name_parts))

In [None]:
# 15. Drop the columns that are no longer needed; the result is shown in the
#     next cell. drop() is a no-op for names that are not in the schema.
df = df.drop('first_name', 'splits')

In [None]:
# Show the DataFrame with the new first_and_middle_name column.
df.show(5)

+----------+--------------------+------------------+---------+------------------+---------------------+
|      DATE|               TITLE|        VOTER_NAME|last_name|        random_val|first_and_middle_name|
+----------+--------------------+------------------+---------+------------------+---------------------+
|04/11/2018|Deputy Mayor Pro Tem|      Adam Medrano|  Medrano|               0.0|                 Adam|
|02/14/2018|       Councilmember|   Lee M. Kleinman| Kleinman|0.3681123969623975|               Lee M.|
|04/25/2018|       Councilmember|    Tennell Atkins|   Atkins|0.4396450616742833|              Tennell|
|08/29/2018|       Councilmember|      Kevin Felder|   Felder|0.3039368955876056|                Kevin|
|10/18/2017|       Councilmember|Jennifer S.  Gates|    Gates|0.5820830753584342|          Jennifer S.|
+----------+--------------------+------------------+---------+------------------+---------------------+
only showing top 5 rows



## Adding an ID Field 

In [None]:
# Reduce df to the unique council voter names.
df = df.select('VOTER_NAME').distinct()

# Report how many distinct names remain in df.
row_total = df.count()
print('\nThere are %d rows in the df DataFrame.\n' % row_total)


There are 27 rows in the df DataFrame.



In [None]:
# 16. Attach a ROW_ID column generated by monotonically_increasing_id().
row_id = monotonically_increasing_id()
df = df.withColumn('ROW_ID', row_id)

In [None]:
# 17. Show the ten rows with the largest ROW_ID values.
df.sort(col('ROW_ID').desc()).show(10)

+-------------------+-------------+
|         VOTER_NAME|       ROW_ID|
+-------------------+-------------+
|       Lee Kleinman|1709396983808|
|        Erik Wilson|1700807049216|
|Carolyn King Arnold|1632087572480|
|Rickey D.  Callahan|1597727834112|
|   Monica R. Alonzo|1382979469312|
|    Lee M. Kleinman|1228360646656|
|  Jennifer S. Gates|1194000908288|
|Philip T.  Kingston|1185410973696|
|  Dwaine R. Caraway|1142461300736|
| Rickey D. Callahan|1125281431553|
+-------------------+-------------+
only showing top 10 rows



## IDs with different partitions

In [None]:
# Extend part

In [None]:
# Report how many partitions back the df DataFrame.
partition_total = df.rdd.getNumPartitions()
print('\nThere are %d partitions in the df DataFrame.\n' % partition_total)


There are 200 partitions in the df DataFrame.



- Make sure to store the result of .rdd.max()[0] in the variable.
- monotonically_increasing_id() returns an integer. You can modify that value in-line.
- Make sure to show both DataFrames.

In [None]:
# Determine the highest ROW_ID already in use and save it in previous_max_ID.
previous_max_ID = df.select('ROW_ID').rdd.max()[0]

# Re-number the rows for voter_df_april so that every new ROW_ID is strictly
# greater than previous_max_ID. monotonically_increasing_id() can yield 0,
# so the offset is previous_max_ID + 1; adding previous_max_ID alone would
# let the first generated id collide with the existing maximum.
voter_df_april = df.withColumn(
    'ROW_ID',
    monotonically_increasing_id() + previous_max_ID + 1,
)

In [None]:
# Show the first ROW_ID values of both DataFrames (df, then voter_df_april)
# so the two id ranges can be compared.
df.select('ROW_ID').show(5)
voter_df_april.select('ROW_ID').show(5)

+------------+
|      ROW_ID|
+------------+
|  8589934592|
| 34359738368|
| 42949672960|
| 51539607552|
|103079215104|
+------------+
only showing top 5 rows

+-------------+
|       ROW_ID|
+-------------+
|1717986918400|
|1743756722176|
|1752346656768|
|1760936591360|
|1812476198912|
+-------------+
only showing top 5 rows

