# This notebook is going to teach you how to perform basic pyspark operations

In [49]:
# Initialise Input And Output Dataset Path Variables
# The dataset roots default to the original local folders, but each can be
# overridden with an environment variable so the notebook runs on another
# machine without editing this cell.
# NOTE: the "datset" spelling in the output variable names is kept on purpose,
# because later cells reference these exact names.
import os

input_dataset_path = os.environ.get(
    "BIGDATA_ORCL_INPUT_PATH",
    "C:/Users/Jasjyot Singh Jaswal/Documents/JupyterWB/BigDataOrclProject/InputDataset",
)
output_datset_path = os.environ.get(
    "BIGDATA_ORCL_OUTPUT_PATH",
    "C:/Users/Jasjyot Singh Jaswal/Documents/JupyterWB/BigDataOrclProject/OutputDataset",
)

# One sub-folder per output format, all located under the output root.
output_datset_path_csv = f"{output_datset_path}/CSV"
output_datset_path_pipe = f"{output_datset_path}/PIPE"
output_datset_path_tab = f"{output_datset_path}/TAB"
output_datset_path_json = f"{output_datset_path}/JSON"

## Import Dependencies

In [44]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql.functions import col, explode,coalesce
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType
from pyspark.sql.types import StructType
from pyspark.sql.functions import udf
from pyspark.sql import functions as f
spark = SparkSession.builder.getOrCreate()
import datetime
from datetime import timedelta 
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import split
from pyspark.sql.types import StringType
from pyspark.sql import functions
from pyspark.sql.window import Window
from pyspark.sql.functions import *

## Load Input Dataset for Web Visitors and Display

In [3]:
# Read the visitor CSV from the input folder; the first line of the file
# supplies the column names.
visitor_csv_path = f"{input_dataset_path}/webiste_visitor_data.csv"
web_visits = spark.read.csv(visitor_csv_path, header=True)
web_visits.show(truncate=False)

+----------+----------+----------+----------------------------------+------+---------------+
|visitor_id|first_name|last_name |email                             |gender|ip_address     |
+----------+----------+----------+----------------------------------+------+---------------+
|1         |Marysa    |Arnull    |marnull0@t.co                     |Female|148.118.170.117|
|2         |Alys      |Mapledoram|amapledoram1@amazon.co.jp         |Female|189.249.7.22   |
|null      |Pepi      |Kilbourne |null                              |Female|84.196.181.206 |
|4         |Irita     |Coltan    |icoltan3@reddit.com               |Female|103.96.233.190 |
|5         |Wilburt   |Sheals    |wsheals4@ebay.com                 |Male  |51.106.36.130  |
|6         |Viviene   |Ewestace  |vewestace5@cnbc.com               |Female|189.8.44.77    |
|7         |Grange    |Bernhardt |gbernhardt6@nationalgeographic.com|Male  |237.92.0.252   |
|8         |Boris     |Gladdish  |bgladdish7@rakuten.co.jp          |M

In [4]:
###  A. Selecting only first_name, last_name and email using the select API.
###     Also renaming field email to mail-id using withColumnRenamed

In [5]:
# Keep only the name and e-mail columns, then rename "email" to "mail-id".
name_and_mail_columns = ["first_name", "last_name", "email"]
web_visit_with_name_mail = (
    web_visits
    .select(*name_and_mail_columns)
    .withColumnRenamed("email", "mail-id")
)
web_visit_with_name_mail.show(truncate=False)

+----------+----------+----------------------------------+
|first_name|last_name |mail-id                           |
+----------+----------+----------------------------------+
|Marysa    |Arnull    |marnull0@t.co                     |
|Alys      |Mapledoram|amapledoram1@amazon.co.jp         |
|Pepi      |Kilbourne |null                              |
|Irita     |Coltan    |icoltan3@reddit.com               |
|Wilburt   |Sheals    |wsheals4@ebay.com                 |
|Viviene   |Ewestace  |vewestace5@cnbc.com               |
|Grange    |Bernhardt |gbernhardt6@nationalgeographic.com|
|Boris     |Gladdish  |bgladdish7@rakuten.co.jp          |
|Eldon     |Hamstead  |ehamstead8@issuu.com              |
|Patrice   |Buske     |pbuske9@shop-pro.jp               |
|Fallon    |Saint     |fsainta@youtu.be                  |
|Falkner   |Merrgan   |fmerrganb@mac.com                 |
|Shanta    |Tumber    |null                              |
|Daloris   |Howood    |dhowoodd@jiathis.com             

In [6]:
### B. Selecting only IP Address & Gender by dropping other fields using DROP API

In [7]:
# Drop every column except gender and ip_address.
columns_to_drop = ("first_name", "last_name", "email", "visitor_id")
visitor_ip_gender = web_visits.drop(*columns_to_drop)
visitor_ip_gender.show(truncate=False)

+------+---------------+
|gender|ip_address     |
+------+---------------+
|Female|148.118.170.117|
|Female|189.249.7.22   |
|Female|84.196.181.206 |
|Female|103.96.233.190 |
|Male  |51.106.36.130  |
|Female|189.8.44.77    |
|Male  |237.92.0.252   |
|Male  |214.120.52.45  |
|Male  |181.51.131.114 |
|Male  |8.4.93.195     |
|Female|28.41.126.59   |
|Male  |75.160.5.11    |
|Female|71.102.214.197 |
|Female|108.63.219.36  |
|Male  |62.178.138.189 |
|Female|49.136.89.178  |
|Male  |109.147.148.147|
|Male  |88.111.38.168  |
|Male  |205.216.12.141 |
|Male  |0.148.192.9    |
+------+---------------+
only showing top 20 rows



In [8]:
### C. Show Simple Transformation such as Concat by using Original Input by concatenating First & Last Name.
###    Also Prefix Constant Mr. Or Mrs. depending on value of gender.
###    Use function lit to append constants & When/Otherwise to evaluate condition for Gender
###    For inappropriate gender or null values omit Mr/Mrs and simply concatenate

In [9]:
# Build fullName as [title] + first_name + last_name, where the title is
# chosen from the gender column.
name_parts = [col("first_name"), col("last_name")]
is_male = col("gender") == "Male"
is_female = col("gender") == "Female"

full_name_expr = (
    when(is_male, f.concat(lit("Mr. "), *name_parts))
    .when(is_female, f.concat(lit("Mrs. "), *name_parts))
    # Any other gender value (including null) gets no title.
    .otherwise(f.concat(*name_parts))
)
web_visit_with_fullname = web_visits.withColumn("fullName", full_name_expr)

web_visit_with_fullname.show(truncate=False)

+----------+----------+----------+----------------------------------+------+---------------+------------------------+
|visitor_id|first_name|last_name |email                             |gender|ip_address     |fullName                |
+----------+----------+----------+----------------------------------+------+---------------+------------------------+
|1         |Marysa    |Arnull    |marnull0@t.co                     |Female|148.118.170.117|Mrs. MarysaArnull       |
|2         |Alys      |Mapledoram|amapledoram1@amazon.co.jp         |Female|189.249.7.22   |Mrs. AlysMapledoram     |
|null      |Pepi      |Kilbourne |null                              |Female|84.196.181.206 |Mrs. PepiKilbourne      |
|4         |Irita     |Coltan    |icoltan3@reddit.com               |Female|103.96.233.190 |Mrs. IritaColtan        |
|5         |Wilburt   |Sheals    |wsheals4@ebay.com                 |Male  |51.106.36.130  |Mr. WilburtSheals       |
|6         |Viviene   |Ewestace  |vewestace5@cnbc.com   

In [10]:
### D.Filter Rows from Original source where Gender is Invalid & Display the sample along with count.
###   Also validate the FullName values which do not have Mr. / Mrs. & Display the sample along with count from C.
###   The Count for both should be the same validating the transformation applied is correct

In [11]:
### PART1 Check Source

In [12]:
# A gender is invalid when it is missing or is anything other than
# "Male"/"Female". The null check is explicit because negating isin() on a
# null gender yields null, which filter() would otherwise discard.
gender_missing = col("gender").isNull()
gender_recognised = col("gender").isin("Male", "Female")
org_rec_with_invalid_gender = web_visits.filter(gender_missing | ~gender_recognised)
src_invalid_gender_count = org_rec_with_invalid_gender.count()

In [13]:
# Display the source rows whose gender is null or is neither "Male" nor "Female".
org_rec_with_invalid_gender.show(truncate=False)

+----------+----------+---------+--------------------------+-------------+--------------+
|visitor_id|first_name|last_name|email                     |gender       |ip_address    |
+----------+----------+---------+--------------------------+-------------+--------------+
|null      |Vasili    |Notman   |null                      |HRWMale      |240.10.38.196 |
|109       |Ferrel    |Leopard  |fleopard30@feedburner.com |MaleDER      |175.110.87.150|
|135       |Lavina    |Hardway  |lhardway3q@e-recht24.de   |FemalePNPSPAP|49.91.83.178  |
|144       |Tabitha   |Wallen   |twallen3z@altervista.org  |null         |17.250.15.126 |
|991       |Davin     |Ayer     |dayerri@barnesandnoble.com|null         |88.138.125.228|
+----------+----------+---------+--------------------------+-------------+--------------+



In [14]:
# Number of source rows with an invalid gender.
src_invalid_gender_count

5

In [15]:
### PART2 Check Transformed Values.
### Use startswith Function to ensure column Value Starts with Mr. or Mrs.

In [16]:
# Keep the rows whose fullName carries neither the "Mr." nor the "Mrs." prefix.
has_mr_prefix = col("fullName").startswith("Mr.")
has_mrs_prefix = col("fullName").startswith("Mrs.")
output_rec_with_invalid_fullnm = web_visit_with_fullname.filter(
    ~(has_mr_prefix | has_mrs_prefix)
)
invalid_fullnm_count = output_rec_with_invalid_fullnm.count()

In [17]:
# Display the transformed rows whose fullName has no "Mr."/"Mrs." prefix.
output_rec_with_invalid_fullnm.show(truncate=False)

+----------+----------+---------+--------------------------+-------------+--------------+-------------+
|visitor_id|first_name|last_name|email                     |gender       |ip_address    |fullName     |
+----------+----------+---------+--------------------------+-------------+--------------+-------------+
|null      |Vasili    |Notman   |null                      |HRWMale      |240.10.38.196 |VasiliNotman |
|109       |Ferrel    |Leopard  |fleopard30@feedburner.com |MaleDER      |175.110.87.150|FerrelLeopard|
|135       |Lavina    |Hardway  |lhardway3q@e-recht24.de   |FemalePNPSPAP|49.91.83.178  |LavinaHardway|
|144       |Tabitha   |Wallen   |twallen3z@altervista.org  |null         |17.250.15.126 |TabithaWallen|
|991       |Davin     |Ayer     |dayerri@barnesandnoble.com|null         |88.138.125.228|DavinAyer    |
+----------+----------+---------+--------------------------+-------------+--------------+-------------+



In [18]:
# Number of transformed rows without a title prefix; expected to equal
# src_invalid_gender_count.
invalid_fullnm_count

5

In [None]:
### E. Find Counts And Display Rows Having null values for Visitor ID.
### Use Coalesce to use ip_address as visitor id, if visitor id is blank

In [None]:
### PART1 Find Counts of source with null values for visitor id and display sample
### Sort/OrderBy by ip_address to ensure the same 20 results are visible before and after coalesce

In [20]:
# Source rows that have no visitor_id, plus how many there are.
visitor_id_missing = col("visitor_id").isNull()
web_visits_null_visitorid = web_visits.where(visitor_id_missing)
null_visitorid_count = web_visits_null_visitorid.count()

In [30]:
# ip_address is a string column, so this descending order is lexicographic
# (e.g. "7.217.3.163" sorts between "71.…" and "69.…"), not numeric.
web_visits_null_visitorid.orderBy("ip_address", ascending=False).show(truncate=False)

+----------+----------+---------+-----+-------+---------------+
|visitor_id|first_name|last_name|email|gender |ip_address     |
+----------+----------+---------+-----+-------+---------------+
|null      |Kerk      |Realph   |null |Male   |97.11.198.152  |
|null      |Mersey    |Edgeworth|null |Female |94.248.246.40  |
|null      |Nobe      |Graalman |null |Male   |92.191.8.232   |
|null      |Pepi      |Kilbourne|null |Female |84.196.181.206 |
|null      |Carmelita |Camock   |null |Female |75.195.58.173  |
|null      |Mack      |Poser    |null |Male   |72.122.182.45  |
|null      |Shanta    |Tumber   |null |Female |71.102.214.197 |
|null      |Irita     |Folli    |null |Female |7.217.3.163    |
|null      |Rupert    |Stiegars |null |Male   |69.237.226.67  |
|null      |Conant    |Hawler   |null |Male   |57.94.26.165   |
|null      |Kippy     |Glisenan |null |Male   |55.81.243.209  |
|null      |Stephenie |Kilpin   |null |Female |53.48.126.221  |
|null      |Vania     |Wrassell |null |F

In [31]:
# Number of source rows whose visitor_id is null.
null_visitorid_count

65

In [32]:
### PART2 Apply Coalesce Transformation for Source
### Filter visitorid having '.' using contains function to find how many records were impacted by coalesce & Take count.
### Display Sample Records and Sort/OrderBy IP Address descending to compare same set of records

In [34]:
# Replace visitor_id with ip_address wherever visitor_id is null.
visitor_id_or_ip = coalesce(col("visitor_id"), col("ip_address"))
web_visits_coalesced_visitorid = web_visits.withColumn("visitor_id", visitor_id_or_ip)

In [35]:
# Display the data after visitor_id has been back-filled from ip_address.
web_visits_coalesced_visitorid.show(truncate=False)

+--------------+----------+----------+----------------------------------+------+---------------+
|visitor_id    |first_name|last_name |email                             |gender|ip_address     |
+--------------+----------+----------+----------------------------------+------+---------------+
|1             |Marysa    |Arnull    |marnull0@t.co                     |Female|148.118.170.117|
|2             |Alys      |Mapledoram|amapledoram1@amazon.co.jp         |Female|189.249.7.22   |
|84.196.181.206|Pepi      |Kilbourne |null                              |Female|84.196.181.206 |
|4             |Irita     |Coltan    |icoltan3@reddit.com               |Female|103.96.233.190 |
|5             |Wilburt   |Sheals    |wsheals4@ebay.com                 |Male  |51.106.36.130  |
|6             |Viviene   |Ewestace  |vewestace5@cnbc.com               |Female|189.8.44.77    |
|7             |Grange    |Bernhardt |gbernhardt6@nationalgeographic.com|Male  |237.92.0.252   |
|8             |Boris     |Gla

In [40]:
# A "." in visitor_id is used as the marker for ids that were filled from
# ip_address (presumably genuine visitor ids never contain a dot — verify
# against the input data).
looks_like_ip = col("visitor_id").contains(".")
coalesced_visitorid_records = web_visits_coalesced_visitorid.where(looks_like_ip)
coalesced_records_count = coalesced_visitorid_records.count()

In [42]:
# Same descending (string) ordering on ip_address as the pre-coalesce sample,
# so the two displays can be compared row by row.
coalesced_visitorid_records.orderBy("ip_address", ascending=False).show(truncate=False)

+---------------+----------+---------+-----+-------+---------------+
|visitor_id     |first_name|last_name|email|gender |ip_address     |
+---------------+----------+---------+-----+-------+---------------+
|97.11.198.152  |Kerk      |Realph   |null |Male   |97.11.198.152  |
|94.248.246.40  |Mersey    |Edgeworth|null |Female |94.248.246.40  |
|92.191.8.232   |Nobe      |Graalman |null |Male   |92.191.8.232   |
|84.196.181.206 |Pepi      |Kilbourne|null |Female |84.196.181.206 |
|75.195.58.173  |Carmelita |Camock   |null |Female |75.195.58.173  |
|72.122.182.45  |Mack      |Poser    |null |Male   |72.122.182.45  |
|71.102.214.197 |Shanta    |Tumber   |null |Female |71.102.214.197 |
|7.217.3.163    |Irita     |Folli    |null |Female |7.217.3.163    |
|69.237.226.67  |Rupert    |Stiegars |null |Male   |69.237.226.67  |
|57.94.26.165   |Conant    |Hawler   |null |Male   |57.94.26.165   |
|55.81.243.209  |Kippy     |Glisenan |null |Male   |55.81.243.209  |
|53.48.126.221  |Stephenie |Kilpin

In [43]:
# Number of rows whose visitor_id contains "."; expected to equal
# null_visitorid_count.
coalesced_records_count

65

# Store all output records in CSV,TSV,| as well as JSON from Example Transformations in A,B,C,D,E

In [45]:
# Persist all the output transformation to ensure they are not computed multiple times

In [46]:
# Mark the four result DataFrames for caching, since each one is written out
# several times (CSV, TSV, pipe-delimited and JSON) in the cells below.
# The value displayed by this cell is the return of the last persist() call.
web_visit_with_name_mail.persist()
visitor_ip_gender.persist()
web_visit_with_fullname.persist()
web_visits_coalesced_visitorid.persist()

DataFrame[visitor_id: string, first_name: string, last_name: string, email: string, gender: string, ip_address: string]

In [50]:
# Writing Records in CSV,TSV,| and JSON for web_visit_with_name_mail
# mode("overwrite") makes the cell re-runnable: without an explicit mode,
# save() raises an error as soon as the target folder exists from an earlier run.
dataset_name = "web_visit_with_name_mail"
delimited_targets = [
    (",", output_datset_path_csv),
    ("\t", output_datset_path_tab),
    ("|", output_datset_path_pipe),
]
for delimiter, target_root in delimited_targets:
    (
        web_visit_with_name_mail.write
        .mode("overwrite")
        .format("csv")
        .options(delimiter=delimiter)
        .option("header", "true")
        .save(f"{target_root}/{dataset_name}")
    )
web_visit_with_name_mail.write.mode("overwrite").format("json").save(f"{output_datset_path_json}/{dataset_name}")

In [51]:
# Writing Records in CSV,TSV,| and JSON for visitor_ip_gender
# mode("overwrite") makes the cell re-runnable: without an explicit mode,
# save() raises an error as soon as the target folder exists from an earlier run.
dataset_name = "visitor_ip_gender"
delimited_targets = [
    (",", output_datset_path_csv),
    ("\t", output_datset_path_tab),
    ("|", output_datset_path_pipe),
]
for delimiter, target_root in delimited_targets:
    (
        visitor_ip_gender.write
        .mode("overwrite")
        .format("csv")
        .options(delimiter=delimiter)
        .option("header", "true")
        .save(f"{target_root}/{dataset_name}")
    )
visitor_ip_gender.write.mode("overwrite").format("json").save(f"{output_datset_path_json}/{dataset_name}")

In [52]:
# Writing Records in CSV,TSV,| and JSON for web_visit_with_fullname
# mode("overwrite") makes the cell re-runnable: without an explicit mode,
# save() raises an error as soon as the target folder exists from an earlier run.
dataset_name = "web_visit_with_fullname"
delimited_targets = [
    (",", output_datset_path_csv),
    ("\t", output_datset_path_tab),
    ("|", output_datset_path_pipe),
]
for delimiter, target_root in delimited_targets:
    (
        web_visit_with_fullname.write
        .mode("overwrite")
        .format("csv")
        .options(delimiter=delimiter)
        .option("header", "true")
        .save(f"{target_root}/{dataset_name}")
    )
web_visit_with_fullname.write.mode("overwrite").format("json").save(f"{output_datset_path_json}/{dataset_name}")

In [53]:
# Writing Records in CSV,TSV,| and JSON for web_visits_coalesced_visitorid
# mode("overwrite") makes the cell re-runnable: without an explicit mode,
# save() raises an error as soon as the target folder exists from an earlier run.
dataset_name = "web_visits_coalesced_visitorid"
delimited_targets = [
    (",", output_datset_path_csv),
    ("\t", output_datset_path_tab),
    ("|", output_datset_path_pipe),
]
for delimiter, target_root in delimited_targets:
    (
        web_visits_coalesced_visitorid.write
        .mode("overwrite")
        .format("csv")
        .options(delimiter=delimiter)
        .option("header", "true")
        .save(f"{target_root}/{dataset_name}")
    )
web_visits_coalesced_visitorid.write.mode("overwrite").format("json").save(f"{output_datset_path_json}/{dataset_name}")