In [0]:
#%run "./Includes/Classroom-Setup"

In [0]:
from pyspark.sql.functions import col, rand

df = (spark.range(1,100000)
     .withColumn("id", (col('id') / 1000).cast('integer')) # get the id and divided to obtain a new one
     .withColumn('v', rand(seed=1))) # random column 
      
df

In [0]:
df.count()

In [0]:
df.explain(extended=True) 

In [0]:
display(df.sample(.001))

id,v
2,0.2972678830040518
2,0.3600125197796093
3,0.6008990160003462
5,0.5408346498257187
5,0.5435817415135986
5,0.5398515256187291
8,0.6697933372886988
10,0.7684137580180018
10,0.5307415383445514
11,0.4866479907849044


In [0]:
# Transform python table to sql spark table (view sql) in order to query our table with SQL command
df.createOrReplaceTempView('df_temp')

In [0]:
%sql
SELECT * 
FROM df_temp
LIMIT 10

id,v
0,0.6363787615254752
0,0.5993846534021868
0,0.134842710012538
0,0.076841639054609
0,0.8539211111755448
0,0.7167704217972344
0,0.2473902407597975
0,0.1367450741851369
0,0.3869569887491171
0,0.6051540605040805


In [0]:
# at each execution, everything (id, table, project creation) is done 
# to avoid this and gain time, we can save it in cache
df.cache().count()

In [0]:
df.count() # (time divided by 3)

In [0]:
%python
# read data 
query = "select * from listings"
dataspark = spark.sql(query)
data = dataspark.toPandas()
display(dataspark)

id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
958,https://www.airbnb.com/rooms/958,20210804005755,2021-08-04,"Bright, Modern Garden Unit - 1BR/1BTH","Please check local laws re Covid before you request a reservation. Our bright garden unit overlooks a grassy backyard area with fruit trees and native plants. It is an oasis in a big city. The apartment comfortably fits a couple or small family. It is located on a cul de sac street that ends at lovely Duboce Park. The space Newly remodeled, modern, and bright garden unit in historic Victorian home. New fixtures and finishes. Zero VOC and non-toxic paint. Organic and fair-trade teas, fresh local ground coffee. Local art on walls. Sofa bed and Queen bed are in the same room. More of a petite apartment with a separate room for dining and kitchen. Guest access Full access to patio and backyard (Shared) Beautiful garden with fruit trees & native plants Washer & dryer (Shared) Children's toys Charcoal grill Other things to note Due to the fact that we have chil","Quiet cul de sac in friendly neighborhood Steps away from grassy park with 2 playgrounds and Recreational Center Very family-friendly neighborhood Quaint shops, grocery stores and restaurants all within a 5-10 minute walk",https://a0.muscache.com/pictures/b7c2a199-4c17-4ba6-b81d-751719d2dac6.jpg,1169,https://www.airbnb.com/users/show/1169,Holly,2008-07-31,"San Francisco, California, United States",We are a family of four that live upstairs. We have a large dog who occasionally can be seen in the backyard. We have lived in our home since 2005 and have been renting our apartment since 2008.,within an hour,100%,92%,t,https://a0.muscache.com/im/pictures/user/efdad96a-3efc-4bc2-bdc9-f69740a5a818.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/efdad96a-3efc-4bc2-bdc9-f69740a5a818.jpg?aki_policy=profile_x_medium,Duboce Triangle,1,1,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,t,"San Francisco, California, United States",Western Addition,,37.77028,-122.43317,Entire serviced apartment,Entire home/apt,3,,1 bath,1,2,"[""Iron"", ""Hot water"", ""Essentials"", ""Oven"", ""Hair dryer"", ""Room-darkening shades"", ""Backyard"", ""Stove"", ""BBQ grill"", ""Kitchen"", ""Hangers"", ""Long term stays allowed"", ""Microwave"", ""Dishes and silverware"", ""Smoke alarm"", ""Shampoo"", ""Coffee maker"", ""TV with standard cable"", ""Outdoor furniture"", ""Washer"", ""Wifi"", ""Cooking basics"", ""Refrigerator"", ""Patio or balcony"", ""Carbon monoxide alarm"", ""Free street parking"", ""Dedicated workspace"", ""Cable TV"", ""Pack \u2019n Play/travel crib"", ""Heating"", ""First aid kit"", ""Private entrance"", ""Keypad"", ""Dryer""]",$160.00,2,1125,2,2,1125,1125,2.0,1125.0,,t,2,10,18,129,2021-08-04,291,34,4,2014-10-05,2021-07-29,4.87,4.94,4.94,4.96,4.9,4.98,4.78,City Registration Pending,f,1.0,1,0,0,3.5
5858,https://www.airbnb.com/rooms/5858,20210804005755,2021-08-04,Creative Sanctuary,"The space We live in a large Victorian house on a quiet street, nestled between the Mission district and Noe Valley. We're one block from the streetcar stop and five blocks from the subway. We have three bedrooms, two of which have double beds and windows which cast their light upon the high, warm colored walls and ceilings of this haven. We have lots of spaces in the house where you can relax, read, listen to music and just cozy up. The kitchen is well stocked and we have a bountiful garden and back deck. Amenities include: Laundry, wireless, sun room, garden, sun deck and a full kitchen. We also have a library (not perfectly organized but filled with great books). We both are self employed creatives. I own a business involved in art and design and Tania is an arts journalist. We enjoy meeting people and are well versed in the art and design communities of the Bay Area. We also know some really good haunts. Guest access Our","I love how our neighborhood feels quiet but is so close to lots of hustle and bustle. There is the train nearby that takes you straight downtown and we are smack in the middle of Noe Valley (quiet and family oriented), The Mission (hip and lots of great restaurants and bars) and Bernal Heights (more good restaurants!).",https://a0.muscache.com/pictures/17714/3a7aea10_original.jpg,8904,https://www.airbnb.com/users/show/8904,Philip And Tania,2009-03-02,"San Francisco, California, United States","Philip: English transplant to the Bay Area and half Spanish adoptive son of hardworking Yorkshire folk. I'm married to the one, engaged to my work and in love with exploring the merging of design art and craft.",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Tania: SF raised with a NY energy and an Armenian background that I am proud of. I'm an international journalist,interviewing artists and producing for radio and print. I speak several languages and love to meet people. Found my husband in a cafe in NY and came back to SF to live near my Grandma. Oh,and I love plants.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
""",within a few hours,70%,74%,f,https://a0.muscache.com/im/users/8904/profile_pic/1283840786/original.jpg?aki_policy=profile_small,https://a0.muscache.com/im/users/8904/profile_pic/1283840786/original.jpg?aki_policy=profile_x_medium,Bernal Heights,2,2,""['email'",'phone','reviews','kba',"'work_email']""",t,t,"San Francisco, California, United States",Bernal Heights,,37.74474,-122.42089,Entire rental unit,Entire home/apt,5,,1 bath,2,3,"[""Fire extinguisher"", ""Iron"", ""Essentials"", ""Dedicated workspace"", ""Hangers"", ""Long term stays allowed"", ""Heating"", ""Washer"", ""Hair dryer"", ""First aid kit"", ""Wifi"", ""Smoke alarm"", ""Private entrance"", ""Shampoo"", ""Dryer"", ""Kitchen""]",$235.00,30,60,30,30,60,60,30.0,60.0,,t,30,60,90,365,2021-08-04,111,0,0,2009-11-24,2015-08-28,4.88,4.85,4.87,4.89,4.85,4.77,4.68,,f,1,1,0,0,0.78,,,,,,,,,,,,,,,,,,,
7918,https://www.airbnb.com/rooms/7918,20210804005755,2021-08-04,A Friendly Room - UCSF/USF - San Francisco,"Nice and good public transportation. 7 minutes walk to UCSF. 15 minutes walk to USF, ST Mary Hospital Room rental-sunny view room/sink/Wi Fi (inner sunset / UCSF) (map) The place is located within walking distance to Golden Gate Park, UCSF & USF The space Settle down, S.F. resident, student, hospital, job relocated, rotation. Monthly rental, short term staying - Nice public transportation location. No tourist or transient. No pet no smoking no party inside the building. Medium private lock room(130 square feet) single person.  Large private lock room(190 square feet) single person. Extra large private room(230 square feet) single person. Additional bed $12 per night. Surround by sunny view and good transportation. Room with sink(apartment style - shared large eat in kitchen/2 full & 2 half bathrooms). Quiet house under management. Wa","Shopping old town, restaurants, McDonald, Whole food & Target stores.",https://a0.muscache.com/pictures/26356/8030652f_original.jpg,21994,https://www.airbnb.com/users/show/21994,Aaron,2009-06-17,"San Francisco, California, United States","7 minutes walk to UCSF hospital & school campus. 15 minutes walk to USF, St. Mary Hospital. 5 minute walk to Goldern Gate Park/Whole food/ regular food store and many different style restaurants & stores in stylish town.",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Two ways public muni transportation in front the building to sunset areas/downtown/Goldern Gate Bridge/Cal Train station/Airport. Other 5 minutes walking distance bus stops to downtown and different areas.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
I have medium side room with one bed only and an extra large size of room with one single or one full size bed only. The price just a little bit different. Do you still want it?,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Please email your job offer letter and current official picture ID / acceptable credit report and score and financial information after you make reservation.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Additional adult person $12 per night & One time cleaning fee $50 and security deposit will be collected upon your arrival.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Wi-Fi signal in common areas. Large eat in kitchen. Sunny sitting room.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
# Data cleaning / cleansing Part

In [0]:
data.columns

In [0]:
# select columns
columnsToKeep = ['host_is_superhost','review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value','price','minimum_nights', 'maximum_nights','beds','number_of_reviews','bedrooms','bathrooms','room_type', 'accommodates',
       'neighbourhood_group_cleansed', 'latitude','longitude', 'property_type', 'host_total_listings_count','instant_bookable','host_is_superhost']

baseDF = dataspark.select(columnsToKeep)
baseDF.cache().count()
display(baseDF)

# columns in spark should be double instead of integer

host_is_superhost,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,host_is_superhost.1
t,4.94,4.96,4.9,4.98,4.78,$160.00,2,1125,2,291,1,,Entire home/apt,3,,37.77028,-122.43317,Entire serviced apartment,1,f,t
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
2,,,,,,2009-11-24,2015-08-28,4.88,0,,111,365,60,90,60.0,,t,30,60,,2
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,


In [0]:
# change data type + special character replacement
from pyspark.sql.functions import col, translate

FixedPrice = baseDF.withColumn("price", translate(col('price'), '$,', '').cast('double'))
FixedPrice = FixedPrice.withColumn("minimum_nights", baseDF.minimum_nights.cast('double'))
FixedPrice = FixedPrice.withColumn("review_scores_value",baseDF.review_scores_value.cast('double'))
FixedPrice = FixedPrice.withColumn("bedrooms",baseDF.bedrooms.cast('double'))
display(FixedPrice)


host_is_superhost,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,host_is_superhost.1
t,4.94,4.96,4.9,4.98,4.78,160.0,2.0,1125,2,291,1.0,,Entire home/apt,3,,37.77028,-122.43317,Entire serviced apartment,1,f,t
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,4.88,0,,111.0,365,60,90,60.0,,t,30,60,,2
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,


In [0]:
display(FixedPrice.describe())

summary,host_is_superhost,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,host_is_superhost.1
count,6610,3802,3674,3592,3553,3518.0,6091.0,6227.0,6503,6414,6261,4638.0,2075,6496,6440,2029,6547,6377,6578,6401,4592,6610
mean,-73.79850615083059,4.460267594326998,4.724911504424775,4.749074125874125,4.747696867061813,4.623902785673681,196.8848054506649,36.57938333065682,597.5567256153142,21.331498121222022,7828.92100394811,35.407869771453214,399.7094618901099,472.7680895169851,49.379379714646056,367.44098685152056,111.68021295906718,-38.350079477792036,429.5308663224326,28.14603630534351,3.5173333333333328,-73.79850615083059
stddev,68.89984419200307,1.194631631782209,0.8923977633880402,0.7800715669169154,0.6382456294136449,0.6142419557446055,712.3445579491331,85.17122646306828,13507.48796052392,124.75915087343488,88302.40535231077,178.76148308079178,509.25779593161616,529.9967518243677,205.17114999901258,469.54131262125526,278.2033950848377,247.4359225366796,494.71537433861783,204.96735147005256,2.415940493569423,68.89984419200307
min,Cal Shakes,Gavin Newsom,Joe Biden,Alliance for Climate Protection,Barack Obama,0.0,0.0,0.0,Aurora Theatre Company,American Free Enterprise,Alicia Blue Gallery,0.0,Village of Ottawa Hills,Brewer for Treasurer,ShitMyDadSays,David Suzuki,Arianna Huffington,(Hidden by Airbnb) ! News,Level The Playing Field,30 Rock,The Economist,Cal Shakes
max,within an hour,t,f,f,t,5.9,25000.0,1125.0,t,t,t,1125.0,t,t,t,t,t,t,t,t,t,within an hour


In [0]:
display(FixedPrice.summary())

summary,host_is_superhost,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,host_is_superhost.1
count,6610,3802,3674,3592,3553,3518.0,6091.0,6227.0,6503,6414,6261,4638.0,2075,6496,6440,2029,6547,6377,6578,6401,4592,6610
mean,-73.79850615083058,4.460267594326998,4.7249115044247745,4.749074125874124,4.747696867061812,4.623902785673681,196.88480545066489,36.57938333065681,597.5567256153143,21.331498121222022,7828.92100394811,35.407869771453214,399.7094618901098,472.7680895169851,49.379379714646056,367.44098685152056,111.68021295906718,-38.350079477792036,429.5308663224326,28.14603630534351,3.5173333333333328,-73.79850615083058
stddev,68.89984419200307,1.1946316317822092,0.8923977633880403,0.7800715669169154,0.6382456294136449,0.6142419557446055,712.3445579491331,85.17122646306827,13507.487960523918,124.75915087343485,88302.40535231077,178.76148308079175,509.2577959316161,529.9967518243678,205.17114999901258,469.54131262125514,278.2033950848377,247.43592253667964,494.71537433861783,204.9673514700526,2.415940493569423,68.89984419200307
min,Cal Shakes,Gavin Newsom,Joe Biden,Alliance for Climate Protection,Barack Obama,0.0,0.0,0.0,Aurora Theatre Company,American Free Enterprise,Alicia Blue Gallery,0.0,Village of Ottawa Hills,Brewer for Treasurer,ShitMyDadSays,David Suzuki,Arianna Huffington,(Hidden by Airbnb) ! News,Level The Playing Field,30 Rock,The Economist,Cal Shakes
25%,-122.43096,4.6,4.86,4.86,4.76,4.57,57.0,2.0,25.0,1.0,1.0,1.0,5.0,14.0,2.0,8.0,37.73409,-122.43424,8.0,1.0,1.0,-122.43096
50%,-122.4099,4.89,4.97,4.98,4.92,4.75,109.0,23.0,95.0,1.0,5.0,1.0,62.0,90.0,3.0,35.0,37.76608,-122.413,72.0,2.0,4.5,-122.4099
75%,2.0,5.0,5.0,5.0,5.0,4.91,199.0,30.0,1125.0,3.0,38.0,2.0,1125.0,1125.0,6.0,730.0,37.78822,1.0,1125.0,10.0,5.0,2.0
max,within an hour,t,f,f,t,5.9,25000.0,1125.0,t,t,t,1125.0,t,t,t,t,t,t,t,t,t,within an hour


In [0]:
FixedPrice.count()

In [0]:
# filtering outlier
posPricesDF = FixedPrice.filter(col("price") > 0)
posPricesDF.count()

In [0]:
display(posPricesDF.
       groupby('minimum_nights').count()
       .orderBy(col('count').desc(), col('minimum_nights')))

# 3 years is probably an outlier
minNightsDF = posPricesDF.filter(col('minimum_nights') <= 365)

minimum_nights,count
30.0,2151
1.0,844
2.0,738
3.0,359
4.0,137
5.0,106
,78
90.0,77
31.0,72
7.0,71


In [0]:
from pyspark.sql.functions import when

inputCols = ['bedrooms','beds','bathrooms']

for c in inputCols:
  doubleDF = minNightsDF.withColumn(c + "_na", when(col(c).isNull(),1.0).otherwise(0.0))  # add new columns to process, when I have an NA, we replace by 1
  
display(doubleDF.describe())

summary,host_is_superhost,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,minimum_nights,maximum_nights,beds,number_of_reviews,bedrooms,bathrooms,room_type,accommodates,neighbourhood_group_cleansed,latitude,longitude,property_type,host_total_listings_count,instant_bookable,host_is_superhost.1,bathrooms_na
count,5337,3535,3466,3467,3465,3461.0,5441.0,5441.0,5434,5303,5358,4048.0,925,5386,5440,911,5440,5433,5410,5295,4536,5337,5441.0
mean,-86.25337911941139,4.704621944286526,4.870806824754189,4.826570188133138,4.783537749493782,4.633163825483968,216.06733137290945,36.28046498805367,675.3487417218544,4.836754643206256,1965.8546999999999,18.55755928853755,526.3614207650273,628.0241736419754,34.058245098954444,441.6567021984924,53.28265961400404,-88.72070466448169,517.5431818181818,25.006545813702182,1.222,-86.25337911941139,0.829994486307664
stddev,64.70570426498001,0.6526806226778857,0.41201012459754116,0.5194165853085787,0.5109667198643552,0.5816602349600675,748.6376407801803,67.86981063133304,14389.630941076872,24.4049587614573,38136.82312228434,131.08609109530778,532.375026528353,523.7942896863683,173.80698510846133,510.5285336126208,135.28106677449364,124.96627588517778,532.8188079377957,204.875922135992,1.670459018753029,64.70570426498001,0.3756721671557209
min,$125.00,0.0,0,0,0,0.0,1.0,0.0,0,$129.00,0,0.0,$594.00,$125.00,$120.00,$100.00,0,-122.36823,0,"$1,695.00",0,$125.00,0.0
max,within an hour,t,f,f,t,5.0,25000.0,365.0,"[""Fire extinguisher"", ""Essentials"", ""Hot water"", ""Hangers"", ""Cable TV"", ""Heating"", ""Building staff"", ""Hair dryer"", ""Free street parking"", ""Wifi"", ""Breakfast"", ""Smoke alarm"", ""Paid parking off premises"", ""Shampoo"", ""Indoor fireplace"", ""Carbon monoxide alarm"", ""TV with standard cable""]",t,t,1125.0,"[""TV"", ""Iron"", ""Hot water"", ""Essentials"", ""Oven"", ""Hair dryer"", ""Dishwasher"", ""Stove"", ""Kitchen"", ""Hangers"", ""Microwave"", ""Dishes and silverware"", ""Smoke alarm"", ""Lockbox"", ""Shampoo"", ""Coffee maker"", ""Fire extinguisher"", ""Wifi"", ""Cooking basics"", ""Refrigerator"", ""Patio or balcony"", ""Carbon monoxide alarm"", ""Free street parking"", ""Dedicated workspace"", ""Heating"", ""Private entrance""]",t,t,t,Shared room in residential home,Shared room,"[""TV"", ""Iron"", ""Essentials"", ""Oven"", ""Hair dryer"", ""Room-darkening shades"", ""Stove"", ""Kitchen"", ""Hangers"", ""Long term stays allowed"", ""Microwave"", ""Dishes and silverware"", ""Smoke alarm"", ""Shampoo"", ""Coffee maker"", ""Fire extinguisher"", ""Security cameras on property"", ""Washer"", ""Wifi"", ""Cooking basics"", ""Refrigerator"", ""Patio or balcony"", ""Carbon monoxide alarm"", ""Free street parking"", ""Heating"", ""First aid kit"", ""Lock on bedroom door"", ""Dryer""]",t,t,within an hour,1.0


In [0]:
# inputation
from pyspark.ml.feature import Imputer

imputCols = ['bathrooms_na']

imputer = Imputer(strategy='median', inputCols = imputCols, outputCols=imputCols)
imputerModel = imputer.fit(doubleDF)
imputeDF = imputerModel.transform(doubleDF)

In [0]:
#imputeDF.write.format('delta').mode('overwrite').save()
imputeDF = imputeDF.drop('host_is_superhost')
# to save in the WS
imputeDF.write.format('parquet').mode("overwrite").saveAsTable("training_dataset_numeric")