In [1]:
# Hand-written sample records, one tuple per device, in the field order:
# (Id, Model, Year, ScreenSize, RAM, HDD, W, D, H, Weight).
sample_data = sc.parallelize([
    (1, 'MacBook Pro', 2015, '15"', '16GB', '512GB SSD', 13.75, 9.48, 0.61, 4.02),
    (2, 'MacBook', 2016, '12"', '8GB', '256GB SSD', 11.04, 7.74, 0.52, 2.03),
    (3, 'MacBook Air', 2016, '13.3"', '8GB', '128GB SSD', 12.8, 8.94, 0.68, 2.96),
    (4, 'iMac', 2017, '27"', '64GB', '1TB SSD', 25.6, 8.0, 20.3, 20.8),
])

In [2]:
# Build a DataFrame from the RDD of tuples. Only column names are supplied,
# so the column types are inferred from the data.
sample_data_df = spark.createDataFrame(
    sample_data,
    schema=[
        'Id', 'Model', 'Year', 'ScreenSize', 'RAM',
        'HDD', 'W', 'D', 'H', 'Weight',
    ],
)

In [3]:
# Print the DataFrame as a text table to check the rows and column order.
sample_data_df.show()

+---+-----------+----+----------+----+---------+-----+----+----+------+
| Id|      Model|Year|ScreenSize| RAM|      HDD|    W|   D|   H|Weight|
+---+-----------+----+----------+----+---------+-----+----+----+------+
|  1|MacBook Pro|2015|       15"|16GB|512GB SSD|13.75|9.48|0.61|  4.02|
|  2|    MacBook|2016|       12"| 8GB|256GB SSD|11.04|7.74|0.52|  2.03|
|  3|MacBook Air|2016|     13.3"| 8GB|128GB SSD| 12.8|8.94|0.68|  2.96|
|  4|       iMac|2017|       27"|64GB|  1TB SSD| 25.6| 8.0|20.3|  20.8|
+---+-----------+----+----------+----+---------+-----+----+----+------+



In [4]:
# Print each column's name, data type and nullability.
sample_data_df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- ScreenSize: string (nullable = true)
 |-- RAM: string (nullable = true)
 |-- HDD: string (nullable = true)
 |-- W: double (nullable = true)
 |-- D: double (nullable = true)
 |-- H: double (nullable = true)
 |-- Weight: double (nullable = true)



In [5]:
import pyspark.sql as sql
import pyspark.sql.functions as f
# Derive three extra columns from the sample DataFrame:
#   HDD_size    - first space-separated token of HDD (e.g. '512GB')
#   HDD_type    - second space-separated token of HDD (e.g. 'SSD')
#   Volume_cuIn - H * D * W, rounded to the nearest whole number
#
# This uses native column expressions instead of dropping to `.rdd` and
# rebuilding every Row in three separate Python `map` passes. The rows stay
# inside the Spark SQL engine, the original schema is kept as-is (no
# re-inference by `toDF()`), and the column order does not depend on how
# `Row(**kwargs)` orders its fields in a given Spark version.
hdd_parts = f.split(f.col('HDD'), ' ')

sample_data_transformed = sample_data_df.select(
    '*',
    hdd_parts.getItem(0).alias('HDD_size'),
    hdd_parts.getItem(1).alias('HDD_type'),
    # Same multiplication order as the original (H * D * W) so the
    # floating-point result before rounding is unchanged.
    f.round(f.col('H') * f.col('D') * f.col('W')).alias('Volume_cuIn'),
)

In [6]:
# Show the original columns plus the derived HDD_size, HDD_type and Volume_cuIn.
sample_data_transformed.show()

+---+-----------+----+----------+----+---------+-----+----+----+------+--------+--------+-----------+
| Id|      Model|Year|ScreenSize| RAM|      HDD|    W|   D|   H|Weight|HDD_size|HDD_type|Volume_cuIn|
+---+-----------+----+----------+----+---------+-----+----+----+------+--------+--------+-----------+
|  1|MacBook Pro|2015|       15"|16GB|512GB SSD|13.75|9.48|0.61|  4.02|   512GB|     SSD|       80.0|
|  2|    MacBook|2016|       12"| 8GB|256GB SSD|11.04|7.74|0.52|  2.03|   256GB|     SSD|       44.0|
|  3|MacBook Air|2016|     13.3"| 8GB|128GB SSD| 12.8|8.94|0.68|  2.96|   128GB|     SSD|       78.0|
|  4|       iMac|2017|       27"|64GB|  1TB SSD| 25.6| 8.0|20.3|  20.8|     1TB|     SSD|     4157.0|
+---+-----------+----+----------+----+---------+-----+----+----+------+--------+--------+-----------+



In [7]:
# NOTE(review): `sc` and `spark` are already used by the cells above, so this
# bootstrap presumably has to run before them (or is redundant in this
# kernel) — confirm, and move it to the top of the notebook if it is needed.
import findspark
findspark.init()

import pyspark


import pyspark.sql.functions as f
import pandas as pd
from scipy import stats
# One million sequential ids, each paired with a random `val`.
# f.rand() is called without a seed, so the values change from run to run.
# cache() returns the same DataFrame, so it can end the chain.
big_df = spark.range(0, 1000000).withColumn('val', f.rand()).cache()
big_df.show(3)
@f.pandas_udf('double', f.PandasUDFType.SCALAR)
def pandas_pdf(v):
    """Evaluate the standard normal PDF for each value in a batch.

    Registered as a scalar pandas UDF that returns 'double'.

    Parameters
    ----------
    v : batch of numeric values passed in by the UDF machinery.

    Returns
    -------
    pandas.Series
        ``scipy.stats.norm.pdf`` applied to ``v``, one result per input value.
    """
    densities = stats.norm.pdf(v)
    return pd.Series(densities)
# Add a 'probability' column by applying the pandas UDF to `val`, then
# display the first five rows.
big_df.withColumn('probability', pandas_pdf(big_df['val'])).show(5)

+---+-------------------+
| id|                val|
+---+-------------------+
|  0| 0.5944708698472357|
|  1|0.34362507517305874|
|  2| 0.2501750762673367|
+---+-------------------+
only showing top 3 rows

+---+-------------------+-------------------+
| id|                val|        probability|
+---+-------------------+-------------------+
|  0| 0.5944708698472357|0.33432679348128996|
|  1|0.34362507517305874|0.37607088502856584|
|  2| 0.2501750762673367| 0.3866511871448149|
|  3| 0.2564542630918356| 0.3860366647096993|
|  4|0.21242539500233737| 0.3900420179121062|
+---+-------------------+-------------------+
only showing top 5 rows

