## Statistics with SparkML
#### pyspark.ml.stat is only available in Spark 2.4.0.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[5]').appName('SPark ML Statistics Handson').getOrCreate()

In [2]:
df = (
    spark
    .read
    .csv(
        'H:\Training\PySpark\data\yellow_tripdata_2019-01.csv_Pieces/yellow_tripdata_2019-01_1.csv'
        , header=True
        , inferSchema=True)
)

In [37]:
cols=df.columns

In [43]:
print(cols)

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge']


In [44]:
dfselect=df.select(cols[3:5])

In [45]:
dfRDD=dfselect.rdd.map(list)

In [46]:
dfRDD.top(10)

[[8, 5.08],
 [8, 0.01],
 [6, 40.77],
 [6, 23.77],
 [6, 22.96],
 [6, 21.39],
 [6, 20.25],
 [6, 20.11],
 [6, 19.96],
 [6, 19.8]]

### Stats

In [47]:
from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(dfRDD)

In [49]:
summary.mean()

array([1.71366  , 3.0834406])

In [50]:
print(summary.mean())  # a dense vector containing the mean value for each column
print(summary.variance())  # column-wise variance
print(summary.numNonzeros())  # number of nonzeros in each column

[1.71366   3.0834406]
[ 1.64426229 10.19566132]
[49344. 49545.]


## Correlation Statistics

In [51]:
print(Statistics.corr(dfRDD, method="pearson"))

[[ 1.000000e+00 -2.789746e-04]
 [-2.789746e-04  1.000000e+00]]


## Hypothesis Testing
#### Statistics provides methods to run Pearson’s chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests.

In [52]:
from pyspark.mllib.linalg import Matrices

In [108]:
dfselect.show()

+---------------+-------------+
|passenger_count|trip_distance|
+---------------+-------------+
|              1|          1.5|
|              1|          2.6|
|              3|          0.0|
|              5|          0.0|
|              5|          0.0|
|              5|          0.0|
|              5|          0.0|
|              1|          1.3|
|              1|          3.7|
|              2|          2.1|
|              2|          2.8|
|              1|          0.7|
|              1|          8.7|
|              1|          6.3|
|              1|          2.7|
|              1|         0.38|
|              1|         0.55|
|              1|          0.3|
|              1|         1.42|
|              1|         1.72|
+---------------+-------------+
only showing top 20 rows



In [109]:
cf=dfselect.crosstab('passenger_count','trip_distance')

In [110]:
cm= np.array(cf.collect())

In [111]:
independenceTestResult = Statistics.chiSqTest(cm)

In [123]:
print(independenceTestResult)

Chi squared test summary:
method: pearson
degrees of freedom = 14135 
statistic = 1261600.1704404952 
pValue = 0.0 
Very strong presumption against null hypothesis: observed follows the same distribution as expected..


In [124]:
vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

# compute the goodness of fit. If a second vector to test against
# is not supplied as a parameter, the test runs against a uniform distribution.
goodnessOfFitTestResult = Statistics.chiSqTest(vec)

# summary of the test including the p-value, degrees of freedom,
# test statistic, the method used, and the null hypothesis.
print("%s\n" % goodnessOfFitTestResult)

Chi squared test summary:
method: pearson
degrees of freedom = 4 
statistic = 0.12499999999999999 
pValue = 0.998126379239318 
No presumption against null hypothesis: observed follows the same distribution as expected..



In [126]:
 from pyspark.mllib.stat import KernelDensity

# an RDD of sample data
data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])

# Construct the density estimator with the sample data and a standard deviation for the Gaussian
# kernels
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(3.0)

# Find density estimates for the given values
densities = kd.estimate([-1.0, 2.0, 5.0])

In [130]:
densities.data()

TypeError: 'memoryview' object is not callable