In [31]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, Imputer, OneHotEncoderEstimator
from pyspark.sql.functions import corr, udf, regexp_replace, desc, col, when
from pyspark.sql.types import DoubleType

In [32]:
spark = SparkSession.builder.master("local[4]").appName("spg2").getOrCreate()

In [33]:
df = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load('data/exercise_03_train.csv')

In [34]:
df.printSchema()

root
 |-- x0: double (nullable = true)
 |-- x1: double (nullable = true)
 |-- x2: double (nullable = true)
 |-- x3: double (nullable = true)
 |-- x4: double (nullable = true)
 |-- x5: double (nullable = true)
 |-- x6: double (nullable = true)
 |-- x7: double (nullable = true)
 |-- x8: double (nullable = true)
 |-- x9: double (nullable = true)
 |-- x10: double (nullable = true)
 |-- x11: double (nullable = true)
 |-- x12: double (nullable = true)
 |-- x13: double (nullable = true)
 |-- x14: double (nullable = true)
 |-- x15: double (nullable = true)
 |-- x16: double (nullable = true)
 |-- x17: double (nullable = true)
 |-- x18: double (nullable = true)
 |-- x19: double (nullable = true)
 |-- x20: double (nullable = true)
 |-- x21: double (nullable = true)
 |-- x22: double (nullable = true)
 |-- x23: double (nullable = true)
 |-- x24: double (nullable = true)
 |-- x25: double (nullable = true)
 |-- x26: double (nullable = true)
 |-- x27: double (nullable = true)
 |-- x28: double (nullabl

In [35]:
for x in range(0, len(df.columns)):
    df.select(df.columns[x]).describe().show()

+-------+------------------+
|summary|                x0|
+-------+------------------+
|  count|             39989|
|   mean|23.170163974827418|
| stddev|108.21409581409586|
|    min|-413.8864838788073|
|    max| 499.6709528499989|
+-------+------------------+

+-------+--------------------+
|summary|                  x1|
+-------+--------------------+
|  count|               39990|
|   mean|-0.05898321626309...|
| stddev| 0.27696357550670037|
|    min| -1.1706779593267254|
|    max|  1.0941584537240783|
+-------+--------------------+

+-------+-------------------+
|summary|                 x2|
+-------+-------------------+
|  count|              39994|
|   mean|  1.639221469459076|
| stddev|  36.48343283384617|
|    min|-155.50638202658646|
|    max| 146.66296966177242|
+-------+-------------------+

+-------+-------------------+
|summary|                 x3|
+-------+-------------------+
|  count|              39990|
|   mean|-0.8156198657092192|
| stddev|  16.66105734554434|
|    mi

+-------+-------------------+
|summary|                x31|
+-------+-------------------+
|  count|              39992|
|   mean|0.04360227056216378|
| stddev|   9.41799653183694|
|    min|-38.970805817853744|
|    max| 37.285784165641736|
+-------+-------------------+

+-------+--------------------+
|summary|                 x32|
+-------+--------------------+
|  count|               39995|
|   mean|0.010438844947824235|
| stddev|   8.859309115597817|
|    min|  -37.05825854173414|
|    max|   42.27748004041602|
+-------+--------------------+

+-------+-------------------+
|summary|                x33|
+-------+-------------------+
|  count|              39991|
|   mean|-1.9970130000210122|
| stddev| 10.374541020833243|
|    min|-47.939956764878765|
|    max| 41.234313410755874|
+-------+-------------------+

+-------+----------+
|summary|       x34|
+-------+----------+
|  count|     39993|
|   mean|      null|
| stddev|      null|
|    min|     Honda|
|    max|volkswagon|
+-------+-

+-------+--------------------+
|summary|                 x64|
+-------+--------------------+
|  count|               39994|
|   mean|0.002640133417181...|
| stddev|  1.6883341440817452|
|    min|  -6.469411274884286|
|    max|    7.15868346758771|
+-------+--------------------+

+-------+--------------------+
|summary|                 x65|
+-------+--------------------+
|  count|               39986|
|   mean|-0.01858719880843603|
| stddev|   9.760571539676954|
|    min|  -42.63340113535817|
|    max|   42.66582184872517|
+-------+--------------------+

+-------+--------------------+
|summary|                 x66|
+-------+--------------------+
|  count|               39989|
|   mean|-0.10780489058624029|
| stddev|  1.8015694396439799|
|    min|  -8.333819505628679|
|    max|   9.580095666308337|
+-------+--------------------+

+-------+--------------------+
|summary|                 x67|
+-------+--------------------+
|  count|               39992|
|   mean|-0.01260216428628...|
| std

+-------+-------------------+
|summary|                x96|
+-------+-------------------+
|  count|              39987|
|   mean|-0.5051002488905024|
| stddev| 13.254743527932606|
|    min| -60.31087974588879|
|    max| 54.964037550786166|
+-------+-------------------+

+-------+-------------------+
|summary|                x97|
+-------+-------------------+
|  count|              39988|
|   mean|-1.0655546215346372|
| stddev|  7.673260043603108|
|    min|-30.759058275141353|
|    max|  32.23603287176438|
+-------+-------------------+

+-------+--------------------+
|summary|                 x98|
+-------+--------------------+
|  count|               39993|
|   mean|0.004239112405158896|
| stddev|  1.0209036217396876|
|    min| -4.2646705139277135|
|    max|     4.1146388930204|
+-------+--------------------+

+-------+-------------------+
|summary|                x99|
+-------+-------------------+
|  count|              39989|
|   mean|0.30148270088042406|
| stddev|  8.557657527710955

In [36]:
categorical = [typ[0] for typ in df.dtypes if typ[1] == 'string']
numeric = [typ[0] for typ in df.dtypes if typ[1] == 'double']
df.select(categorical).show(5)

+----------+---------+--------+------+----+----+
|       x34|      x35|     x41|   x45| x68| x93|
+----------+---------+--------+------+----+----+
|     Honda|      wed| $229.47|  0.0%|July|asia|
|     Honda|wednesday|  $213.9| -0.0%| Jun|asia|
|volkswagon|  thurday|$2207.13|-0.02%| Aug|asia|
|volkswagon|  thurday|  $82.09| -0.0%|July|asia|
|      ford|  thurday| $273.61| 0.01%|July|asia|
+----------+---------+--------+------+----+----+
only showing top 5 rows



In [37]:
df = df.withColumn('x41', regexp_replace('x41', '[^\\d.]', '').cast(DoubleType()))
df = df.withColumn('x45', regexp_replace('x45', '[^\\d.]', '').cast(DoubleType()))
df.select(['x41', 'x45']).show(10)

+-------+----+
|    x41| x45|
+-------+----+
| 229.47| 0.0|
|  213.9| 0.0|
|2207.13|0.02|
|  82.09| 0.0|
| 273.61|0.01|
| 439.68| 0.0|
| 152.64|0.02|
| 847.13|0.02|
| 235.98| 0.0|
| 550.02|0.01|
+-------+----+
only showing top 10 rows



In [38]:
df.groupBy('x34').count().show()

+----------+-----+
|       x34|count|
+----------+-----+
|      null|    7|
|volkswagon|12610|
|     Honda| 5176|
|     tesla| 2248|
|       bmw| 7306|
|      ford|  168|
|  mercades|   29|
| chevrolet|   12|
|    nissan|  342|
|    Toyota|10884|
| chrystler| 1218|
+----------+-----+



In [39]:
df.groupBy('x35').count().show()

+---------+-----+
|      x35|count|
+---------+-----+
|      fri|   26|
|     null|    9|
|     thur| 4405|
|   monday|   66|
|      wed|14775|
|wednesday| 5938|
|   friday|  521|
|  tuesday|  882|
|  thurday|13378|
+---------+-----+



In [40]:
fix_days_udf = udf(lambda x: 'wednesday' if x == 'wed' else('thursday' if x == 'thur' or x == 'thurday' else('friday' if x == 'fri' else x)))

df = df.withColumn('x35', fix_days_udf('x35'))

df.groupBy('x35').count().show()

+---------+-----+
|      x35|count|
+---------+-----+
|     null|    9|
| thursday|17783|
|   monday|   66|
|wednesday|20713|
|   friday|  547|
|  tuesday|  882|
+---------+-----+



In [41]:
fix_months_udf = udf(lambda x: 'Jan' if x == 'January' else('Jul' if x == 'July' else('Sep' if x == 'sept.' else('Dec' if x == 'Dev' else x))))

df = df.withColumn('x68', fix_months_udf('x68'))

df.groupBy('x68').count().show()

+----+-----+
| x68|count|
+----+-----+
| Oct|  910|
| Sep| 3514|
| Dec|   21|
|null|   10|
| Aug| 8173|
| May| 4801|
| Jun| 9256|
| Feb|   47|
| Nov|  150|
| Mar|  414|
| Jan|   11|
| Apr| 1640|
| Jul|11053|
+----+-----+



In [42]:
df.printSchema()

root
 |-- x0: double (nullable = true)
 |-- x1: double (nullable = true)
 |-- x2: double (nullable = true)
 |-- x3: double (nullable = true)
 |-- x4: double (nullable = true)
 |-- x5: double (nullable = true)
 |-- x6: double (nullable = true)
 |-- x7: double (nullable = true)
 |-- x8: double (nullable = true)
 |-- x9: double (nullable = true)
 |-- x10: double (nullable = true)
 |-- x11: double (nullable = true)
 |-- x12: double (nullable = true)
 |-- x13: double (nullable = true)
 |-- x14: double (nullable = true)
 |-- x15: double (nullable = true)
 |-- x16: double (nullable = true)
 |-- x17: double (nullable = true)
 |-- x18: double (nullable = true)
 |-- x19: double (nullable = true)
 |-- x20: double (nullable = true)
 |-- x21: double (nullable = true)
 |-- x22: double (nullable = true)
 |-- x23: double (nullable = true)
 |-- x24: double (nullable = true)
 |-- x25: double (nullable = true)
 |-- x26: double (nullable = true)
 |-- x27: double (nullable = true)
 |-- x28: double (nullabl

In [43]:
imputer = Imputer(
    inputCols=df.select(numeric).columns, 
    outputCols=df.select(numeric).columns)\
.setStrategy('mean')

df = imputer.fit(df).transform(df)

In [44]:
df.select('x28').describe().show()

+-------+--------------------+
|summary|                 x28|
+-------+--------------------+
|  count|               40000|
|   mean|-0.01363451123323...|
| stddev|  3.9438313125300626|
|    min|  -16.00498942061644|
|    max|  18.299486596823655|
+-------+--------------------+



In [45]:
new_cat = [typ[0] for typ in df.dtypes if typ[1] == 'string']

for colu in df.select(new_cat).columns:
    mode = df.groupBy(colu).count().sort(desc('count')).first()[0]
    df = df.fillna({colu : mode})
    categ = df.select(colu).distinct().rdd.flatMap(lambda x:x).collect()
    exprs = [when(col(colu) == cat,1).otherwise(0)\
            .alias(str(cat)) for cat in categ]
    df = df.select(exprs+df.columns)

In [46]:
df = df.select([typ[0] for typ in df.dtypes if typ[1] != 'string'])
df.printSchema()

root
 |-- america: integer (nullable = false)
 |-- euorpe: integer (nullable = false)
 |-- asia: integer (nullable = false)
 |-- Oct: integer (nullable = false)
 |-- Sep: integer (nullable = false)
 |-- Dec: integer (nullable = false)
 |-- Aug: integer (nullable = false)
 |-- May: integer (nullable = false)
 |-- Jun: integer (nullable = false)
 |-- Feb: integer (nullable = false)
 |-- Nov: integer (nullable = false)
 |-- Mar: integer (nullable = false)
 |-- Jan: integer (nullable = false)
 |-- Apr: integer (nullable = false)
 |-- Jul: integer (nullable = false)
 |-- thursday: integer (nullable = false)
 |-- monday: integer (nullable = false)
 |-- wednesday: integer (nullable = false)
 |-- friday: integer (nullable = false)
 |-- tuesday: integer (nullable = false)
 |-- volkswagon: integer (nullable = false)
 |-- Honda: integer (nullable = false)
 |-- tesla: integer (nullable = false)
 |-- bmw: integer (nullable = false)
 |-- ford: integer (nullable = false)
 |-- mercades: integer (nulla

In [47]:
df.groupBy('america').count().show()

+-------+-----+
|america|count|
+-------+-----+
|      1| 3124|
|      0|36876|
+-------+-----+



In [54]:
vecAssembler = VectorAssembler(inputCols=df.columns, outputCol="allCols")
output = vecAssembler.transform(df)

output.select('allCols').show()

+--------------------+
|             allCols|
+--------------------+
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[1.0,0.0,0.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
|[0.0,0.0,1.0,0.0,...|
+--------------------+
only showing top 20 rows



In [55]:
corr_mat = Correlation.corr(output, 'allCols', method='pearson')

Py4JJavaError: An error occurred while calling z:org.apache.spark.ml.stat.Correlation.corr.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 578.0 failed 1 times, most recent failure: Lost task 1.0 in stage 578.0 (TID 8001, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(: (struct<america_double_VectorAssembler_5e55410d5b53:double,euorpe_double_VectorAssembler_5e55410d5b53:double,asia_double_VectorAssembler_5e55410d5b53:double,Oct_double_VectorAssembler_5e55410d5b53:double,Sep_double_VectorAssembler_5e55410d5b53:double,Dec_double_VectorAssembler_5e55410d5b53:double,Aug_double_VectorAssembler_5e55410d5b53:double,May_double_VectorAssembler_5e55410d5b53:double,Jun_double_VectorAssembler_5e55410d5b53:double,Feb_double_VectorAssembler_5e55410d5b53:double,Nov_double_VectorAssembler_5e55410d5b53:double,Mar_double_VectorAssembler_5e55410d5b53:double,Jan_double_VectorAssembler_5e55410d5b53:double,Apr_double_VectorAssembler_5e55410d5b53:double,Jul_double_VectorAssembler_5e55410d5b53:double,thursday_double_VectorAssembler_5e55410d5b53:double,monday_double_VectorAssembler_5e55410d5b53:double,wednesday_double_VectorAssembler_5e55410d5b53:double,friday_double_VectorAssembler_5e55410d5b53:double,tuesday_double_VectorAssembler_5e55410d5b53:double,volkswagon_double_VectorAssembler_5e55410d5b53:double,Honda_double_VectorAssembler_5e55410d5b53:double,tesla_double_VectorAssembler_5e55410d5b53:double,bmw_double_VectorAssembler_5e55410d5b53:double,ford_double_VectorAssembler_5e55410d5b53:double,mercades_double_VectorAssembler_5e55410d5b53:double,chevrolet_double_VectorAssembler_5e55410d5b53:double,nissan_double_VectorAssembler_5e55410d5b53:double,Toyota_double_VectorAssembler_5e55410d5b53:double,chrystler_double_VectorAssembler_5e55410d5b53:double,x0:double,x1:double,x2:double,x3:double,x4:double,x5:double,x6:double,x7:double,x8:double,x9:double,x10:double,x11:double,x12:double,x13:double,x14:double,x15:double,x16:double,x17:double,x18:double,x19:double,x20:double,x21:double,x22:double,x23:double,x24:double,x25:double,x26:double,x27:double,x28:double,x29:double,x30:double,x31:double,x32:double,x33:double,x36:double,x37:double,x38:double,x39:double,x40:double,x41:double,x42:double,x43:double,x44:double,x45:double,x46:double,x47:double,x48:double,x49:double,x50:double,x51:double,x52:double,x53:double,x54:double,x55:double,x56:double,x57:double,x58:double,x59:double,x60:double,x61:double,x62:double,x63:double,x64:double,x65:double,x66:double,x67:double,x69:double,x70:double,x71:double,x72:double,x73:double,x74:double,x75:double,x76:double,x77:double,x78:double,x79:double,x80:double,x81:double,x82:double,x83:double,x84:double,x85:double,x86:double,x87:double,x88:double,x89:double,x90:double,x91:double,x92:double,x94:double,x95:double,x96:double,x97:double,x98:double,x99:double,y_double_VectorAssembler_5e55410d5b53:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1135)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:844)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "keep". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:287)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:255)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:255)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:144)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:143)
	... 30 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeColumnSummaryStatistics(RowMatrix.scala:419)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeCovariance(RowMatrix.scala:334)
	at org.apache.spark.mllib.stat.correlation.PearsonCorrelation$.computeCorrelationMatrix(PearsonCorrelation.scala:49)
	at org.apache.spark.mllib.stat.correlation.Correlations$.corrMatrix(Correlation.scala:66)
	at org.apache.spark.mllib.stat.Statistics$.corr(Statistics.scala:74)
	at org.apache.spark.ml.stat.Correlation$.corr(Correlation.scala:73)
	at org.apache.spark.ml.stat.Correlation.corr(Correlation.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:844)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(: (struct<america_double_VectorAssembler_5e55410d5b53:double,euorpe_double_VectorAssembler_5e55410d5b53:double,asia_double_VectorAssembler_5e55410d5b53:double,Oct_double_VectorAssembler_5e55410d5b53:double,Sep_double_VectorAssembler_5e55410d5b53:double,Dec_double_VectorAssembler_5e55410d5b53:double,Aug_double_VectorAssembler_5e55410d5b53:double,May_double_VectorAssembler_5e55410d5b53:double,Jun_double_VectorAssembler_5e55410d5b53:double,Feb_double_VectorAssembler_5e55410d5b53:double,Nov_double_VectorAssembler_5e55410d5b53:double,Mar_double_VectorAssembler_5e55410d5b53:double,Jan_double_VectorAssembler_5e55410d5b53:double,Apr_double_VectorAssembler_5e55410d5b53:double,Jul_double_VectorAssembler_5e55410d5b53:double,thursday_double_VectorAssembler_5e55410d5b53:double,monday_double_VectorAssembler_5e55410d5b53:double,wednesday_double_VectorAssembler_5e55410d5b53:double,friday_double_VectorAssembler_5e55410d5b53:double,tuesday_double_VectorAssembler_5e55410d5b53:double,volkswagon_double_VectorAssembler_5e55410d5b53:double,Honda_double_VectorAssembler_5e55410d5b53:double,tesla_double_VectorAssembler_5e55410d5b53:double,bmw_double_VectorAssembler_5e55410d5b53:double,ford_double_VectorAssembler_5e55410d5b53:double,mercades_double_VectorAssembler_5e55410d5b53:double,chevrolet_double_VectorAssembler_5e55410d5b53:double,nissan_double_VectorAssembler_5e55410d5b53:double,Toyota_double_VectorAssembler_5e55410d5b53:double,chrystler_double_VectorAssembler_5e55410d5b53:double,x0:double,x1:double,x2:double,x3:double,x4:double,x5:double,x6:double,x7:double,x8:double,x9:double,x10:double,x11:double,x12:double,x13:double,x14:double,x15:double,x16:double,x17:double,x18:double,x19:double,x20:double,x21:double,x22:double,x23:double,x24:double,x25:double,x26:double,x27:double,x28:double,x29:double,x30:double,x31:double,x32:double,x33:double,x36:double,x37:double,x38:double,x39:double,x40:double,x41:double,x42:double,x43:double,x44:double,x45:double,x46:double,x47:double,x48:double,x49:double,x50:double,x51:double,x52:double,x53:double,x54:double,x55:double,x56:double,x57:double,x58:double,x59:double,x60:double,x61:double,x62:double,x63:double,x64:double,x65:double,x66:double,x67:double,x69:double,x70:double,x71:double,x72:double,x73:double,x74:double,x75:double,x76:double,x77:double,x78:double,x79:double,x80:double,x81:double,x82:double,x83:double,x84:double,x85:double,x86:double,x87:double,x88:double,x89:double,x90:double,x91:double,x92:double,x94:double,x95:double,x96:double,x97:double,x98:double,x99:double,y_double_VectorAssembler_5e55410d5b53:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.ScalaUDF_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1135)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "keep". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:287)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:255)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:255)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:144)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:143)
	... 30 more
