In [1]:
data = spark.read \
            .option("multiLine", True) \
            .csv('/home/ilyes/M2/algortihmique_BD/archive(2)/mental_health.csv',inferSchema=True, header=True)

In [2]:
data.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



In [3]:
data.head()

Row(text='dear american teens question dutch person heard guys get way easier things learn age us sooooo thth graders like  right guys learn math', label=0)

In [4]:
from pyspark.sql.functions import regexp_replace, trim

data = data.withColumn('text', regexp_replace('text', '[\s]{2,}', ''))
data = data.withColumn('text', trim(data.text))

In [5]:
from pyspark.sql.functions import col, lower
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [6]:
data = data.withColumn("text", lower(col('text')))
data = data.filter(data.text != '')

In [7]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(data)

In [8]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
Nstopwords = remover.transform(wordsData)

In [9]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=150, inputCol="filtered", outputCol="features")
word2vec_data = word2Vec.fit(Nstopwords)

In [34]:
from pyspark.ml.feature import Word2VecModel

In [33]:
word2vec_data.save('/home/hdoop/w2v')

In [10]:
w2v_data = word2vec_data.transform(Nstopwords)

In [11]:
# split dataframes between 0s and 1s
zeros = w2v_data.filter(w2v_data["label"]==0)
ones = w2v_data.filter(w2v_data["label"]==1)
# split datasets into training and testing
train0, test0 = zeros.randomSplit([0.8,0.2])
train1, test1 = ones.randomSplit([0.8,0.2])
# merge datasets back together
train = train0.union(train1)
test = test0.union(test1)

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [22]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier()

dtparamGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [2, 5, 10, 20])
             .addGrid(dt.maxBins, [10, 20, 40])
             .build())

In [23]:
crossvaldt = CrossValidator(estimator=dt,
                          estimatorParamMaps=dtparamGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)
cvModeldt = crossvaldt.fit(train)

In [24]:
tested = cvModeldt.transform(test)
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.8903006838647619

In [25]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression()

paramGridlr = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0, 1]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()
    
crossvallr = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGridlr,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)
cvModellr = crossvallr.fit(train)

In [26]:
tested = cvModellr.transform(test)
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.9536660891085791

In [20]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier()

rfparamGrid = (ParamGridBuilder()
               .addGrid(rf.maxDepth, [2, 5, 10])
               .addGrid(rf.maxBins, [5, 10, 20])
               .addGrid(rf.numTrees, [5, 20, 50])
             .build())

rfcv = CrossValidator(estimator = rf,
                      estimatorParamMaps = rfparamGrid,
                      evaluator = BinaryClassificationEvaluator(),
                      numFolds = 10)
cvModelrf = rfcv.fit(train)

In [22]:
best = cvModelrf.bestModel

In [29]:
best.save('/home/hdoop/model/')

In [30]:
best._java_obj.getMaxDepth()

10

In [31]:
best._java_obj.getMaxBins()

10

In [32]:
best._java_obj.getNumTrees()

50

In [21]:
tested = cvModelrf.transform(test)
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.9636242814198899

In [18]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier()

gbtparamGrid = ParamGridBuilder().build()

gbtcv = CrossValidator(estimator = gbt,
                      estimatorParamMaps = gbtparamGrid,
                      evaluator = BinaryClassificationEvaluator(),
                      numFolds = 10)
cvModelgbt = gbtcv.fit(train)

In [19]:
tested = cvModelgbt.transform(test)
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.9588292238991081

In [14]:
from pyspark.ml.classification import LinearSVC

svc = LinearSVC()

svcparamGrid = (ParamGridBuilder()
                .addGrid(svc.maxIter, [10, 100])
                .addGrid(svc.regParam, [0.001, 0.01, 1.0,10.0])
                .build())

svccv = CrossValidator(estimator = svc,
                      estimatorParamMaps = svcparamGrid,
                      evaluator = BinaryClassificationEvaluator(),
                      numFolds = 10)
cvModelsvc = svccv.fit(train)

In [19]:
bestmodel = cvModelsvc.bestModel
bestmodel._java_obj.getRegParam()

0.001

In [15]:
tested = cvModelsvc.transform(test)
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.9600915541100232

In [28]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=92769)
featurizedData = hashingTF.transform(Nstopwords)

In [29]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
tf_idf_data = idfModel.transform(featurizedData)

In [70]:
data.filter(data.label==0).count()

14138

In [71]:
data.filter(data.label==1).count()

13837

In [30]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression()

In [31]:
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0, 1]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

In [32]:
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)
cvModel = crossval.fit(train)

In [33]:
tested = cvModel.transform(test)

In [34]:
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.9551031317760491

In [35]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(modelType="multinomial")

nbparamGrid = (ParamGridBuilder()
               .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
               .build())

In [36]:
crossvalnb = CrossValidator(estimator=nb,
                          estimatorParamMaps=nbparamGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)
cvModelnb = crossvalnb.fit(train)

Py4JJavaError: An error occurred while calling o9024.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2537.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2537.0 (TID 4653) (172.23.0.86 executor driver): org.apache.spark.SparkException: Failed to execute user defined function (NaiveBayes$$Lambda$4329/0x0000000841866840: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1196)
	at org.apache.spark.ml.stat.SummaryBuilderImpl$MetricsAggregate.update(Summarizer.scala:382)
	at org.apache.spark.ml.stat.SummaryBuilderImpl$MetricsAggregate.update(Summarizer.scala:345)
	at org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate.update(interfaces.scala:583)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$1.$anonfun$applyOrElse$2(AggregationIterator.scala:197)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$1.$anonfun$applyOrElse$2$adapted(AggregationIterator.scala:197)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateProcessRow$7(AggregationIterator.scala:214)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateProcessRow$7$adapted(AggregationIterator.scala:208)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:169)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:83)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1(ObjectHashAggregateExec.scala:112)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1$adapted(ObjectHashAggregateExec.scala:88)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:885)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:885)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [-0.08118718913332983,0.06595419736748392,-0.04328706252007661,-0.020002422176978805,0.06108700958165256,-0.08360661058263345,-0.028859652240167965,0.07272575423121452,0.08763579228384928,2.287377772683447E-4,-0.008977195861834018,0.017123084769330242,-0.027302229300733994,0.03428681381046772,-0.03868433680724014,0.029901693532751367,-0.015687720278616656,0.028031414544040508,0.006077753337608143,0.03873664936558767,-0.03687503138049082,-0.03555061242183332,0.008404547348618507,-0.007930820638483221,0.05602607812563127,-0.015779880899406802,-0.003759931286119602,-0.042197861285372215,0.008132100909609686,-0.014012620869007978,0.015676816502078014,-0.05213455754247579,0.02162995822304352,-0.04144787373529239,-0.01430605428124016,0.008185336662625725,-0.028136456639251926,0.0286424324255098,-0.10217299736739899,0.038514040749181404,-0.02994196433362297,0.02560416143387556,-0.01984130128138614,-0.017808872952379963,0.02156172100115906,-0.04259027032689615,0.03829612024128437,-0.07394003309309483,0.016288643808696757,-0.012013201517137615,-0.019319818778471515,-0.021718522909478368,-0.027024125172333286,-0.051933614325455645,-0.03578495225784454,-0.08155154682357203,0.031422692562707445,-0.034354983320967716,0.0217902671799741,0.03153493995672431,0.012122979387640953,0.01145722276785157,0.07335225882178004,-0.016344936987893147,0.030051161958412693,-0.05890975333750248,-3.070555077019063E-4,0.015898595310070297,-0.052393504841761154,0.026755809847434815,0.005894599453313276,-0.0523899449670518,0.007643981921401891,-0.0753536324270747,0.018997276725713164,-0.04205024394799363,0.04140332341194153,-0.012607855468311094,-0.061253000389445915,-0.0928618134084073,-0.07396969741041011,0.09956257142634554,0.06599163516974924,-0.020313015038316902,0.07441376232203435,-0.0351781329584562,-0.015584308823401278,-0.028571074902587996,0.04221345323391936,-0.01886849126524546,0.04171837951501154,0.006888091479512778,0.037439940835941925,-0.02274214637211778,-0.005319520979273048,0.019718050110069187,0.028539475815540012,0.011974407661579211,-0.0666593731677329,0.09103225967423483,-0.020448689433661373,0.037500197078440003,-0.014756880700588226,-0.11341992007907141,0.030053322796117176,-0.07139102623543957,0.001573957989669659,-0.0017303975341333585,0.04782002956860445,0.01378172039139,0.02261196563697674,0.020072892477566547,0.07575733578679236,0.030926415527408775,-0.11713790250095454,0.01884923435070298,0.10744544080543247,-0.022081324322657154,0.021790957755663178,0.09264385541477664,0.07324390138753437,-9.922653605992143E-4,-8.456425910646266E-4,0.00803573810580102,0.030741645192558117,0.08612978297539733,0.04740994330495596,0.002340970306911252,0.0034124143421649933,-0.008109051903540438,0.005066500214690512,-0.04924779997567054,-0.027928474835458805,0.006442304277284579,-0.05347674767571417,0.08386206008832563,-0.015656177631833336,0.046761676740557465,-0.009041783315214245,-0.06334255195476793,-0.025025396364402364,0.050383128437467596,-0.035447718067602677,-0.005462859503247521,-0.03410375377544287,0.04284718183969909,0.04511980278502134,0.02164363099092787,0.020776554349471222,-0.027347084515812723].
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:359)
	at org.apache.spark.ml.classification.NaiveBayes.$anonfun$trainDiscreteImpl$1(NaiveBayes.scala:178)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.$anonfun$f$2(ScalaUDF.scala:210)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1192)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function (NaiveBayes$$Lambda$4329/0x0000000841866840: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1196)
	at org.apache.spark.ml.stat.SummaryBuilderImpl$MetricsAggregate.update(Summarizer.scala:382)
	at org.apache.spark.ml.stat.SummaryBuilderImpl$MetricsAggregate.update(Summarizer.scala:345)
	at org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate.update(interfaces.scala:583)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$1.$anonfun$applyOrElse$2(AggregationIterator.scala:197)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$1.$anonfun$applyOrElse$2$adapted(AggregationIterator.scala:197)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateProcessRow$7(AggregationIterator.scala:214)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateProcessRow$7$adapted(AggregationIterator.scala:208)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:169)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:83)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1(ObjectHashAggregateExec.scala:112)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1$adapted(ObjectHashAggregateExec.scala:88)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:885)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:885)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [-0.08118718913332983,0.06595419736748392,-0.04328706252007661,-0.020002422176978805,0.06108700958165256,-0.08360661058263345,-0.028859652240167965,0.07272575423121452,0.08763579228384928,2.287377772683447E-4,-0.008977195861834018,0.017123084769330242,-0.027302229300733994,0.03428681381046772,-0.03868433680724014,0.029901693532751367,-0.015687720278616656,0.028031414544040508,0.006077753337608143,0.03873664936558767,-0.03687503138049082,-0.03555061242183332,0.008404547348618507,-0.007930820638483221,0.05602607812563127,-0.015779880899406802,-0.003759931286119602,-0.042197861285372215,0.008132100909609686,-0.014012620869007978,0.015676816502078014,-0.05213455754247579,0.02162995822304352,-0.04144787373529239,-0.01430605428124016,0.008185336662625725,-0.028136456639251926,0.0286424324255098,-0.10217299736739899,0.038514040749181404,-0.02994196433362297,0.02560416143387556,-0.01984130128138614,-0.017808872952379963,0.02156172100115906,-0.04259027032689615,0.03829612024128437,-0.07394003309309483,0.016288643808696757,-0.012013201517137615,-0.019319818778471515,-0.021718522909478368,-0.027024125172333286,-0.051933614325455645,-0.03578495225784454,-0.08155154682357203,0.031422692562707445,-0.034354983320967716,0.0217902671799741,0.03153493995672431,0.012122979387640953,0.01145722276785157,0.07335225882178004,-0.016344936987893147,0.030051161958412693,-0.05890975333750248,-3.070555077019063E-4,0.015898595310070297,-0.052393504841761154,0.026755809847434815,0.005894599453313276,-0.0523899449670518,0.007643981921401891,-0.0753536324270747,0.018997276725713164,-0.04205024394799363,0.04140332341194153,-0.012607855468311094,-0.061253000389445915,-0.0928618134084073,-0.07396969741041011,0.09956257142634554,0.06599163516974924,-0.020313015038316902,0.07441376232203435,-0.0351781329584562,-0.015584308823401278,-0.028571074902587996,0.04221345323391936,-0.01886849126524546,0.04171837951501154,0.006888091479512778,0.037439940835941925,-0.02274214637211778,-0.005319520979273048,0.019718050110069187,0.028539475815540012,0.011974407661579211,-0.0666593731677329,0.09103225967423483,-0.020448689433661373,0.037500197078440003,-0.014756880700588226,-0.11341992007907141,0.030053322796117176,-0.07139102623543957,0.001573957989669659,-0.0017303975341333585,0.04782002956860445,0.01378172039139,0.02261196563697674,0.020072892477566547,0.07575733578679236,0.030926415527408775,-0.11713790250095454,0.01884923435070298,0.10744544080543247,-0.022081324322657154,0.021790957755663178,0.09264385541477664,0.07324390138753437,-9.922653605992143E-4,-8.456425910646266E-4,0.00803573810580102,0.030741645192558117,0.08612978297539733,0.04740994330495596,0.002340970306911252,0.0034124143421649933,-0.008109051903540438,0.005066500214690512,-0.04924779997567054,-0.027928474835458805,0.006442304277284579,-0.05347674767571417,0.08386206008832563,-0.015656177631833336,0.046761676740557465,-0.009041783315214245,-0.06334255195476793,-0.025025396364402364,0.050383128437467596,-0.035447718067602677,-0.005462859503247521,-0.03410375377544287,0.04284718183969909,0.04511980278502134,0.02164363099092787,0.020776554349471222,-0.027347084515812723].
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:359)
	at org.apache.spark.ml.classification.NaiveBayes.$anonfun$trainDiscreteImpl$1(NaiveBayes.scala:178)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.$anonfun$f$2(ScalaUDF.scala:210)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1192)
	... 29 more


In [92]:
tested = cvModelnb.transform(test)
evaluation = BinaryClassificationEvaluator() #AUC
evaluation.evaluate(tested)

0.4761694294819699

In [13]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

layers = [262144, 174762, 1]

trainer = MultilayerPerceptronClassifier(layers=layers)
model = trainer.fit(train)

Py4JJavaError: An error occurred while calling o158.fit.
: java.lang.NegativeArraySizeException: -1431481003
	at scala.reflect.ManifestFactory$DoubleManifest.newArray(Manifest.scala:194)
	at scala.reflect.ManifestFactory$DoubleManifest.newArray(Manifest.scala:191)
	at breeze.linalg.DenseVector$.zeros$mDc$sp(DenseVector.scala:274)
	at org.apache.spark.ml.ann.FeedForwardModel$.apply(Layer.scala:601)
	at org.apache.spark.ml.ann.FeedForwardTopology.model(Layer.scala:422)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:844)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.$anonfun$train$1(MultilayerPerceptronClassifier.scala:228)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:184)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:93)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier()
gbtclassifier = gbt.fit(train)