In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tree_methods_adv').getOrCreate()

In [2]:
# Load training data. 
data = spark.read.csv('Datasets/suicide rate.csv',inferSchema=True,header=True)

In [3]:
# Let's get an idea of what the data looks like. 
data.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- suicides/100k pop: double (nullable = true)
 |-- country-year: string (nullable = true)
 |-- HDI for year: double (nullable = true)
 |--  gdp_for_year ($) : string (nullable = true)
 |-- gdp_per_capita ($): integer (nullable = true)
 |-- generation: string (nullable = true)



In [4]:
data = data.withColumnRenamed('suicides/100k pop','suicide_rate')
data = data.withColumnRenamed(' gdp_for_year ($) ','gdp_year')
data = data.withColumnRenamed('gdp_per_capita ($)','gdp_capita')
data = data.withColumnRenamed('HDI for year','HDI')
data.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- suicide_rate: double (nullable = true)
 |-- country-year: string (nullable = true)
 |-- HDI: double (nullable = true)
 |-- gdp_year: string (nullable = true)
 |-- gdp_capita: integer (nullable = true)
 |-- generation: string (nullable = true)



In [5]:
data = data.drop('suicides_no','country-year','population')

In [6]:
from pyspark.sql import functions as F
df_agg = data.agg(*[F.count(F.when(F.isnull(c),c)).alias(c) for c in data.columns])
df_agg.show()

+-------+----+---+---+------------+-----+--------+----------+----------+
|country|year|sex|age|suicide_rate|  HDI|gdp_year|gdp_capita|generation|
+-------+----+---+---+------------+-----+--------+----------+----------+
|      0|   0|  0|  0|           0|19456|       0|         0|         0|
+-------+----+---+---+------------+-----+--------+----------+----------+



In [7]:
data = data.drop('HDI')
data.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicide_rate: double (nullable = true)
 |-- gdp_year: string (nullable = true)
 |-- gdp_capita: integer (nullable = true)
 |-- generation: string (nullable = true)



In [8]:
# data = data.drop('year','generation')
# data.printSchema()

In [9]:
from pyspark.sql import functions as F
data = data.withColumn("suicide_risk",F.when(data.suicide_rate> 12.816097411933894, 1 ).otherwise(0))
data.show()

+-------+----+------+-----------+------------+-------------+----------+---------------+------------+
|country|year|   sex|        age|suicide_rate|     gdp_year|gdp_capita|     generation|suicide_risk|
+-------+----+------+-----------+------------+-------------+----------+---------------+------------+
|Albania|1987|  male|15-24 years|        6.71|2,156,624,900|       796|   Generation X|           0|
|Albania|1987|  male|35-54 years|        5.19|2,156,624,900|       796|         Silent|           0|
|Albania|1987|female|15-24 years|        4.83|2,156,624,900|       796|   Generation X|           0|
|Albania|1987|  male|  75+ years|        4.59|2,156,624,900|       796|G.I. Generation|           0|
|Albania|1987|  male|25-34 years|        3.28|2,156,624,900|       796|        Boomers|           0|
|Albania|1987|female|  75+ years|        2.81|2,156,624,900|       796|G.I. Generation|           0|
|Albania|1987|female|35-54 years|        2.15|2,156,624,900|       796|         Silent|    

In [10]:
from pyspark.sql.functions import udf, concat, col, lit
commaRep = udf(lambda x: x.replace(',','')) 
data = data.withColumn('gdp_years',commaRep('gdp_year'))
data = data.drop('gdp_year')
from pyspark.sql.types import IntegerType
# data = data.withColumn("gdp_year", data["gdp_years"].cast(IntegerType()))
data = data.withColumn("gdp_year", data.gdp_years.cast('float'))
data = data.drop('gdp_years')
data.printSchema()
data.show()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicide_rate: double (nullable = true)
 |-- gdp_capita: integer (nullable = true)
 |-- generation: string (nullable = true)
 |-- suicide_risk: integer (nullable = false)
 |-- gdp_year: float (nullable = true)

+-------+----+------+-----------+------------+----------+---------------+------------+-----------+
|country|year|   sex|        age|suicide_rate|gdp_capita|     generation|suicide_risk|   gdp_year|
+-------+----+------+-----------+------------+----------+---------------+------------+-----------+
|Albania|1987|  male|15-24 years|        6.71|       796|   Generation X|           0|2.1566249E9|
|Albania|1987|  male|35-54 years|        5.19|       796|         Silent|           0|2.1566249E9|
|Albania|1987|female|15-24 years|        4.83|       796|   Generation X|           0|2.1566249E9|
|Albania|1987|  male|  75+ years|       

In [11]:
# Let's import the string indexer (similar to the logistic regression exercises).
from pyspark.ml.feature import StringIndexer
Year_Indexer = StringIndexer(inputCol='year',outputCol='yearIndex')
Year_Indexed = Year_Indexer.fit(data).transform(data)

Sex_Indexer = StringIndexer(inputCol='sex',outputCol='sexIndex')
Sex_Indexed = Sex_Indexer.fit(Year_Indexed).transform(Year_Indexed)

Country_Indexer = StringIndexer(inputCol='country',outputCol='countryIndex')
Country_Indexed = Country_Indexer.fit(Sex_Indexed).transform(Sex_Indexed)

Age_Indexer = StringIndexer(inputCol='age',outputCol='ageIndex')
Age_Indexed = Age_Indexer.fit(Country_Indexed).transform(Country_Indexed)

Generation_Indexer = StringIndexer(inputCol='generation',outputCol='generationIndex')
Generation_Indexed = Generation_Indexer.fit(Age_Indexed).transform(Age_Indexed)


In [12]:
Generation_Indexed = Generation_Indexed.drop('country','year','sex','age','suicide_rate','generation')
Generation_Indexed.printSchema()

root
 |-- gdp_capita: integer (nullable = true)
 |-- suicide_risk: integer (nullable = false)
 |-- gdp_year: float (nullable = true)
 |-- yearIndex: double (nullable = true)
 |-- sexIndex: double (nullable = true)
 |-- countryIndex: double (nullable = true)
 |-- ageIndex: double (nullable = true)
 |-- generationIndex: double (nullable = true)



In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
  inputCols=[
             'countryIndex',
#               'yearIndex',
              'sexIndex',
              'gdp_year',
             'gdp_capita',
             'ageIndex',
             'generationIndex'
             ],
              outputCol="features")
output = assembler.transform(Generation_Indexed)

In [14]:
final_data = output.select("features",'suicide_risk')

In [15]:
final_data.show(20,False)

+--------------------------------------+------------+
|features                              |suicide_risk|
+--------------------------------------+------------+
|[67.0,0.0,2.156624896E9,796.0,2.0,0.0]|0           |
|[67.0,0.0,2.156624896E9,796.0,0.0,1.0]|0           |
|[67.0,1.0,2.156624896E9,796.0,2.0,0.0]|0           |
|[67.0,0.0,2.156624896E9,796.0,4.0,4.0]|0           |
|[67.0,0.0,2.156624896E9,796.0,1.0,3.0]|0           |
|[67.0,1.0,2.156624896E9,796.0,4.0,4.0]|0           |
|[67.0,1.0,2.156624896E9,796.0,0.0,1.0]|0           |
|[67.0,1.0,2.156624896E9,796.0,1.0,3.0]|0           |
|[67.0,0.0,2.156624896E9,796.0,3.0,4.0]|0           |
|[67.0,1.0,2.156624896E9,796.0,5.0,0.0]|0           |
|[67.0,1.0,2.156624896E9,796.0,3.0,4.0]|0           |
|[67.0,0.0,2.156624896E9,796.0,5.0,0.0]|0           |
|[67.0,1.0,2.126E9,769.0,4.0,4.0]      |0           |
|[67.0,0.0,2.126E9,769.0,2.0,0.0]      |0           |
|[67.0,0.0,2.126E9,769.0,4.0,4.0]      |0           |
|[67.0,0.0,2.126E9,769.0,0.0

In [16]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(final_data)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(final_data)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

Features scaled to range: [0.000000, 1.000000]
+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[67.0,0.0,2.15662...|[0.67,0.0,1.16425...|
|[67.0,0.0,2.15662...|[0.67,0.0,1.16425...|
|[67.0,1.0,2.15662...|[0.67,1.0,1.16425...|
|[67.0,0.0,2.15662...|[0.67,0.0,1.16425...|
|[67.0,0.0,2.15662...|[0.67,0.0,1.16425...|
|[67.0,1.0,2.15662...|[0.67,1.0,1.16425...|
|[67.0,1.0,2.15662...|[0.67,1.0,1.16425...|
|[67.0,1.0,2.15662...|[0.67,1.0,1.16425...|
|[67.0,0.0,2.15662...|[0.67,0.0,1.16425...|
|[67.0,1.0,2.15662...|[0.67,1.0,1.16425...|
|[67.0,1.0,2.15662...|[0.67,1.0,1.16425...|
|[67.0,0.0,2.15662...|[0.67,0.0,1.16425...|
|[67.0,1.0,2.126E9...|[0.67,1.0,1.14735...|
|[67.0,0.0,2.126E9...|[0.67,0.0,1.14735...|
|[67.0,0.0,2.126E9...|[0.67,0.0,1.14735...|
|[67.0,0.0,2.126E9...|[0.67,0.0,1.14735...|
|[67.0,0.0,2.126E9...|[0.67,0.0,1.14735...|
|[67.0,1.0,2.126E9...|[0.67,1.0,1.14735...|
|[67.0,1.0,2.126E9...|[0.67,1

In [17]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(final_data)
print("Normalized using L^1 norm")
l1NormData.show(5,False)

Normalized using L^1 norm
+--------------------------------------+------------+---------------------------------------------------------------------------------------------------------------+
|features                              |suicide_risk|normFeatures                                                                                                   |
+--------------------------------------+------------+---------------------------------------------------------------------------------------------------------------+
|[67.0,0.0,2.156624896E9,796.0,2.0,0.0]|0           |[3.1067049838509276E-8,0.0,0.999999598910476,3.6909509957393115E-7,9.273746220450531E-10,0.0]                  |
|[67.0,0.0,2.156624896E9,796.0,0.0,1.0]|0           |[3.106704985291467E-8,0.0,0.999999599374163,3.6909509974507586E-7,0.0,4.6368731123753244E-10]                  |
|[67.0,1.0,2.156624896E9,796.0,2.0,0.0]|0           |[3.106704982410388E-8,4.6368731080752064E-10,0.9999995984467889,3.690950994027864E-7,9.2737

In [18]:
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(labelCol='suicide_risk',featuresCol='features',maxBins=120)
rfc_model = rfc.fit(final_data)
rfc_model.featureImportances


SparseVector(6, {0: 0.3461, 1: 0.3321, 2: 0.0183, 3: 0.0092, 4: 0.1839, 5: 0.1104})

In [19]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
selector = ChiSqSelector(numTopFeatures=5, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="suicide_risk")

result = selector.fit(final_data).transform(final_data)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show(20,False)

ChiSqSelector output with top 5 features selected
+--------------------------------------+------------+----------------------------------+
|features                              |suicide_risk|selectedFeatures                  |
+--------------------------------------+------------+----------------------------------+
|[67.0,0.0,2.156624896E9,796.0,2.0,0.0]|0           |[67.0,0.0,2.156624896E9,796.0,2.0]|
|[67.0,0.0,2.156624896E9,796.0,0.0,1.0]|0           |[67.0,0.0,2.156624896E9,796.0,0.0]|
|[67.0,1.0,2.156624896E9,796.0,2.0,0.0]|0           |[67.0,1.0,2.156624896E9,796.0,2.0]|
|[67.0,0.0,2.156624896E9,796.0,4.0,4.0]|0           |[67.0,0.0,2.156624896E9,796.0,4.0]|
|[67.0,0.0,2.156624896E9,796.0,1.0,3.0]|0           |[67.0,0.0,2.156624896E9,796.0,1.0]|
|[67.0,1.0,2.156624896E9,796.0,4.0,4.0]|0           |[67.0,1.0,2.156624896E9,796.0,4.0]|
|[67.0,1.0,2.156624896E9,796.0,0.0,1.0]|0           |[67.0,1.0,2.156624896E9,796.0,0.0]|
|[67.0,1.0,2.156624896E9,796.0,1.0,3.0]|0           |[67.0,1

In [20]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [40]:
#Logistic Regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features',labelCol='suicide_risk',maxIter=100, regParam=0.2, elasticNetParam=0.7)
# Fit the model
lrModel = lr.fit(train_data)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))


Coefficients: (6,[1],[-0.45271270012262355])
Intercept: -0.5607334661895695


In [57]:
lrModel_predictions = lrModel.transform(test_data)
# Let's import the evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error. 
acc_evaluator = MulticlassClassificationEvaluator(labelCol="suicide_risk", 
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
lrModel_acc = acc_evaluator.evaluate(lrModel_predictions)

# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'suicide_risk')
# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*20 + 'Logistic Regression model' + '-'*20)
print('Logistic Regression model accuracy: {0:2.2f}%'.format(lrModel_acc*100))
print('areaUnderROC of LR: {0:2.2f}%'.format(my_binary_eval.evaluate(lrModel_predictions)*100))
print('areaUnderPR of LR: {0:2.2f}%'.format(my_binary_eval.
                            evaluate(lrModel_predictions, 
                            {my_binary_eval.metricName: "areaUnderPR"})*100))

Here are the results!
--------------------Logistic Regression model--------------------
Logistic Regression model accuracy: 68.50%
areaUnderROC of LR: 73.73%
areaUnderPR of LR: 70.01%


In [58]:
#Decision Tree
from pyspark.ml.classification import DecisionTreeClassifier
dtc = DecisionTreeClassifier(labelCol='suicide_risk',featuresCol='features',maxBins=120,maxDepth=5, 
                             impurity='gini')
dtc_model = dtc.fit(train_data)
dtc_model.featureImportances
# print(dtc_model.toDebugString)
dtc_predictions = dtc_model.transform(test_data)
# Let's import the evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error. 
acc_evaluator = MulticlassClassificationEvaluator(labelCol="suicide_risk", predictionCol="prediction", 
                                                  metricName="accuracy")
dtcModel_acc = acc_evaluator.evaluate(dtc_predictions)
# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'suicide_risk')

# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*20 + 'Decision Tree model' + '-'*20)
print('Decision Tree model accuracy: {0:2.2f}%'.format(dtcModel_acc*100))
print('areaUnderROC of DT: {0:2.2f}%'.format(my_binary_eval.evaluate(dtc_predictions)*100))
print('areaUnderPR of DT: {0:2.2f}%'.format(my_binary_eval.
                            evaluate(dtc_predictions, 
                            {my_binary_eval.metricName: "areaUnderPR"})*100))


Here are the results!
--------------------Decision Tree model--------------------
Decision Tree model accuracy: 91.13%
areaUnderROC of DT: 95.50%
areaUnderPR of DT: 91.04%


In [25]:
dtc_model.featureImportances

SparseVector(6, {0: 0.3718, 1: 0.2501, 2: 0.0234, 3: 0.0097, 4: 0.2615, 5: 0.0834})

In [59]:
#Random Forest
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(labelCol='suicide_risk',featuresCol='features',maxBins=120,
                             maxDepth=30, impurity='gini',numTrees=30)
rfc_model = rfc.fit(train_data)
rfc_model.featureImportances
# print(rfc_model.toDebugString)
rfc_predictions = rfc_model.transform(test_data)
# Let's import the evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error. 
acc_evaluator = MulticlassClassificationEvaluator(labelCol="suicide_risk", predictionCol="prediction", 
                                                  metricName="accuracy")
rfcModel_acc = acc_evaluator.evaluate(rfc_predictions)
# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'suicide_risk')

# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*20 + 'Random Forest model' + '-'*20)
print('Random Forest model accuracy: {0:2.2f}%'.format(rfcModel_acc*100))
print('areaUnderROC of RF: {0:2.2f}%'.format(my_binary_eval.evaluate(rfc_predictions)*100))
print('areaUnderPR of RF: {0:2.2f}%'.format(my_binary_eval.
                            evaluate(rfc_predictions, 
                            {my_binary_eval.metricName: "areaUnderPR"})*100))

Here are the results!
--------------------Random Forest model--------------------
Random Forest model accuracy: 93.61%
areaUnderROC of RF: 98.08%
areaUnderPR of RF: 96.32%


In [29]:
rfc_model.featureImportances

SparseVector(6, {0: 0.3719, 1: 0.2918, 2: 0.0293, 3: 0.0168, 4: 0.1966, 5: 0.0936})

In [29]:
# Let's import the relevant classifiers. 
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline
dtc = DecisionTreeClassifier(labelCol='suicide_risk',featuresCol='features',maxBins=120)
rfc = RandomForestClassifier(labelCol='suicide_risk',featuresCol='features',maxBins=120)
gbt = GBTClassifier(labelCol='suicide_risk',featuresCol='features',maxBins=120)


In [58]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)


In [66]:
dtc_model.featureImportances
rfc_model.featureImportances
gbt_model.featureImportances

SparseVector(6, {0: 0.167, 1: 0.4472, 2: 0.2793, 3: 0.0341, 4: 0.0227, 5: 0.0497})

In [72]:
print(dtc_model.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4ecf99d2a7575a034006) of depth 5 with 57 nodes
  If (feature 0 in {1.0})
   If (feature 1 in {0.0,1.0,3.0,5.0,6.0,8.0,9.0,11.0,12.0,13.0,14.0,15.0,17.0,18.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,28.0,30.0,31.0,32.0,34.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,45.0,46.0,47.0,49.0,50.0,51.0,53.0,54.0,55.0,56.0,58.0,59.0,60.0,62.0,63.0,64.0,65.0,66.0,67.0,73.0,76.0,78.0,79.0,80.0,81.0,82.0,83.0,84.0,85.0,87.0,88.0,89.0,90.0,91.0,92.0,93.0,94.0,95.0,96.0,97.0,98.0,99.0,100.0})
    If (feature 1 in {5.0,6.0,8.0,9.0,11.0,12.0,13.0,14.0,17.0,18.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,28.0,31.0,32.0,34.0,37.0,40.0,41.0,42.0,43.0,45.0,46.0,51.0,53.0,54.0,55.0,56.0,58.0,59.0,62.0,63.0,65.0,66.0,67.0,73.0,76.0,78.0,79.0,80.0,81.0,82.0,83.0,84.0,87.0,88.0,90.0,91.0,92.0,93.0,94.0,95.0,96.0,97.0,98.0,99.0,100.0})
     If (feature 1 in {5.0,6.0,8.0,9.0,11.0,12.0,13.0,14.0,17.0,18.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,32.0,37.0,40.0,41.0,43.0,45.0,53.0,

In [59]:
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [62]:
# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'suicide_risk')

In [63]:
# This is the area under the curve. This indicates that the data is highly seperable.
print("DTC")
print(my_binary_eval.evaluate(dtc_predictions))

# RFC improves accuracy but also model complexity. RFC outperforms DTC in nearly every situation.
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))

# We can't repeat these exact steps for GBT. If you print the schema of all three, you may be able to notice why.
# Instead, let's redefine the object:
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='suicide_risk', rawPredictionCol='prediction')
print("GBT")
print(my_binary_gbt_eval.evaluate(gbt_predictions))

DTC
0.9534551410912341
RFC
0.9666498849704965
GBT
0.9068846681285399


In [64]:
# Let's import the evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error. 
acc_evaluator = MulticlassClassificationEvaluator(labelCol="suicide_risk", predictionCol="prediction", metricName="accuracy")
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)
# Let's do something a bit more complex in terms of printing, just so it's formatted nicer. 
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*40)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*40)
print('An ensemble using GBT has an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 91.50%
----------------------------------------
A random forest ensemble has an accuracy of: 90.89%
----------------------------------------
An ensemble using GBT has an accuracy of: 92.22%


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval = BinaryClassificationEvaluator(rawPredictionCol='risk_pre',
                                       labelCol='suicide_risk')
results.select('Survived','prediction').show()
AUC = my_eval.evaluate(results)

AUC

In [15]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
selector = ChiSqSelector(numTopFeatures=8, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="risk")

result = selector.fit(data).transform(data)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()

IllegalArgumentException: 'Field "features" does not exist.'

In [7]:
data.columns

['country',
 'year',
 'sex',
 'age',
 'suicides/100k pop',
 'HDI for year',
 ' gdp_for_year ($) ',
 'gdp_per_capita ($)',
 'generation']

In [5]:
data.columns

['country',
 'year',
 'sex',
 'age',
 'suicides_no',
 'population',
 'suicides/100k pop',
 'country-year',
 'HDI for year',
 ' gdp_for_year ($) ',
 'gdp_per_capita ($)',
 'generation']

In [7]:
# Let's see the data. You'll notice nulls.
data.show()

+-------+----+------+-----------+-----------+----------+-----------------+------------+------------+------------------+------------------+---------------+
|country|year|   sex|        age|suicides_no|population|suicides/100k pop|country-year|HDI for year| gdp_for_year ($) |gdp_per_capita ($)|     generation|
+-------+----+------+-----------+-----------+----------+-----------------+------------+------------+------------------+------------------+---------------+
|Albania|1987|  male|15-24 years|         21|    312900|             6.71| Albania1987|        null|     2,156,624,900|               796|   Generation X|
|Albania|1987|  male|35-54 years|         16|    308000|             5.19| Albania1987|        null|     2,156,624,900|               796|         Silent|
|Albania|1987|female|15-24 years|         14|    289700|             4.83| Albania1987|        null|     2,156,624,900|               796|   Generation X|
|Albania|1987|  male|  75+ years|          1|     21800|             4

In [8]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns: "label" and "features".

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# Let's visualise the columns to help with assembly. 
data.columns

['country',
 'year',
 'sex',
 'age',
 'suicides_no',
 'population',
 'suicides/100k pop',
 'country-year',
 'HDI for year',
 ' gdp_for_year ($) ',
 'gdp_per_capita ($)',
 'generation']

In [15]:
!pip3 install --upgrade pip
!pip3 install seaborn

Collecting pip
  Downloading https://files.pythonhosted.org/packages/cb/28/91f26bd088ce8e22169032100d4260614fc3da435025ff389ef1d396a433/pip-20.2.4-py2.py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 914kB/s eta 0:00:01  4% |█▎                              | 61kB 12.5MB/s eta 0:00:01
[?25hInstalling collected packages: pip
Successfully installed pip-20.2.4
[33mDEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.9.1-py2.py3-none-any.whl (216 kB)
[K     |████████████████████████████████| 216 kB 10.9 MB/s eta 0:00:01
Collecting scipy>=0.17.1
  Downloading scipy-1.4.1-cp35-cp35m-manylinux1_x86_64.whl (26.0 MB)
[K     |█████████████████

In [5]:
# We can use the describe method get some general statistics on our data too. Remember to show the DataFrame!
# But what about data type?
data.describe(['country',
 'year',
 'sex',
 'age',
 'suicides_no',
 'population',
 'suicides/100k pop',
]).show()

+-------+----------+------------------+------+-----------+------------------+------------------+------------------+
|summary|   country|              year|   sex|        age|       suicides_no|        population| suicides/100k pop|
+-------+----------+------------------+------+-----------+------------------+------------------+------------------+
|  count|     27820|             27820| 27820|      27820|             27820|             27820|             27820|
|   mean|      null|2001.2583752695903|  null|       null|242.57440690150972|1844793.6173975556|12.816097411933894|
| stddev|      null| 8.469055024441408|  null|       null| 902.0479168336403|3911779.4417563565| 18.96151101450326|
|    min|   Albania|              1985|female|05-14 years|                 0|               278|               0.0|
|    max|Uzbekistan|              2016|  male|  75+ years|             22338|          43805214|            224.97|
+-------+----------+------------------+------+-----------+--------------

In [6]:
data.describe([
 'country-year',
 'HDI for year',
 ' gdp_for_year ($) ',
 'gdp_per_capita ($)',
 'generation']).show()

+-------+--------------+-------------------+------------------+------------------+----------+
|summary|  country-year|       HDI for year| gdp_for_year ($) |gdp_per_capita ($)|generation|
+-------+--------------+-------------------+------------------+------------------+----------+
|  count|         27820|               8364|             27820|             27820|     27820|
|   mean|          null| 0.7766011477761785|              null|16866.464414090584|      null|
| stddev|          null|0.09336670859029984|              null|18887.576472205576|      null|
|    min|   Albania1987|              0.483| 1,002,219,052,968|               251|   Boomers|
|    max|Uzbekistan2014|              0.944|       997,007,926|            126352|    Silent|
+-------+--------------+-------------------+------------------+------------------+----------+

