In [None]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to one
# machine's directory layout; fall back to the original install location when the
# variable is unset or empty. findspark.init() must run before pyspark is imported.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7')

import pyspark
from pyspark.sql import SparkSession

# Build the session used by every cell below (or reuse one that is already running).
spark = SparkSession.builder.appName('Data_Importance').getOrCreate()

In [None]:
# Load the registered-dog dataset. The file is a CSV with a header row; no schema
# is supplied here, so the numeric columns are cast explicitly in a later cell.
dog_data_merged = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Datasets/Dog_registred_hamilton_new_v1_2_6.csv")
)
dog_data_merged.show()

In [None]:
# Inspect the column names and types of the freshly loaded DataFrame.
dog_data_merged.printSchema()

In [None]:
from pyspark.sql.types import IntegerType
# Cast the numeric columns to IntegerType, one column at a time and in the same
# order as before; each pass replaces the column in place under its own name.
for int_column in ("Dog_Number", "Age", "Total_Complains", "Classification"):
    dog_data_merged = dog_data_merged.withColumn(
        int_column, dog_data_merged[int_column].cast(IntegerType())
    )

In [None]:
# Print the schema again to check the result of the integer casts.
dog_data_merged.printSchema()

In [None]:
# Row count per Classification value, to see how balanced the classes are.
dog_data_merged.groupBy("Classification").count().show()

In [None]:
from pyspark.sql.functions import col, explode, array, lit

# Split the rows by label: Classification 0 is treated as the majority class,
# Classification 1 as the minority class.
major_df = dog_data_merged.where(col("Classification") == 0)
minor_df = dog_data_merged.where(col("Classification") == 1)

# Whole number of minority-sized groups that fit into the majority class.
major_count = major_df.count()
minor_count = minor_df.count()
ratio = int(major_count / minor_count)
print("ratio: {}".format(ratio))

In [None]:
# Oversample the minority class so both classes end up with a similar row count.
# Guard against ratio < 1 (minority already at least as large as the majority):
# exploding an empty array would otherwise drop every minority row.
copies_per_row = max(ratio, 1)

# duplicate the minority rows: explode() emits one output row per array element,
# so attaching an array of `copies_per_row` literals repeats each row that many
# times; the helper column is dropped again straight away.
oversampled_df = (
    minor_df
    .withColumn("dummy", explode(array([lit(x) for x in range(copies_per_row)])))
    .drop('dummy')
)

# combine both oversampled minority rows and previous majority rows.
# union() replaces unionAll(), which has been deprecated since Spark 2.0; both
# keep duplicate rows and match columns by position.
combined_data = major_df.union(oversampled_df)
combined_data.show()

In [None]:
# Row count per Classification value after oversampling, to check the new balance.
combined_data.groupBy("Classification").count().show()

In [None]:
# Display the available column names before choosing the ones to keep.
combined_data.columns

In [None]:
# Keep only the columns used as model inputs plus the Classification label.
combined_data = combined_data.select(
    'Primary_Colour_Code',
    'Secondary_Colour_Code',
    'Age',
    'Animal_Sex',
    'Desexed',
    'Classification',
    'Microchip_Flag',
    'Total_Complains'
)

In [None]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [None]:
# Index the Primary_Colour_Code strings into a numeric Primary_Colour_Code_Index column.
primary_colour_indexer = StringIndexer(
    inputCol='Primary_Colour_Code',
    outputCol='Primary_Colour_Code_Index'
)
primary_colour_model = primary_colour_indexer.fit(combined_data)
primary_colour_indexed = primary_colour_model.transform(combined_data)

In [None]:
# Index the Secondary_Colour_Code strings, building on the frame that already
# carries the primary-colour index.
secondary_colour_indexer = StringIndexer(
    inputCol='Secondary_Colour_Code',
    outputCol='Secondary_Colour_Code_Index'
)
secondary_colour_model = secondary_colour_indexer.fit(primary_colour_indexed)
secondary_colour_indexd = secondary_colour_model.transform(primary_colour_indexed)

In [None]:
# Index the Animal_Sex strings into a numeric Animal_Sex_Index column.
sex_indexer = StringIndexer(
    inputCol='Animal_Sex',
    outputCol='Animal_Sex_Index'
)
sex_model = sex_indexer.fit(secondary_colour_indexd)
sex_indexed = sex_model.transform(secondary_colour_indexd)

In [None]:
# Index the Desexed strings into a numeric Desexed_Index column.
desexed_indexer = StringIndexer(
    inputCol='Desexed',
    outputCol='Desexed_Index'
)
desexed_model = desexed_indexer.fit(sex_indexed)
desexed_indexd = desexed_model.transform(sex_indexed)

In [None]:
# Index the Microchip_Flag strings; this is the last indexing step, so the
# resulting frame carries all five *_Index columns.
microchip_flag_indexer = StringIndexer(
    inputCol='Microchip_Flag',
    outputCol='Microchip_Flag_Index'
)
microchip_flag_model = microchip_flag_indexer.fit(desexed_indexd)
microchip_flag_indexd = microchip_flag_model.transform(desexed_indexd)

In [None]:
# Gather the two integer columns and the five indexed categorical columns into a
# single 'features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Total_Complains',
        'Primary_Colour_Code_Index',
        'Secondary_Colour_Code_Index',
        'Animal_Sex_Index',
        'Desexed_Index',
        'Microchip_Flag_Index',
    ],
    outputCol='features'
)

In [None]:
# Apply the assembler to the fully indexed frame, adding the 'features' column.
output = assembler.transform(microchip_flag_indexd)

In [None]:
# Keep only the assembled feature vector and the label for modelling.
final_data = output.select("features",'Classification')

In [None]:
# 70/30 train/test split with a fixed seed, so the split (and every metric
# computed from it) is repeatable from one run to the next.
# NOTE(review): the minority rows were duplicated before this split, so copies of
# the same row can land in both train_data and test_data and inflate the test
# metrics; consider splitting first and oversampling only the training portion.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [None]:
# All three tree-based classifiers read the same label and feature columns; only
# the single decision tree gets an explicit depth limit.
shared_columns = dict(labelCol='Classification', featuresCol='features')
dtc = DecisionTreeClassifier(maxDepth=3, **shared_columns)
rfc = RandomForestClassifier(**shared_columns)
gbt = GBTClassifier(**shared_columns)

In [None]:
# Fit each classifier on the training split, in the order tree, forest, GBT.
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
]

In [None]:
# Score the held-out split with each fitted model, in the order tree, forest, GBT.
dtc_predictions, rfc_predictions, gbt_predictions = [
    fitted_model.transform(test_data)
    for fitted_model in (dtc_model, rfc_model, gbt_model)
]

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluator applied to the decision-tree and random-forest predictions; labels are
# read from 'Classification'.
# NOTE(review): no metricName or rawPredictionCol is passed, so the library
# defaults apply (presumably areaUnderROC computed from 'rawPrediction'); confirm
# against the PySpark docs for this Spark version.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'Classification')

In [None]:
# The GBT predictions are scored on the hard 'prediction' column instead of a raw
# score column.
# NOTE(review): presumably because GBTClassifier in this Spark version does not
# emit 'rawPrediction'; confirm. Scoring 0/1 predictions yields a single-threshold
# ROC, so the GBT figure is not directly comparable with the other two.
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='Classification', rawPredictionCol='prediction')
dtc_eva = my_binary_eval.evaluate(dtc_predictions)
rfc_eva = my_binary_eval.evaluate(rfc_predictions)
gbt_eva = my_binary_gbt_eval.evaluate(gbt_predictions)

# BinaryClassificationEvaluator reports area under the ROC curve by default, not
# accuracy, so the lines below are labelled accordingly (accuracy is computed
# separately with MulticlassClassificationEvaluator).
print("Here are the BinaryClassification results!")
print('-'*40)
print('A single decision tree has an area under ROC of: {0:2.2f}%'.format(dtc_eva*100))
print('-'*40)
print('A random forest ensemble has an area under ROC of: {0:2.2f}%'.format(rfc_eva*100))
print('-'*40)
print('An ensemble using GBT has an area under ROC of: {0:2.2f}%'.format(gbt_eva*100))


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Evaluator that compares the 'prediction' column against the 'Classification'
# label and reports the "accuracy" metric.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="Classification",
    predictionCol="prediction",
    metricName="accuracy"
)

In [None]:
# Accuracy of each model on the held-out predictions, in the order tree, forest, GBT.
dtc_acc, rfc_acc, gbt_acc = [
    acc_evaluator.evaluate(predictions)
    for predictions in (dtc_predictions, rfc_predictions, gbt_predictions)
]

In [None]:
# Report the accuracy of each model as a percentage. The header names the
# evaluator family that produced the numbers (it was previously misspelled as
# "MulticalssBinary").
print("Here are the MulticlassClassification results!")
for model_label, model_acc in (
    ('A single decision tree', dtc_acc),
    ('A random forest ensemble', rfc_acc),
    ('An ensemble using GBT', gbt_acc),
):
    print('-'*40)
    print('{0} has an accuracy of: {1:2.2f}%'.format(model_label, model_acc*100))

In [None]:
from pyspark.ml.classification import DecisionTreeClassificationModel

In [None]:
# Print the fitted decision tree's debug string so its structure can be inspected.
print(dtc_model.toDebugString)