In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Data_Importance').getOrCreate()

In [2]:
# Let's read in the data. Note that it's in the format of JSON.
dog_data_merged = spark.read.load("Datasets/Dog_registred_hamilton_new_v1_2_6.csv", format="csv", header ="true")
dog_data_merged.show()

+----------+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+
|Dog_Number|Primary_Colour_Code|Secondary_Colour_Code|Age|Animal_Sex|Desexed|Worker|Classification|Microchip_Flag|Total_Complains|
+----------+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+
|    151010|               DARK|                 DARK|  0|         F|      Y|     N|             0|             Y|              0|
|    173567|               DARK|                LIGHT|  0|         M|      N|     N|             0|             Y|              0|
|    192345|              LIGHT|                LIGHT|  0|         F|      N|     N|             0|             N|              0|
|    193685|              LIGHT|                LIGHT|  0|         M|      N|     N|             0|             Y|              0|
|    218377|              LIGHT|                 DARK|  0|         M|      Y|     N

In [3]:
dog_data_merged.printSchema()

root
 |-- Dog_Number: string (nullable = true)
 |-- Primary_Colour_Code: string (nullable = true)
 |-- Secondary_Colour_Code: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Animal_Sex: string (nullable = true)
 |-- Desexed: string (nullable = true)
 |-- Worker: string (nullable = true)
 |-- Classification: string (nullable = true)
 |-- Microchip_Flag: string (nullable = true)
 |-- Total_Complains: string (nullable = true)



In [4]:
from pyspark.sql.types import IntegerType
dog_data_merged = dog_data_merged.withColumn("Dog_Number",dog_data_merged["Dog_Number"].cast(IntegerType()))
dog_data_merged = dog_data_merged.withColumn("Age",dog_data_merged["Age"].cast(IntegerType()))
dog_data_merged = dog_data_merged.withColumn("Total_Complains",dog_data_merged["Total_Complains"].cast(IntegerType()))
dog_data_merged = dog_data_merged.withColumn("Classification",dog_data_merged["Classification"].cast(IntegerType()))

In [5]:
dog_data_merged.printSchema()

root
 |-- Dog_Number: integer (nullable = true)
 |-- Primary_Colour_Code: string (nullable = true)
 |-- Secondary_Colour_Code: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Animal_Sex: string (nullable = true)
 |-- Desexed: string (nullable = true)
 |-- Worker: string (nullable = true)
 |-- Classification: integer (nullable = true)
 |-- Microchip_Flag: string (nullable = true)
 |-- Total_Complains: integer (nullable = true)



In [7]:
dog_data_merged.groupBy("Classification").count().show()

+--------------+-----+
|Classification|count|
+--------------+-----+
|             1| 1504|
|             0|57545|
+--------------+-----+



In [8]:
from pyspark.sql.functions import col, explode, array, lit

major_df = dog_data_merged.filter(col("Classification") == 0)
minor_df = dog_data_merged.filter(col("Classification") == 1)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))

ratio: 38


In [9]:
a = range(ratio)
# duplicate the minority rows
oversampled_df = minor_df.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
# combine both oversampled minority rows and previous majority rows 
combined_data = major_df.unionAll(oversampled_df)
combined_data.show()

+----------+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+
|Dog_Number|Primary_Colour_Code|Secondary_Colour_Code|Age|Animal_Sex|Desexed|Worker|Classification|Microchip_Flag|Total_Complains|
+----------+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+
|    151010|               DARK|                 DARK|  0|         F|      Y|     N|             0|             Y|              0|
|    173567|               DARK|                LIGHT|  0|         M|      N|     N|             0|             Y|              0|
|    192345|              LIGHT|                LIGHT|  0|         F|      N|     N|             0|             N|              0|
|    193685|              LIGHT|                LIGHT|  0|         M|      N|     N|             0|             Y|              0|
|    218377|              LIGHT|                 DARK|  0|         M|      Y|     N

In [10]:
combined_data.groupBy("Classification").count().show()

+--------------+-----+
|Classification|count|
+--------------+-----+
|             1|57152|
|             0|57545|
+--------------+-----+



In [11]:
combined_data.columns

['Dog_Number',
 'Primary_Colour_Code',
 'Secondary_Colour_Code',
 'Age',
 'Animal_Sex',
 'Desexed',
 'Worker',
 'Classification',
 'Microchip_Flag',
 'Total_Complains']

In [12]:
combined_data = combined_data.select(['Primary_Colour_Code',
 'Secondary_Colour_Code',
 'Age',
 'Animal_Sex',
 'Desexed',
 'Classification',
 'Microchip_Flag',
 'Total_Complains'])

In [13]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [14]:
primary_colour_indexer = StringIndexer(inputCol='Primary_Colour_Code',\
                                       outputCol='Primary_Colour_Code_Index',)
primary_colour_encoder = OneHotEncoder(inputCol='Primary_Colour_Code_Index',\
                                       outputCol='Primary_Colour_Vec')

In [15]:
secondary_colour_indexer = StringIndexer(inputCol='Secondary_Colour_Code',\
                                         outputCol='Secondary_Colour_Code_Index', )
secondary_colour_encoder = OneHotEncoder(inputCol='Secondary_Colour_Code_Index',\
                                         outputCol='Secondary_Colour_Vec')

In [16]:
sex_indexer = StringIndexer(inputCol='Animal_Sex',\
                                       outputCol='Animal_Sex_Index', )
sex_encoder = OneHotEncoder(inputCol='Animal_Sex_Index',\
                                       outputCol='Animal_Sex_Vec')

In [17]:
desexed_indexer = StringIndexer(inputCol='Desexed',\
                                       outputCol='Desexed_Index', )
desexed_encoder = OneHotEncoder(inputCol='Desexed_Index', outputCol='Desexed_Vec')

In [18]:
microchip_flag_indexer = StringIndexer(inputCol='Microchip_Flag',\
                                       outputCol='Microchip_Flag_Index', )
microchip_flag_encoder = OneHotEncoder(inputCol='Microchip_Flag_Index',\
                                       outputCol='Microchip_Flag_Vec')

In [19]:
assembler = VectorAssembler(inputCols=['Primary_Colour_Vec',
 'Secondary_Colour_Vec','Age','Animal_Sex_Vec','Desexed_Vec','Microchip_Flag_Vec','Total_Complains'], outputCol='features')

In [20]:
from pyspark.ml.classification import LogisticRegression

In [21]:
from pyspark.ml import Pipeline

In [22]:
log_reg_bite = LogisticRegression(featuresCol='features',labelCol='Classification')

In [23]:
pipeline = Pipeline(stages = [primary_colour_indexer, secondary_colour_indexer, sex_indexer,
                            desexed_indexer, microchip_flag_indexer,
                           primary_colour_encoder, secondary_colour_encoder, sex_encoder,
                            desexed_encoder, microchip_flag_encoder, 
                            assembler, log_reg_bite])

In [24]:
train_bite_data, test_bite_data = combined_data.randomSplit([0.7,.3])
print("Training Dataset Count: " + str(train_bite_data.count()))
print("Test Dataset Count: " + str(test_bite_data.count()))

Training Dataset Count: 80459
Test Dataset Count: 34238


In [25]:
fit_model = pipeline.fit(train_bite_data)

In [26]:
results = fit_model.transform(test_bite_data)

In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Classification')

In [30]:
AUC = my_eval.evaluate(results)

AUC

0.8092084931644584

In [32]:
results.columns

['Primary_Colour_Code',
 'Secondary_Colour_Code',
 'Age',
 'Animal_Sex',
 'Desexed',
 'Classification',
 'Microchip_Flag',
 'Total_Complains',
 'Primary_Colour_Code_Index',
 'Secondary_Colour_Code_Index',
 'Animal_Sex_Index',
 'Desexed_Index',
 'Microchip_Flag_Index',
 'Primary_Colour_Vec',
 'Secondary_Colour_Vec',
 'Animal_Sex_Vec',
 'Desexed_Vec',
 'Microchip_Flag_Vec',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [33]:
totalResults = results.select('Classification','prediction')

correctResults = totalResults.filter(totalResults['Classification'] == totalResults['prediction'])

wrongResults = totalResults.filter(totalResults['Classification'] != totalResults['prediction'])

countTR = totalResults.count()
print("Correct: " + str(countTR))

countTC = correctResults.count()
print("Total Correct: " + str(countTC))

countTW = wrongResults.count()
print("Total Wrong: " + str(countTW))

ratioCorrect = countTC / countTR
print("Correct Ratio: " + str(ratioCorrect))

ratioWrong = countTW / countTR
print("Wrong Ratio: " + str(ratioWrong))

Correct: 34238
Total Correct: 27705
Total Wrong: 6533
Correct Ratio: 0.809188620830656
Wrong Ratio: 0.19081137916934401
