In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Prefer the SPARK_HOME environment variable when it is set so the notebook is
# not tied to a single machine's directory layout; otherwise fall back to the
# install location this notebook was originally written against.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'))

# pyspark is imported only after findspark.init() has run.
import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise create one.
spark = SparkSession.builder.appName('Data_Importance').getOrCreate()

In [2]:
# Let's read in the data. Note that it is a CSV file whose first row holds the
# column names; without schema inference every column is loaded as a string.
dog_data_merged = spark.read.csv("Datasets/Dog_registred_hamilton_new_v1_2_6.csv", header=True)
dog_data_merged.show()

+----------+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+
|Dog_Number|Primary_Colour_Code|Secondary_Colour_Code|Age|Animal_Sex|Desexed|Worker|Classification|Microchip_Flag|Total_Complains|
+----------+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+
|    151010|               DARK|                 DARK|  0|         F|      Y|     N|             0|             Y|              0|
|    173567|               DARK|                LIGHT|  0|         M|      N|     N|             0|             Y|              0|
|    192345|              LIGHT|                LIGHT|  0|         F|      N|     N|             0|             N|              0|
|    193685|              LIGHT|                LIGHT|  0|         M|      N|     N|             0|             Y|              0|
|    218377|              LIGHT|                 DARK|  0|         M|      Y|     N

In [3]:
# Inspect the column types as loaded from the CSV, before any casting.
dog_data_merged.printSchema()

root
 |-- Dog_Number: string (nullable = true)
 |-- Primary_Colour_Code: string (nullable = true)
 |-- Secondary_Colour_Code: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Animal_Sex: string (nullable = true)
 |-- Desexed: string (nullable = true)
 |-- Worker: string (nullable = true)
 |-- Classification: string (nullable = true)
 |-- Microchip_Flag: string (nullable = true)
 |-- Total_Complains: string (nullable = true)



In [4]:
from pyspark.sql.types import IntegerType

# These columns were loaded as strings; replace each one, in place, with an
# integer-typed version of itself.
for int_col_name in ("Dog_Number", "Age", "Total_Complains", "Classification"):
    dog_data_merged = dog_data_merged.withColumn(
        int_col_name,
        dog_data_merged[int_col_name].cast(IntegerType()),
    )

In [5]:
# Re-check the schema to confirm the four casted columns are now integers.
dog_data_merged.printSchema()

root
 |-- Dog_Number: integer (nullable = true)
 |-- Primary_Colour_Code: string (nullable = true)
 |-- Secondary_Colour_Code: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Animal_Sex: string (nullable = true)
 |-- Desexed: string (nullable = true)
 |-- Worker: string (nullable = true)
 |-- Classification: integer (nullable = true)
 |-- Microchip_Flag: string (nullable = true)
 |-- Total_Complains: integer (nullable = true)



In [6]:
# Tally the rows for each Classification value to see how the label is distributed.
classification_counts = dog_data_merged.groupBy("Classification").count()
classification_counts.show()

+--------------+-----+
|Classification|count|
+--------------+-----+
|             1| 1504|
|             0|57545|
+--------------+-----+



In [7]:
# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
# NOTE(review): dog_dataPanda_merged is not referenced by any later cell
# visible here — confirm it is still needed.
dog_dataPanda_merged = dog_data_merged.toPandas()

In [8]:
# List the column names so they can be referenced when choosing features.
dog_data_merged.columns

['Dog_Number',
 'Primary_Colour_Code',
 'Secondary_Colour_Code',
 'Age',
 'Animal_Sex',
 'Desexed',
 'Worker',
 'Classification',
 'Microchip_Flag',
 'Total_Complains']

In [9]:
from pyspark.ml.feature import RFormula

# R-style model formula: Classification is the response and every other column
# except Dog_Number is a predictor. The run of spaces at the start of the second
# fragment is kept so the formula string is exactly what it was before.
formula_text = (
    "Classification ~ Primary_Colour_Code + Secondary_Colour_Code + Age +"
    "    Animal_Sex + Desexed + Worker + Microchip_Flag+ Total_Complains"
)
formula = RFormula(formula=formula_text, featuresCol="features", labelCol="label")

In [10]:
# Fit the formula on the data, then apply it to the same data to produce the
# "features" and "label" columns.
formula_model = formula.fit(dog_data_merged)
output = formula_model.transform(dog_data_merged)
output.select("features", "label").show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(8,[0,5,6],[1.0,1...|  0.0|
|[1.0,1.0,0.0,1.0,...|  0.0|
|(8,[1,4,5],[1.0,1...|  0.0|
|[0.0,1.0,0.0,1.0,...|  0.0|
|(8,[3,5,6],[1.0,1...|  0.0|
|(8,[0,1,5,6],[1.0...|  0.0|
|(8,[1,4,5,6],[1.0...|  0.0|
|[1.0,0.0,0.0,1.0,...|  0.0|
|[0.0,1.0,0.0,1.0,...|  0.0|
|[1.0,1.0,0.0,0.0,...|  0.0|
|[1.0,1.0,0.0,0.0,...|  0.0|
|[1.0,1.0,0.0,1.0,...|  0.0|
|(8,[1,4,5,6],[1.0...|  0.0|
|[0.0,1.0,0.0,1.0,...|  0.0|
|(8,[1,4,5],[1.0,1...|  0.0|
|(8,[1,4,5],[1.0,1...|  0.0|
|(8,[0,3,4,5],[1.0...|  0.0|
|(8,[0,1,4,5],[1.0...|  0.0|
|(8,[0,4,5,6],[1.0...|  0.0|
|[1.0,1.0,0.0,1.0,...|  0.0|
+--------------------+-----+
only showing top 20 rows



In [11]:
# Keep the predictor columns and the Classification column; Dog_Number is the
# only column of dog_data_merged that is left out.
selected_column_names = [
    'Primary_Colour_Code',
    'Secondary_Colour_Code',
    'Age',
    'Animal_Sex',
    'Desexed',
    'Worker',
    'Classification',
    'Microchip_Flag',
    'Total_Complains',
]
dog_selected_data = dog_data_merged.select(selected_column_names)

In [12]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [13]:
# Primary colour: index the string codes, then one-hot encode the index.
primary_colour_indexer = StringIndexer(
    inputCol='Primary_Colour_Code',
    outputCol='Primary_Colour_Code_Index',
)
primary_colour_encoder = OneHotEncoder(
    inputCol=primary_colour_indexer.getOutputCol(),
    outputCol='Primary_Colour_Vec',
)

In [14]:
# Secondary colour: index the string codes, then one-hot encode the index.
secondary_colour_indexer = StringIndexer(
    inputCol='Secondary_Colour_Code',
    outputCol='Secondary_Colour_Code_Index',
)
secondary_colour_encoder = OneHotEncoder(
    inputCol=secondary_colour_indexer.getOutputCol(),
    outputCol='Secondary_Colour_Vec',
)

In [15]:
# Animal sex: index the string values, then one-hot encode the index.
sex_indexer = StringIndexer(
    inputCol='Animal_Sex',
    outputCol='Animal_Sex_Index',
)
sex_encoder = OneHotEncoder(
    inputCol=sex_indexer.getOutputCol(),
    outputCol='Animal_Sex_Vec',
)

In [16]:
# Desexed flag: index the string values, then one-hot encode the index.
desexed_indexer = StringIndexer(
    inputCol='Desexed',
    outputCol='Desexed_Index',
)
desexed_encoder = OneHotEncoder(
    inputCol=desexed_indexer.getOutputCol(),
    outputCol='Desexed_Vec',
)

In [17]:
# Worker flag: index the string values, then one-hot encode the index.
worker_indexer = StringIndexer(
    inputCol='Worker',
    outputCol='Worker_Index',
)
worker_encoder = OneHotEncoder(
    inputCol=worker_indexer.getOutputCol(),
    outputCol='Worker_Vec',
)

In [18]:
# Microchip flag: index the string values, then one-hot encode the index.
microchip_flag_indexer = StringIndexer(
    inputCol='Microchip_Flag',
    outputCol='Microchip_Flag_Index',
)
microchip_flag_encoder = OneHotEncoder(
    inputCol=microchip_flag_indexer.getOutputCol(),
    outputCol='Microchip_Flag_Vec',
)

In [19]:
# Names of the StringIndexer variables defined in the cells above.
# NOTE(review): the entries are plain strings, not the indexer objects, and this
# list is not referenced by the later cells visible here (the Pipeline lists the
# stage objects directly) — confirm whether the objects were intended.
string_indexes = ['primary_colour_indexer', 'secondary_colour_indexer', 'sex_indexer',\
                            'desexed_indexer', 'worker_indexer', 'microchip_flag_indexer']

In [20]:
# Names of the OneHotEncoder variables defined in the cells above.
# NOTE(review): the entries are plain strings, not the encoder objects, and this
# list is not referenced by the later cells visible here (the Pipeline lists the
# stage objects directly) — confirm whether the objects were intended.
onehot_indexes = ['primary_colour_encoder', 'secondary_colour_encoder', 'sex_encoder',
                            'desexed_encoder', 'worker_encoder', 'microchip_flag_encoder']

In [21]:
# Index the Classification column into the 'label' column that the classifier reads.
label_indexes = StringIndexer(inputCol = 'Classification', outputCol = 'label')

In [22]:
# Concatenate the one-hot vectors and the two integer columns (Age and
# Total_Complains) into a single 'features' vector, in this order.
assembler_input_cols = [
    'Primary_Colour_Vec',
    'Secondary_Colour_Vec',
    'Age',
    'Animal_Sex_Vec',
    'Desexed_Vec',
    'Worker_Vec',
    'Microchip_Flag_Vec',
    'Total_Complains',
]
assembler = VectorAssembler(inputCols=assembler_input_cols, outputCol='features')

In [23]:
from pyspark.ml.classification import  RandomForestClassifier

# Random forest over the assembled 'features' vector, predicting 'label'.
# The seed is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    seed=8464,
    numTrees=10,
    cacheNodeIds=True,
    subsamplingRate=0.7,
)

In [24]:
from pyspark.ml import Pipeline

# Stage order: all string indexers, then the encoders that consume their
# output, then the feature assembler, the label indexer and the classifier.
indexer_stages = [
    primary_colour_indexer,
    secondary_colour_indexer,
    sex_indexer,
    desexed_indexer,
    worker_indexer,
    microchip_flag_indexer,
]
encoder_stages = [
    primary_colour_encoder,
    secondary_colour_encoder,
    sex_encoder,
    desexed_encoder,
    worker_encoder,
    microchip_flag_encoder,
]
pipe = Pipeline(stages=indexer_stages + encoder_stages + [assembler, label_indexes, rf])

In [25]:
# Fit every pipeline stage (indexers, encoders, assembler, label indexer, random forest)
# on the selected columns.
mod = pipe.fit(dog_selected_data)

In [26]:
# Apply the fitted pipeline to the same data it was fitted on; df2 gains the
# intermediate index/vector columns plus the model's prediction columns.
df2 = mod.transform(dog_selected_data)

In [27]:
# The last pipeline stage is the fitted random forest; show its per-feature importances.
# Indices refer to positions in the assembled 'features' vector.
# NOTE(review): presumably these follow the VectorAssembler input order — confirm
# before mapping an index back to a column name.
mod.stages[-1].featureImportances

SparseVector(8, {0: 0.0347, 1: 0.018, 2: 0.1045, 3: 0.0029, 4: 0.2384, 5: 0.0, 6: 0.1451, 7: 0.4564})

In [28]:
# Show the assembled feature vector next to the original Classification value.
features_and_class = df2.select('features', 'Classification')
features_and_class.show()

+--------------------+--------------+
|            features|Classification|
+--------------------+--------------+
|(8,[0,5,6],[1.0,1...|             0|
|[1.0,1.0,0.0,1.0,...|             0|
|(8,[1,4,5],[1.0,1...|             0|
|[0.0,1.0,0.0,1.0,...|             0|
|(8,[3,5,6],[1.0,1...|             0|
|(8,[0,1,5,6],[1.0...|             0|
|(8,[1,4,5,6],[1.0...|             0|
|[1.0,0.0,0.0,1.0,...|             0|
|[0.0,1.0,0.0,1.0,...|             0|
|[1.0,1.0,0.0,0.0,...|             0|
|[1.0,1.0,0.0,0.0,...|             0|
|[1.0,1.0,0.0,1.0,...|             0|
|(8,[1,4,5,6],[1.0...|             0|
|[0.0,1.0,0.0,1.0,...|             0|
|(8,[1,4,5],[1.0,1...|             0|
|(8,[1,4,5],[1.0,1...|             0|
|(8,[0,3,4,5],[1.0...|             0|
|(8,[0,1,4,5],[1.0...|             0|
|(8,[0,4,5,6],[1.0...|             0|
|[1.0,1.0,0.0,1.0,...|             0|
+--------------------+--------------+
only showing top 20 rows



In [29]:
# List every column present after the pipeline transform.
df2.columns

['Primary_Colour_Code',
 'Secondary_Colour_Code',
 'Age',
 'Animal_Sex',
 'Desexed',
 'Worker',
 'Classification',
 'Microchip_Flag',
 'Total_Complains',
 'Primary_Colour_Code_Index',
 'Secondary_Colour_Code_Index',
 'Animal_Sex_Index',
 'Desexed_Index',
 'Worker_Index',
 'Microchip_Flag_Index',
 'Primary_Colour_Vec',
 'Secondary_Colour_Vec',
 'Animal_Sex_Vec',
 'Desexed_Vec',
 'Worker_Vec',
 'Microchip_Flag_Vec',
 'features',
 'label',
 'rawPrediction',
 'probability',
 'prediction']

In [30]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

# Chi-squared feature selection against the Classification column, keeping
# only the single top-ranked entry of the 'features' vector.
selector = ChiSqSelector(
    numTopFeatures=1,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="Classification",
)

# Fit on the pipeline output and add the 'selectedFeatures' column to it.
selector_model = selector.fit(df2)
result = selector_model.transform(df2)

top_feature_count = selector.getNumTopFeatures()
print("ChiSqSelector output with top %d features selected" % top_feature_count)
result.show()

ChiSqSelector output with top 1 features selected
+-------------------+---------------------+---+----------+-------+------+--------------+--------------+---------------+-------------------------+---------------------------+----------------+-------------+------------+--------------------+------------------+--------------------+--------------+-------------+-------------+------------------+--------------------+-----+--------------------+--------------------+----------+----------------+
|Primary_Colour_Code|Secondary_Colour_Code|Age|Animal_Sex|Desexed|Worker|Classification|Microchip_Flag|Total_Complains|Primary_Colour_Code_Index|Secondary_Colour_Code_Index|Animal_Sex_Index|Desexed_Index|Worker_Index|Microchip_Flag_Index|Primary_Colour_Vec|Secondary_Colour_Vec|Animal_Sex_Vec|  Desexed_Vec|   Worker_Vec|Microchip_Flag_Vec|            features|label|       rawPrediction|         probability|prediction|selectedFeatures|
+-------------------+---------------------+---+----------+-------+------+-

In [31]:
# Row count after feature selection, for comparison with the source data.
result.count()

59049

In [32]:
# Row count of the source data.
dog_data_merged.count()

59049

In [33]:
# Tally the rows for each Classification value in the source data once more.
classification_counts = dog_data_merged.groupBy("Classification").count()
classification_counts.show()

+--------------+-----+
|Classification|count|
+--------------+-----+
|             1| 1504|
|             0|57545|
+--------------+-----+

