### Decision Tree

#### Problem statement:
    - Features: duration, product-related duration, region, month, bounce rates, etc.
    
    - Target: Revenue (yes/no)
    - Goal: To create a well-trained supervised model that predicts Revenue as yes or no (binary classification)

In [1]:
# Echo the `spark` object (used below through `spark.read`) to confirm it is available.
# NOTE(review): presumably the SparkSession pre-created by the PySpark kernel — confirm.
spark

In [2]:
# Echo the `sc` object to confirm it is available.
# NOTE(review): presumably the SparkContext pre-created by the PySpark kernel — confirm.
sc

#### 1. Read Dataset

In [3]:
# Load the CSV, treating the first line as column names and letting Spark
# infer each column's data type from the values.
online_shoppers = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/hadoop/Downloads/Online Shoppers Intention.csv")
)

#### 2. Show Schema

In [4]:
# Mark the DataFrame for caching; several later cells run actions on it
# (counts, shows). The call returns the DataFrame, whose repr is displayed.
online_shoppers.cache()

DataFrame[Administrative: int, Administrative_Duration: double, Informational: int, Informational_Duration: double, ProductRelated: int, ProductRelated_Duration: double, BounceRates: double, ExitRates: double, PageValues: double, SpecialDay: double, Month: string, OperatingSystems: int, Browser: int, Region: int, TrafficType: int, VisitorType: string, Weekend: boolean, Revenue: boolean]

In [5]:
# Print the column names, inferred types and nullability of the loaded data.
online_shoppers.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)



#### 3. Data Wrangling
    - Are there any missing values?

In [6]:
from pyspark.sql.functions import *

In [7]:
# Build one aggregate per column that counts the rows where that column is
# null, then show all of the counts as a single-row table.
null_counts = [
    count(when(isnull(name), name)).alias(name)
    for name in online_shoppers.columns
]
online_shoppers.select(null_counts).show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+---------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+---------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+
|            14|                     14|           14|                    14|            14|                     14|         14|       14|         0|         0|    0|               0|      0|     0|          0|          0|      0|      0|
+--------------+-----------------------+----

In [8]:
# Discard every row that contains a null in any column.
shoppers_df = online_shoppers.dropna()

In [9]:
# Number of rows removed by the null drop: row count before minus row count after.
online_shoppers.count()-shoppers_df.count()

14

In [10]:
# Rows remaining after the rows containing nulls were dropped.
shoppers_df.count()

12316

In [11]:
!pip install numpy pandas matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable


In [12]:
# Preview the first five rows as a pandas DataFrame. Limiting on the Spark
# side first means only those rows are collected to the driver, instead of
# converting the entire dataset to pandas just to display its head.
shoppers_df.limit(5).toPandas()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,-1.0,0,-1.0,1,-1.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


#### 4. Data preprocessing
    Transform the categorical (string) columns into numerical values

In [13]:
# List the distinct values of the string column Month before encoding it.
shoppers_df.select('Month').distinct().show()

+-----+
|Month|
+-----+
|  Oct|
|  Sep|
|  Dec|
|  Aug|
|  May|
| June|
|  Feb|
|  Nov|
|  Mar|
|  Jul|
+-----+



In [14]:
# List the distinct values of the string column VisitorType before encoding it.
shoppers_df.select('VisitorType').distinct().show()

+-----------------+
|      VisitorType|
+-----------------+
|      New_Visitor|
|            Other|
|Returning_Visitor|
+-----------------+



In [15]:
# Replace each VisitorType label with a digit string, one label at a time and
# in this order. The column stays a string column here; it is cast to an
# integer in a later cell.
visitor_type_codes = [
    ('New_Visitor', "0"),
    ('Other', "1"),
    ('Returning_Visitor', "2"),
]
for visitor_label, visitor_code in visitor_type_codes:
    shoppers_df = shoppers_df.withColumn(
        "VisitorType",
        regexp_replace('VisitorType', visitor_label, visitor_code),
    )

##### StringIndexer() - Encode categorical string values as numerical label indices (0, 1, 2, ...)

In [16]:
# Check the recoding: VisitorType should now contain only the codes 0, 1 and 2.
shoppers_df.select('VisitorType').distinct().show()

+-----------+
|VisitorType|
+-----------+
|          0|
|          1|
|          2|
+-----------+



In [17]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [18]:
# Learn a numeric index for each distinct Month string, then append those
# indices to the data as the new column Month_index.
indexer = StringIndexer(outputCol='Month_index', inputCol='Month')
month_index_model = indexer.fit(shoppers_df)
shoppers_df1 = month_index_model.transform(shoppers_df)

# Preview the result, including the added Month_index column.
shoppers_df1.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+-----------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|Month_index|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+-----------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|  Feb|               1|      1|     1|          1|          2|  false|  false| 

In [19]:
# List the distinct index values that were assigned to the Month labels.
shoppers_df1.select('Month_index').distinct().show()

+-----------+
|Month_index|
+-----------+
|        8.0|
|        0.0|
|        7.0|
|        1.0|
|        4.0|
|        3.0|
|        2.0|
|        6.0|
|        5.0|
|        9.0|
+-----------+



In [30]:
# Drop the original string column Month; Month_index now carries its numeric encoding.
shoppers_df4 = shoppers_df1.drop('Month')

In [31]:
# Confirm that Month is gone and Month_index is present in the schema.
shoppers_df4.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)
 |-- Month_index: double (nullable = false)



In [32]:
from pyspark.sql.types import IntegerType

# VisitorType still holds its 0/1/2 codes as strings at this point; convert
# the column to an integer type.
shoppers_df5 = shoppers_df4.withColumn(
    'VisitorType',
    shoppers_df4['VisitorType'].cast(IntegerType()),
)

In [33]:
# Check that the integer cast kept the three codes and produced no nulls.
shoppers_df5.select('VisitorType').distinct().show()

+-----------+
|VisitorType|
+-----------+
|          1|
|          2|
|          0|
+-----------+



In [34]:
# List the column names that are available as candidate model inputs.
shoppers_df5.columns

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates',
 'ExitRates',
 'PageValues',
 'SpecialDay',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType',
 'VisitorType',
 'Weekend',
 'Revenue',
 'Month_index']

#### 5. Feature Vector

In [25]:
from pyspark.ml.feature import VectorAssembler

In [27]:
# Input columns for the model, in the order they appear in the assembled
# vector: every column of the prepared data except the Revenue label.
feature_columns = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'VisitorType',
    'Weekend',
    'Month_index',
]

# Combines the columns above into one vector column named 'feature'.
vector_assembler = VectorAssembler(outputCol='feature', inputCols=feature_columns)

In [35]:
# Check the column types of the data that will be passed to the assembler.
shoppers_df5.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: integer (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)
 |-- Month_index: double (nullable = false)



In [36]:
# Append the assembled 'feature' vector column to the prepared data.
final_shopper = vector_assembler.transform(shoppers_df5)

In [38]:
# Preview all columns, then show the 'feature' vectors alone without truncation.
final_shopper.show()
final_shopper.select(['feature']).show(truncate=False)

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+----------------+-------+------+-----------+-----------+-------+-------+-----------+--------------------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|Month_index|             feature|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+----------------+-------+------+-----------+-----------+-------+-------+-----------+--------------------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|               1|      1|     1|  

In [42]:
# List the distinct values of the Revenue label (still boolean at this point).
final_shopper.select(['Revenue']).distinct().show()

+-------+
|Revenue|
+-------+
|   true|
|  false|
+-------+



In [53]:
# Convert the boolean Revenue label into an integer column so it can be used
# as the classifier's label.
final_shoppers2 = final_shopper.withColumn(
    'Revenue',
    final_shopper['Revenue'].cast("int"),
)

#### 6. Split Dataset into train & test
    - randomSplit()

In [54]:
# Randomly split the rows roughly 80/20 into training and test sets; the
# fixed seed makes the split repeatable.
train, test = final_shoppers2.randomSplit(weights=[0.8, 0.2], seed=12)

In [55]:
# Preview the training split.
train.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+----------------+-------+------+-----------+-----------+-------+-------+-----------+--------------------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|Month_index|             feature|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+----------------+-------+------+-----------+-----------+-------+-------+-----------+--------------------+
|             0|                   -1.0|            0|                  -1.0|             1|                   -1.0|        0.0|0.066666667|       0.0|       0.0|               2|      2|     7|  

In [56]:
# Select the label and feature columns. No action is called, so the cell only
# displays the resulting DataFrame's column names and types, not any rows.
train.select(['Revenue','feature'])

DataFrame[Revenue: int, feature: vector]

In [57]:
# NOTE(review): this repeats the Revenue integer cast from an earlier cell
# verbatim. `train` and `test` were already split from that earlier result,
# so this reassignment does not affect them — candidate for removal.
final_shoppers2 = final_shopper.withColumn('Revenue',col('Revenue').cast("integer"))

#### 7. Decision Tree

In [58]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure a decision tree that reads the assembled 'feature' vector and
# learns to predict the integer Revenue label, then fit it on the training split.
tree = DecisionTreeClassifier(
    labelCol='Revenue',
    featuresCol='feature',
)
model = tree.fit(train)

In [59]:
# Apply the fitted tree to the held-out test split; the result includes a
# 'prediction' column alongside the original columns.
predict = model.transform(test)

In [60]:
# Compare the true label (Revenue) with the model's prediction for sample test rows.
predict.select(['feature','Revenue','prediction']).show()

+--------------------+-------+----------+
|             feature|Revenue|prediction|
+--------------------+-------+----------+
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|(17,[6,7,10,11,12...|      0|       0.0|
|(17,[4,7,10,11,12...|      0|       0.0|
|(17,[4,7,10,11,12...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
+--------------------+-------+----

#### 8. Classification Metrics

In [61]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test-set predictions against the Revenue label using accuracy.
# NOTE(review): accuracy alone can look good when one class dominates —
# consider also checking the class balance and a metric such as F1.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Revenue',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predict)

In [62]:
# Display the accuracy computed on the test split.
accuracy

0.9112869637650979