In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=9d86f9d857d706e3fdfb8c8f0fc1eb7c405bb5abd3b20dc0e615f2bbf0042739
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer,VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Read dataset

In [None]:
spark = SparkSession.builder.appName('dt').getOrCreate()
spark
df = spark.read.csv('/content/covtype.csv',inferSchema=True,header=True)
#df.show(5)
colnames = ["Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology",\
           "Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_noon","Hillshade_3pm",\
           "Horizontal_Distance_To_Fire_Points"] + \
           [f"Wilderness_Area_{i}" for i in range(4)] + [f"Soil_Type_{i}" for i in range(40)] + ["Cover_Type"]
#df = df.toDF(*colnames)
df.show(5)
df.printSchema()

Display stats

In [None]:
df.describe().show()

Null Values and Imputation

In [None]:
#colnames
#null_cond = col(colnames[0]).isNull()
#for col_name in colnames[1:]:
#    null_cond = null_cond | col(col_name).isNull()
#filtered_df = df.filter(null_cond)
#filtered_df.show()
filtered_df=df.na.drop()
filtered_df.show()
num_features = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points'
]
cat_features = [
    'Wilderness_Area1','Wilderness_Area2','Wilderness_Area3','Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6',
    'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13',
    'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
    'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',
    'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
    'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40' , 'Cover_Type'
]

num_imputer = Imputer(inputCols=num_features,outputCols=[f'{col_name}_imputed' for col_name in num_features])
num_imputed = num_imputer.fit(df)
df_num_imp = num_imputed.transform(df)
cat_imputer = Imputer(inputCols=cat_features,outputCols=[f"{col_name}_imputed" for col_name in cat_features])
cat_imputed = cat_imputer.fit(df_num_imp)
df_imp = cat_imputed.transform(df_num_imp)
print(f"imputed categorical and numerical features")
df_imp.show(5)


Imputation

In [None]:
train,test = df.randomSplit([0.9,0.1])
#assembler = VectorAssembler(inputCols=df.col)
col_names = df.columns
col_names = col_names[:-1]
print(col_names)
assembler = VectorAssembler(inputCols=col_names,outputCol='features')
train = assembler.transform(df)
test = assembler.transform(df)
train.show(3)
test.show(3)



In [None]:
dt_model = DecisionTreeClassifier(labelCol='Cover_Type',featuresCol='features')
dt = dt_model.fit(train)
print(dt.toDebugString)

In [None]:
preds = dt.transform(test)
preds.show(5)

In [17]:
evaluator_acc = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Cover_Type',metricName='accuracy')
accuracy = evaluator_acc.evaluate(preds)
print(f"Accuracy: {accuracy}")
evaluator_prec = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Cover_Type',metricName='weightedPrecision')
precision = evaluator_prec.evaluate(preds)
print(f"Precision: {precision}")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(preds)
print(f"Recall: {recall}")

Accuracy: 0.7845490779759758
Precision: 0.7859717675565264
Recall: 0.7845490779759757
