In [348]:
import findspark
findspark.init()
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LinearSVC
import pyspark
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### **Create Spark session**

In [349]:
spark = SparkSession.builder.master("local")\
          .appName("StructureAPI")\
          .config("spark.some.config.option", "some-value")\
          .getOrCreate()

## **Dataset1**

### **Explore Data Analysis**

In [350]:
data1 = spark.read.load("waterQuality1.csv", format="csv", header=True, delimiter=",")
data1.show(5)

+---------+-------+-------+------+-------+----------+--------+------+--------+--------+-------+-----+--------+--------+-------+-----------+------+--------+------+-------+-------+
|aluminium|ammonia|arsenic|barium|cadmium|chloramine|chromium|copper|flouride|bacteria|viruses| lead|nitrates|nitrites|mercury|perchlorate|radium|selenium|silver|uranium|is_safe|
+---------+-------+-------+------+-------+----------+--------+------+--------+--------+-------+-----+--------+--------+-------+-----------+------+--------+------+-------+-------+
|     1.65|   9.08|   0.04|  2.85|  0.007|      0.35|    0.83|  0.17|    0.05|     0.2|      0|0.054|   16.08|    1.13|  0.007|      37.75|  6.78|    0.08|  0.34|   0.02|      1|
|     2.32|  21.16|   0.01|  3.31|  0.002|      5.28|    0.68|  0.66|     0.9|    0.65|   0.65|  0.1|    2.01|    1.93|  0.003|      32.26|  3.21|    0.08|  0.27|   0.05|      1|
|     1.01|  14.02|   0.04|  0.58|  0.008|      4.24|    0.53|  0.02|    0.99|    0.05|  0.003|0.078|   1

In [351]:
data1.count()

7999

- **aluminium** - dangerous if greater than 2.8
- **ammonia** - dangerous if greater than 32.5
- **arsenic** - dangerous if greater than 0.01
- **barium** - dangerous if greater than 2
- **cadmium**- dangerous if greater than 0.005
- **chloramine** - dangerous if greater than 4
- **chromium** - dangerous if greater than 0.1
- **copper** - dangerous if greater than 1.3
- **flouride** - dangerous if greater than 1.5
- **bacteria** - dangerous if greater than 0
- **viruses** - dangerous if greater than 0
- **lead** - dangerous if greater than 0.015
- **nitrates** - dangerous if greater than 10
- **nitrites** - dangerous if greater than 1
- **mercury** - dangerous if greater than 0.002
- **perchlorate** - dangerous if greater than 56
- **radium** - dangerous if greater than 5
- **selenium** - dangerous if greater than 0.5
- **silver**- dangerous if greater than 0.1
- **uranium** - dangerous if greater than 0.3
- **is_safe** - class attribute {0 - not safe, 1 - safe}

In [352]:
data1.printSchema()


root
 |-- aluminium: string (nullable = true)
 |-- ammonia: string (nullable = true)
 |-- arsenic: string (nullable = true)
 |-- barium: string (nullable = true)
 |-- cadmium: string (nullable = true)
 |-- chloramine: string (nullable = true)
 |-- chromium: string (nullable = true)
 |-- copper: string (nullable = true)
 |-- flouride: string (nullable = true)
 |-- bacteria: string (nullable = true)
 |-- viruses: string (nullable = true)
 |-- lead: string (nullable = true)
 |-- nitrates: string (nullable = true)
 |-- nitrites: string (nullable = true)
 |-- mercury: string (nullable = true)
 |-- perchlorate: string (nullable = true)
 |-- radium: string (nullable = true)
 |-- selenium: string (nullable = true)
 |-- silver: string (nullable = true)
 |-- uranium: string (nullable = true)
 |-- is_safe: string (nullable = true)



Tiếp theo xét xem có dòng nào bị thiếu giá trị hay không.

In [353]:
data1.select([f.count(f.when(f.col(c).contains('None') | \
                            f.col(c).contains('NULL') | \
                            (f.col(c) == '' ) | \
                            (f.col(c) == '#NUM!' ) | \
                            f.col(c).isNull() | \
                            f.isnan(c), c 
                           )).alias(c)
                    for c in data1.columns]).show()

+---------+-------+-------+------+-------+----------+--------+------+--------+--------+-------+----+--------+--------+-------+-----------+------+--------+------+-------+-------+
|aluminium|ammonia|arsenic|barium|cadmium|chloramine|chromium|copper|flouride|bacteria|viruses|lead|nitrates|nitrites|mercury|perchlorate|radium|selenium|silver|uranium|is_safe|
+---------+-------+-------+------+-------+----------+--------+------+--------+--------+-------+----+--------+--------+-------+-----------+------+--------+------+-------+-------+
|        0|      3|      0|     0|      0|         0|       0|     0|       0|       0|      0|   0|       0|       0|      0|          0|     0|       0|     0|      0|      3|
+---------+-------+-------+------+-------+----------+--------+------+--------+--------+-------+----+--------+--------+-------+-----------+------+--------+------+-------+-------+



Do số lượng dữ liệu bị missing không đáng kể nên ta sẽ remove chúng.

In [354]:
data1 = data1.where(data1.ammonia != '#NUM!').where(data1.is_safe!='#NUM!')

Đổi kiểu dữ liệu của các cột

In [355]:
for item in data1.columns:
    if item != 'is_safe':
        data1 = data1.withColumn(f'{item}', data1[item].cast('double'))
    else:
        data1 = data1.withColumn(f'{item}', data1[item].cast('int'))


In [356]:
data1.printSchema()

root
 |-- aluminium: double (nullable = true)
 |-- ammonia: double (nullable = true)
 |-- arsenic: double (nullable = true)
 |-- barium: double (nullable = true)
 |-- cadmium: double (nullable = true)
 |-- chloramine: double (nullable = true)
 |-- chromium: double (nullable = true)
 |-- copper: double (nullable = true)
 |-- flouride: double (nullable = true)
 |-- bacteria: double (nullable = true)
 |-- viruses: double (nullable = true)
 |-- lead: double (nullable = true)
 |-- nitrates: double (nullable = true)
 |-- nitrites: double (nullable = true)
 |-- mercury: double (nullable = true)
 |-- perchlorate: double (nullable = true)
 |-- radium: double (nullable = true)
 |-- selenium: double (nullable = true)
 |-- silver: double (nullable = true)
 |-- uranium: double (nullable = true)
 |-- is_safe: integer (nullable = true)



In [357]:
for item in data1.columns:
    data1.select(item).describe().show()

+-------+------------------+
|summary|         aluminium|
+-------+------------------+
|  count|              7996|
|   mean|0.6663956978489863|
| stddev|1.2653230979043648|
|    min|               0.0|
|    max|              5.05|
+-------+------------------+

+-------+------------------+
|summary|           ammonia|
+-------+------------------+
|  count|              7996|
|   mean|14.278211605802943|
| stddev|  8.87893018212118|
|    min|             -0.08|
|    max|             29.84|
+-------+------------------+

+-------+-------------------+
|summary|            arsenic|
+-------+-------------------+
|  count|               7996|
|   mean|0.16147698849422124|
| stddev|0.25263248322013493|
|    min|                0.0|
|    max|               1.05|
+-------+-------------------+

+-------+------------------+
|summary|            barium|
+-------+------------------+
|  count|              7996|
|   mean|1.5679277138569263|
| stddev|1.2162273125632017|
|    min|               0.0|
| 

 Phân bố các giá trị của label

In [358]:
data1.groupBy('is_safe').count().show()

+-------+-----+
|is_safe|count|
+-------+-----+
|      1|  912|
|      0| 7084|
+-------+-----+



### **Classification algorithms**

tạo cột feature bằng cách gom các giá trị đặc điểm thành 1 vector

In [359]:
assem = VectorAssembler(inputCols=data1.columns[:-1],outputCol='features',handleInvalid='keep')

train-test split

In [360]:
(trainingData1, testData1) = data1.randomSplit([0.8, 0.2], 134)

#### Logistic regression

In [361]:
log_reg1 = LogisticRegression(featuresCol='features', labelCol='is_safe')

In [362]:
log_pipeline1 = Pipeline(stages=[assem, log_reg1])
log_model1 = log_pipeline1.fit(trainingData1)

prediction

In [363]:
predictions = log_model1.transform(testData1)
predictions.select("prediction", "is_safe", "features").show(5)

+----------+-------+--------------------+
|prediction|is_safe|            features|
+----------+-------+--------------------+
|       0.0|      0|[0.0,0.75,0.07,0....|
|       0.0|      0|[0.0,1.29,0.07,0....|
|       0.0|      0|[0.0,1.42,0.02,0....|
|       0.0|      0|[0.0,2.09,0.09,2....|
|       0.0|      0|[0.0,2.22,0.03,0....|
+----------+-------+--------------------+
only showing top 5 rows



In [364]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="is_safe", predictionCol="prediction", metricName="accuracy")

In [365]:
accuracy = evaluator.evaluate(predictions)
print("Logistic Regression - Test Accuracy = %g" % (accuracy))
print("Logistic Regression - Test Error = %g" % (1.0 - accuracy))

Logistic Regression - Test Accuracy = 0.902251
Logistic Regression - Test Error = 0.0977492


In [366]:
y_true = predictions.select('is_safe').rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select('prediction').rdd.flatMap(lambda x: x).collect()
confusionmatrix = confusion_matrix(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')

In [367]:
print("The Confusion Matrix for Logistic Regression is :\n" + str(confusionmatrix))

print("The precision score for Logistic Regression is: " + str(precision))

print("The recall score for Decision Tree Model is: " + str(recall))

The Confusion Matrix for Logistic Regression is :
[[1351   26]
 [ 126   52]]
The precision score for Logistic Regression is: 0.9022508038585209
The recall score for Decision Tree Model is: 0.9022508038585209


#### Linear Support Vector Machine


In [368]:
svc = LinearSVC(maxIter=10, regParam=0.1, labelCol='is_safe', featuresCol='features')

In [369]:
svc_pipeline1 = Pipeline(stages=[assem, svc])
svc_model1 = svc_pipeline1.fit(trainingData1)

In [370]:
predictions = svc_model1.transform(testData1)
predictions.select("prediction", "is_safe", "features").show(5)

+----------+-------+--------------------+
|prediction|is_safe|            features|
+----------+-------+--------------------+
|       0.0|      0|[0.0,0.75,0.07,0....|
|       0.0|      0|[0.0,1.29,0.07,0....|
|       0.0|      0|[0.0,1.42,0.02,0....|
|       0.0|      0|[0.0,2.09,0.09,2....|
|       0.0|      0|[0.0,2.22,0.03,0....|
+----------+-------+--------------------+
only showing top 5 rows



In [371]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="is_safe", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("LinearSVC - Test Accuracy = %g" % (accuracy))
print("LinearSVC - Test Error = %g" % (1.0 - accuracy))

LinearSVC - Test Accuracy = 0.885531
LinearSVC - Test Error = 0.114469


In [372]:
y_true = predictions.select('is_safe').rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select('prediction').rdd.flatMap(lambda x: x).collect()
confusionmatrix = confusion_matrix(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')

In [373]:
print("The Confusion Matrix for LinearSVC is :\n" + str(confusionmatrix))

print("The precision score for LinearSVC is: " + str(precision))

print("The recall score for LinearSVC is: " + str(recall))

The Confusion Matrix for LinearSVC is :
[[1377    0]
 [ 178    0]]
The precision score for LinearSVC is: 0.8855305466237942
The recall score for LinearSVC is: 0.8855305466237942


## **DataSet2**

### **Explore Data Analysis**

In [374]:
data2 = spark.read.load("gender_classification.csv", format="csv", header=True, delimiter=",")
data2.show(5)

+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+
|long_hair|forehead_width_cm|forehead_height_cm|nose_wide|nose_long|lips_thin|distance_nose_to_lip_long|gender|
+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+
|        1|             11.8|               6.1|        1|        0|        1|                        1|  Male|
|        0|               14|               5.4|        0|        0|        1|                        0|Female|
|        0|             11.8|               6.3|        1|        1|        1|                        1|  Male|
|        0|             14.4|               6.1|        0|        1|        1|                        1|  Male|
|        1|             13.5|               5.9|        0|        0|        0|                        0|Female|
+---------+-----------------+------------------+---------+---------+---------+-------------------------+

In [375]:
data2.count()

5001

- longhair - This column contains 0's and 1's where 1 is "long hair" and 0 is "not long hair".
- foreheadwidthcm - This column is in CM's. This is the width of the forehead.
- foreheadheightcm - This is the height of the forehead and it's in Cm's.
- nosewide - This column contains 0's and 1's where 1 is "wide nose" and 0 is "not wide nose".
- noselong - This column contains 0's and 1's where 1 is "Long nose" and 0 is "not long nose".
- lipsthin - This column contains 0's and 1's where 1 represents the "thin lips" while 0 is "Not thin lips".
- distancenosetoliplong - This column contains 0's and 1's where 1 represents the "long distance between nose and lips" while 0 is "short distance between nose and lips".
- gender - This is either "Male" or "Female".

In [376]:
data2.printSchema()


root
 |-- long_hair: string (nullable = true)
 |-- forehead_width_cm: string (nullable = true)
 |-- forehead_height_cm: string (nullable = true)
 |-- nose_wide: string (nullable = true)
 |-- nose_long: string (nullable = true)
 |-- lips_thin: string (nullable = true)
 |-- distance_nose_to_lip_long: string (nullable = true)
 |-- gender: string (nullable = true)



Tiếp theo xét xem có dòng nào bị thiếu giá trị hay không.

In [377]:
data2.select([f.count(f.when(f.col(c).contains('None') | \
                            f.col(c).contains('NULL') | \
                            (f.col(c) == '' ) | \
                            (f.col(c) == '#NUM!' ) | \
                            f.col(c).isNull() | \
                            f.isnan(c), c 
                           )).alias(c)
                    for c in data2.columns]).show()

+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+
|long_hair|forehead_width_cm|forehead_height_cm|nose_wide|nose_long|lips_thin|distance_nose_to_lip_long|gender|
+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+
|        0|                0|                 0|        0|        0|        0|                        0|     0|
+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+



Đổi kiểu dữ liệu của các cột

In [378]:
for item in data2.columns:
    if item != 'gender':
        data2 = data2.withColumn(f'{item}', data2[item].cast('double'))


In [379]:
data2.printSchema()

root
 |-- long_hair: double (nullable = true)
 |-- forehead_width_cm: double (nullable = true)
 |-- forehead_height_cm: double (nullable = true)
 |-- nose_wide: double (nullable = true)
 |-- nose_long: double (nullable = true)
 |-- lips_thin: double (nullable = true)
 |-- distance_nose_to_lip_long: double (nullable = true)
 |-- gender: string (nullable = true)



In [380]:
for item in data2.columns:
    data2.select(item).describe().show()

+-------+------------------+
|summary|         long_hair|
+-------+------------------+
|  count|              5001|
|   mean| 0.869626074785043|
| stddev|0.3367480365970515|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+

+-------+------------------+
|summary| forehead_width_cm|
+-------+------------------+
|  count|              5001|
|   mean|13.181483703259353|
| stddev| 1.107128302494507|
|    min|              11.4|
|    max|              15.5|
+-------+------------------+

+-------+------------------+
|summary|forehead_height_cm|
+-------+------------------+
|  count|              5001|
|   mean| 5.946310737852416|
| stddev|0.5412678510891585|
|    min|               5.1|
|    max|               7.1|
+-------+------------------+

+-------+------------------+
|summary|         nose_wide|
+-------+------------------+
|  count|              5001|
|   mean|0.4939012197560488|
| stddev|0.5000127972767414|
|    min|               0.0|
|    max|  

 Phân bố các giá trị của label

In [381]:
data2.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 2501|
|  Male| 2500|
+------+-----+



### **Classification algorithms**

tạo cột feature bằng cách gom các giá trị đặc điểm thành 1 vector

In [382]:
assem = VectorAssembler(inputCols=data2.columns[:-1],outputCol='features',handleInvalid='keep')
data2 = assem.transform(data2)

Do cột gender ở đây là dạng string nên ta sẽ số hóa chúng.

In [383]:
labelIndexer = StringIndexer(inputCol="gender", outputCol="label").fit(data2)
data2 = labelIndexer.transform(data2)

train-test split

In [384]:
(trainingData2, testData2) = data2.randomSplit([0.8, 0.2], 124)

#### Logistic regression

In [385]:
log_reg2 = LogisticRegression(featuresCol='features', labelCol='label')

In [386]:
log_pipeline2 = Pipeline(stages=[log_reg2])
log_model2 = log_pipeline2.fit(trainingData2)

prediction

In [387]:
predictions = log_model2.transform(testData2)
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(7,[1,2],[11.4,5.4])|
|       0.0|  0.0|(7,[1,2,4],[11.4,...|
|       1.0|  1.0|[0.0,11.5,5.3,1.0...|
|       0.0|  0.0|(7,[1,2,6],[11.5,...|
|       0.0|  0.0|(7,[1,2],[11.5,6.2])|
+----------+-----+--------------------+
only showing top 5 rows



In [388]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

In [389]:
accuracy = evaluator.evaluate(predictions)
print("Logistic Regression - Test Accuracy = %g" % (accuracy))
print("Logistic Regression - Test Error = %g" % (1.0 - accuracy))

Logistic Regression - Test Accuracy = 0.971292
Logistic Regression - Test Error = 0.0287081


In [390]:
y_true = predictions.select('label').rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select('prediction').rdd.flatMap(lambda x: x).collect()
confusionmatrix = confusion_matrix(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')

In [391]:
print("The Confusion Matrix for Logistic Regression is :\n" + str(confusionmatrix))

print("The precision score for Logistic Regression is: " + str(precision))

print("The recall score for Decision Tree Model is: " + str(recall))

The Confusion Matrix for Logistic Regression is :
[[521  14]
 [ 16 494]]
The precision score for Logistic Regression is: 0.9712918660287081
The recall score for Decision Tree Model is: 0.9712918660287081


#### Linear Support Vector Machine


In [392]:
svc = LinearSVC(maxIter=10, regParam=0.1, labelCol='label', featuresCol='features')

In [393]:
svc_pipeline2 = Pipeline(stages=[ svc])
svc_model2 = svc_pipeline2.fit(trainingData2)

In [394]:
predictions = svc_model2.transform(testData2)
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(7,[1,2],[11.4,5.4])|
|       0.0|  0.0|(7,[1,2,4],[11.4,...|
|       1.0|  1.0|[0.0,11.5,5.3,1.0...|
|       0.0|  0.0|(7,[1,2,6],[11.5,...|
|       0.0|  0.0|(7,[1,2],[11.5,6.2])|
+----------+-----+--------------------+
only showing top 5 rows



In [395]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("LinearSVC - Test Accuracy = %g" % (accuracy))
print("LinearSVC - Test Error = %g" % (1.0 - accuracy))

LinearSVC - Test Accuracy = 0.959809
LinearSVC - Test Error = 0.0401914


In [396]:
y_true = predictions.select('label').rdd.flatMap(lambda x: x).collect()
y_pred = predictions.select('prediction').rdd.flatMap(lambda x: x).collect()
confusionmatrix = confusion_matrix(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')

In [397]:
print("The Confusion Matrix for LinearSVC is :\n" + str(confusionmatrix))

print("The precision score for LinearSVC is: " + str(precision))

print("The recall score for LinearSVC is: " + str(recall))

The Confusion Matrix for LinearSVC is :
[[517  18]
 [ 24 486]]
The precision score for LinearSVC is: 0.9598086124401913
The recall score for LinearSVC is: 0.9598086124401913
