# PCA and Logistic Regression

In [None]:
# NOTE(review): presumably this locates the local Spark installation so that
# the `pyspark` imports in the following cells resolve -- confirm for your setup.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('PCA_sonar')
    .getOrCreate()
)

In [None]:
# Load the sonar data set. The file is read without a header row, so the
# columns get generated names (referred to below as _c0 .. _c60); column
# types are inferred from the data.
df = spark.read.csv(
    'sonar_all_data.txt',
    header=False,
    inferSchema=True,
)

In [None]:
# Print the inferred column names and types to check how the CSV was parsed.
df.printSchema()

In [None]:
# Column _c60 is used as the class label from here on; give it a readable name.
# The remaining columns _c0 .. _c59 are the features assembled further down.
df = df.withColumnRenamed("_c60","label")

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA

In [None]:
# Pack the 60 feature columns (_c0 .. _c59) into a single vector column
# named "features", which is the input format the ML stages below read.
feature_columns = []
for i in range(60):
    feature_columns.append('_c%d' % i)

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(df)

In [None]:
# Display the first assembled feature vector in full (no column truncation).
output.select("features").show(1, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                                                                                                                                                                                                   |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Scale the features if needed

In [None]:
# Standardize every feature (subtract the mean, divide by the standard
# deviation) and write the result to 'std_features'.
# NOTE(review): the scaler is fit on the full data set, before the train/test
# split further down, so test rows contribute to the scaling statistics.
standardizer = StandardScaler(
    inputCol='features',
    outputCol='std_features',
    withStd=True,
    withMean=True,
)
model = standardizer.fit(output)
output = model.transform(output)

### Encode the string label as a numeric index

In [None]:
# Map the string class label to a numeric index in 'label_idx'
# (in the output shown in this notebook, 'R' is encoded as 1.0).
indexer = StringIndexer(inputCol="label", outputCol="label_idx")
indexer_model = indexer.fit(output)
indexed = indexer_model.transform(output)

In [None]:
# Keep the standardized features plus both forms of the label.
final_data = indexed.select(
    'std_features',
    'label',
    'label_idx',
)

In [None]:
# Preview the first three rows of the prepared data.
final_data.show(3)

+--------------------+-----+---------+
|        std_features|label|label_idx|
+--------------------+-----+---------+
|[-0.3985897356694...|    R|      1.0|
|[0.70184498705605...|    R|      1.0|
|[-0.1289179854363...|    R|      1.0|
+--------------------+-----+---------+
only showing top 3 rows



### PCA

In [None]:
# Project the standardized features onto the first 15 principal components,
# written to the 'pca' column. `model` is rebound here to the fitted PCA model.
pca = PCA(
    k=15,
    inputCol="std_features",
    outputCol="pca",
)
model = pca.fit(final_data)

In [None]:
# Fraction of the total variance explained by each retained component.
model.explainedVariance

DenseVector([0.2035, 0.189, 0.0855, 0.0568, 0.0501, 0.0406, 0.0328, 0.0305, 0.0257, 0.0249, 0.0208, 0.019, 0.0175, 0.0154, 0.0143])

In [None]:
# Keep the explained-variance vector and check its type
# (the notebook output shows a pyspark.ml.linalg.DenseVector).
percent = model.explainedVariance
type(percent)

pyspark.ml.linalg.DenseVector

In [None]:
# Total fraction of variance captured by the retained components together.
percent.values.sum()

0.8261807898020073

In [None]:
# Apply the fitted PCA model; this adds the 'pca' column to the data.
transformed = model.transform(final_data)

In [None]:
# Preview the first three rows, now including the 'pca' column.
transformed.show(3)

+--------------------+-----+---------+--------------------+
|        std_features|label|label_idx|                 pca|
+--------------------+-----+---------+--------------------+
|[-0.3985897356694...|    R|      1.0|[-1.9165444107164...|
|[0.70184498705605...|    R|      1.0|[0.47896904316845...|
|[-0.1289179854363...|    R|      1.0|[-3.8499400285258...|
+--------------------+-----+---------+--------------------+
only showing top 3 rows



In [None]:
# Keep only what the classifier reads: the numeric label and the PCA features.
final_data = transformed.select("label_idx","pca")

### Logistic Regression

In [None]:
# 80/20 train/test split. A fixed seed keeps the split -- and therefore every
# metric computed from it below -- the same from one run to the next; without
# it each execution of this cell draws a different random split.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression on the PCA components, predicting 'label_idx';
# the predicted class is written to the 'prediction' column.
logistic = LogisticRegression(
    featuresCol='pca',
    labelCol='label_idx',
    predictionCol='prediction',
)

In [None]:
# Fit the logistic regression on the training split only; the fitted model
# is kept as logisticModel and applied to the test split below.
logisticModel = logistic.fit(train_data)

In [None]:
# Score the held-out test split, then tabulate how many rows fall into each
# (true label, predicted label) combination -- i.e. the confusion matrix.
test_model = logisticModel.transform(test_data)
confusion_counts = test_model.groupBy('label_idx', 'prediction').count()
confusion_counts.show()

+---------+----------+-----+
|label_idx|prediction|count|
+---------+----------+-----+
|      1.0|       1.0|   13|
|      0.0|       1.0|    5|
|      1.0|       0.0|    5|
|      0.0|       0.0|   23|
+---------+----------+-----+



In [None]:
# Calculate the elements of the confusion matrix.
# A single grouped aggregation yields all four cells at once, instead of
# running four separate filter+count jobs over the same DataFrame.
# Keys are (label_idx, prediction); both columns hold 0.0 / 1.0 here.
# row['count'] is used on purpose: attribute access (row.count) would resolve
# to the tuple method rather than the column.
_cell_counts = {
    (row['label_idx'], row['prediction']): row['count']
    for row in test_model.groupBy('label_idx', 'prediction').count().collect()
}
# A combination that never occurs is absent from the result, hence the 0 default.
TN = _cell_counts.get((0.0, 0.0), 0)
TP = _cell_counts.get((1.0, 1.0), 0)
FN = _cell_counts.get((1.0, 0.0), 0)
FP = _cell_counts.get((0.0, 1.0), 0)

In [None]:
# Calculate precision and recall.
#   precision = TP / (TP + FP)   (of the rows predicted 1, how many are 1)
#   recall    = TP / (TP + FN)   (of the rows labelled 1, how many were found)
# A denominator is zero when the model predicts no row as class 1, or when the
# test split contains no class-1 row; the metric is undefined in that case, so
# report NaN instead of raising ZeroDivisionError.
predicted_positive = TP + FP
actual_positive = TP + FN
precision = TP / predicted_positive if predicted_positive else float('nan')
recall = TP / actual_positive if actual_positive else float('nan')
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

precision = 0.72
recall    = 0.72


In [None]:
# Accuracy: correctly classified test rows divided by all test rows.
n_test_rows = test_model.count()
acc = (TP + TN) / n_test_rows
acc

0.782608695652174