In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import sklearn 
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Start (or reuse, if one is already running) the Spark session used by every
# DataFrame operation in this notebook.
spark = (
    SparkSession.builder
    .appName('BreastCancer')
    .getOrCreate()
)

# **Part 1 - Data Preprocessing**

In [None]:
# Load the raw CSV: the first row supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.csv(
    '/FileStore/tables/breast_cancer_data.csv',
    header=True,
    inferSchema=True,
)

# Print the inferred schema to check column names and types before going further.
df.printSchema()

**1.1 Cleaning Null Values**

In [None]:
# Count the missing entries (NaN or null) in every column.
# when(...) without an otherwise() yields null for rows that do not match, and
# count() ignores nulls, so each aggregate is the number of missing values.
missing_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()
# Original author's observation: the _c32 column is null, so it is simply left
# out of the features.

**1.2 Creating Features Column**

In [None]:
# Column layout, per the original author's notes: column 0 is not relevant,
# column 1 is the label, columns 2 to 31 are the features, and the trailing
# column 32 (_c32) is not relevant either.
# NOTE(review): assumes df has exactly 33 columns -- confirm against printSchema().
# The slice ends at -1 so that only the trailing column is excluded; the previous
# [2:-2] also excluded column 31 and silently dropped one real feature.
feature_cols = df.columns[2:-1]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Apply the assembler to all rows, adding the combined 'features' vector column.
output = assembler.transform(df)

# Peek at the first row: 'features' is a single vector holding all combined columns.
output.select('features').head(1)

**1.3 Encoding Label**

In [None]:
# Count rows per diagnosis value to see which label categories exist and how
# balanced they are (original author's observation: there are 2 categories).
diagnosis_counts = output.groupBy('diagnosis').count()
diagnosis_counts.show()

In [None]:
# Fit a StringIndexer on the 'diagnosis' strings, then add the numeric label
# column 'diagnosis_cat' that the classifier will train on.
encoder = StringIndexer(inputCol="diagnosis", outputCol="diagnosis_cat")
encoder_model = encoder.fit(output)
encoded = encoder_model.transform(output)

# Show the first five encoded labels (original author's note: M = 1, B = 0).
encoded.select('diagnosis_cat').show(5)

**1.4 Creating Features and Label Dataframe**

In [None]:
# Keep only what Logistic Regression needs: the assembled feature vector and
# the numeric label.
model_columns = ['features', 'diagnosis_cat']
df_final = encoded.select(*model_columns)

# Preview the first three rows of the modelling DataFrame.
df_final.show(3)

**1.5 Train Test Split**

In [None]:
# Split the data roughly 80% / 20% into train and test (randomSplit assigns rows
# probabilistically, so the proportions are approximate).
# The fixed seed makes the split identical on every run; without it each
# Restart & Run All produced a different split and therefore different metrics.
train, test = df_final.randomSplit([0.8, 0.2], seed=42)

# Summary statistics for the training split.
train.describe().show()

In [None]:
# Summary statistics for the held-out test split, for comparison with the
# training split shown in the previous cell.
test_stats = test.describe()
test_stats.show()

# **Part 2 - Creating our Model**

In [None]:
# Configure the classifier: read inputs from 'features', learn the
# 'diagnosis_cat' label, write predictions to 'prediction', and allow up to
# 200 optimizer iterations.
logreg = LogisticRegression(
    maxIter=200,
    featuresCol='features',
    labelCol='diagnosis_cat',
    predictionCol='prediction',
)

# Fit on the training split only; the test split stays unseen until evaluation.
logreg_model = logreg.fit(train)

# **Part 3 - Evaluating Model**


In [None]:
# The fitted model keeps a summary of its training run; describe the
# predictions it made on the training data.
trainsummary = logreg_model.summary
train_predictions = trainsummary.predictions
train_predictions.describe().show()

In [None]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the scored rows through its .predictions DataFrame.
pred = logreg_model.evaluate(test)
test_predictions = pred.predictions
test_predictions.show(5)

In [None]:
# Fetch the true label and the prediction together in ONE collect so that the two
# lists are guaranteed to be aligned row by row. Two separate collect() calls run
# two independent Spark jobs, and Spark does not promise identical row order
# between them.
scored_rows = pred.predictions.select('diagnosis_cat', 'prediction').collect()

# Unwrap each Row into a plain scalar: sklearn expects flat sequences of labels,
# not lists of single-field Row objects.
y_true = [row['diagnosis_cat'] for row in scored_rows]
y_pred = [row['prediction'] for row in scored_rows]

# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_true, y_pred))

In [None]:
# Confusion matrix for the test split: entry [i, j] counts samples whose true
# label is i and whose predicted label is j (scikit-learn's convention).
conf_mat = confusion_matrix(y_true, y_pred)
print(conf_mat)