In [0]:
# Location and format of the source file.
file_location = "/FileStore/BRCA.csv"
file_type = "csv"

# CSV reader settings: infer column types, treat the first row as the
# header, and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Read the file into a Spark DataFrame, handing all reader options over at once.
df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df)

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Report the dataset shape as (row count, column count).
shape = (df.count(), len(df.columns))
print(shape)

In [0]:
# Spark does in-memory computation: caching keeps the data across the cluster
# in RAM so that later processing is faster.
df.cache()

In [0]:
# Percentage of rows whose Tumour_Stage is null (about 2% of this dataset).
null_stage_rows = df.filter(df["Tumour_Stage"].isNull()).count()
total_rows = df.count()
null_stage_rows / total_rows * 100

In [0]:
# Print the DataFrame's column names and data types.
df.printSchema()

In [0]:
#To work over columns i.e. to add or drop the columns


In [0]:
# Preview the first rows of the DataFrame.
df.show()

In [0]:
# Handle missing values in the numeric 'Age' column by mean imputation.
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCol='Age',outputCol='Imputed_Age').setStrategy('mean')
df_imputed = imputer.fit(df).transform(df)
df_imputed.show()
# Keep the result: previously the imputed frame was only displayed and then
# discarded, so 'Age' still contained its nulls in every later cell. Write the
# imputed values back into 'Age' (same column position) and drop the helper
# column so the rest of the notebook keeps using the name 'Age'.
df = df_imputed.withColumn('Age', df_imputed['Imputed_Age']).drop('Imputed_Age')
df.select('Age').show()

In [0]:
# Replace the spaces in the status column names with underscores.
status_column_renames = {
    'ER status': 'ER_status',
    'PR status': 'PR_status',
    'HER2 status': 'HER2_status',
}
for old_name, new_name in status_column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

In [0]:

# Handle missing values in the categorical columns.
#
# Author's notes:
# 1. Gender null values should be replaced with Female.
# 2. Histology null values should be replaced with Infiltrating Ductal Carcinoma.
# 3. ER and PR status are all Positive. Every person who is suffering from
#    cancer has these tests positive; doctors perform these tests during the
#    initial stage.
# 4. The HER2 test is highly important: it is conducted to check whether drugs
#    will reduce the size of the tumor. (Negative): drugs can't help to reduce
#    the breast cancer. (Positive): drugs can have an effect. Hence the
#    categorical values in this column are important to handle.
# 5. In HER2, 2% of the values are null and have to be filled carefully.
# 6. Mostly tumor stage 2 and 3 patients had a negative HER2 test
#    (Tumor ~ HER2 status).
# The notes propose a KNN imputer because HER2 is hard to fill; the code
# below fills HER2_status with the constant 'Negative' instead.

# (column, replacement for nulls) pairs, applied one column at a time.
categorical_defaults = [
    ('Gender', 'FEMALE'),
    ('Histology', 'Infiltrating Ductal Carcinoma'),
    ('ER_status', 'Positive'),
    ('PR_status', 'Positive'),
    ('HER2_status', 'Negative'),
    ('Surgery_type', 'Modified Radical Mastectomy'),
]
for column_name, default_value in categorical_defaults:
    df = df.na.fill(default_value, subset=[column_name])

In [0]:
# Show the number of rows per category for each categorical column.
categorical_columns = [
    'Gender',
    'Tumour_Stage',
    'Histology',
    'ER_status',
    'PR_status',
    'HER2_status',
    'Surgery_type',
]
for column_name in categorical_columns:
    df.groupby(column_name).count().show()

In [0]:
# Replace null Patient_Status values with 'Alive'.
df = df.fillna('Alive', subset=['Patient_Status'])

In [0]:
from pyspark.sql.functions import regexp_replace

In [0]:
# Replace null Tumour_Stage values with 'III'.
df = df.fillna('III', subset=['Tumour_Stage'])

In [0]:
# Categorical column value handling.
#
# 1. The Gender column has just two nominal categories, hence StringIndexer
#    is used for label encoding.
# 2. Except for Tumour_Stage, the other columns are nominal categories, so
#    they can be handled with StringIndexer().
from pyspark.ml.feature import StringIndexer

# Maps each source column to the name of its indexed output column.
indexed_column_names = {
    'Tumour_Stage': 'impt_Tumour_stage',
    'Gender': 'impt_Gender',
    'Histology': 'impt_Histology',
    'ER_status': 'impt_ER_status',
    'PR_status': 'impt_PR_status',
    'HER2_status': 'impt_HER2_status',
    'Surgery_type': 'impt_Surgery_typetype',
    'Patient_Status': 'imptPatient_Status',
}
indexer = StringIndexer(
    inputCols=list(indexed_column_names.keys()),
    outputCols=list(indexed_column_names.values()),
)

In [0]:
# Fit the indexer on df, then use the fitted model to add the indexed columns.
indexer_model = indexer.fit(df)
n_indexer = indexer_model.transform(df)

In [0]:
# Drop the raw Patient_Status column; its indexed form (imptPatient_Status) remains.
n_indexer = n_indexer.drop("Patient_Status")

In [0]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the indexed categorical columns. The two lists are parallel:
# the i-th indexed column is encoded into the i-th output column.
onehot_input_columns = [
    'impt_Tumour_stage',
    'impt_Gender',
    'impt_Histology',
    'impt_HER2_status',
    'impt_Surgery_typetype',
]
onehot_output_columns = [
    'encoded_Tumour_stage',
    'encoded_Gender',
    'encoded_Histology',
    'encoded_HER2_status',
    'encoded_Surgery_type',
]
encoder = OneHotEncoder(inputCols=onehot_input_columns, outputCols=onehot_output_columns)
encoder_model = encoder.fit(n_indexer)
df_onehot = encoder_model.transform(n_indexer)

In [0]:
# Inspect the schema after one-hot encoding.
df_onehot.printSchema()

In [0]:
# Remove the intermediate index columns that have been one-hot encoded.
indexed_columns_to_drop = [
    'impt_Tumour_stage',
    'impt_Gender',
    'impt_Histology',
    'impt_HER2_status',
    'impt_Surgery_typetype',
]
df_onehot = df_onehot.drop(*indexed_columns_to_drop)

In [0]:
# Preview the indexed patient-status column.
df_onehot.select("imptPatient_Status").show()

In [0]:
# Copy the indexed patient status into a column named "Label" (the model target).
df_onehot = df_onehot.withColumn("Label", df_onehot['imptPatient_Status'])

In [0]:
# Replace nulls in imptPatient_Status with 1.0.
# NOTE(review): "Label" was already copied from this column in the preceding
# cell, so this fill does not change "Label" - confirm whether "Label" was
# meant to be filled instead.
df_onehot = df_onehot.na.fill(1.0,subset=['imptPatient_Status'])

In [0]:
# Drop the raw status and surgery-type columns.
raw_columns_to_drop = ['ER_status', 'PR_status', 'HER2_status', 'Surgery_type']
df_onehot = df_onehot.drop(*raw_columns_to_drop)

In [0]:
# Drop the raw Tumour_Stage column.
df_onehot = df_onehot.drop('Tumour_Stage')

In [0]:
# Drop the identifier, date, and raw categorical columns that are not used as features.
unused_columns = [
    'Patient_ID',
    'Gender',
    'Histology',
    'Date_of_Surgery',
    'Date_of_Last_Visit',
]
df_onehot = df_onehot.drop(*unused_columns)

In [0]:
# Before model building we perform a train/test split. Spark ML estimators
# read their inputs from a single vector column, so the independent features
# are assembled into 'vector_of_features' here; the dependent variable
# ("Label") stays in its own column.

from pyspark.ml.feature import VectorAssembler

# Independent features only. "Label" must NOT be listed here: putting the
# target inside the feature vector leaks the answer to the model, so any
# accuracy measured afterwards would be meaningless.
feature_columns = [
    'Age',
    'Protein1',
    'Protein2',
    'Protein3',
    'Protein4',
    'impt_ER_status',
    'impt_PR_status',
    'encoded_Tumour_stage',
    'encoded_Gender',
    'encoded_Histology',
    'encoded_HER2_status',
    'encoded_Surgery_type',
]
# NOTE(review): VectorAssembler's default handleInvalid='error' raises on null
# inputs - confirm the numeric feature columns contain no nulls at this point.
feature_ass = VectorAssembler(inputCols=feature_columns, outputCol='vector_of_features')
final_dataset = feature_ass.transform(df_onehot)
final_dataset.show()

In [0]:
# Display the DataFrame object's representation as the cell output.
df

In [0]:
# Keep only the two columns the model needs: the assembled feature vector
# and the label.
final_output = final_dataset.select("vector_of_features","Label")

In [0]:
# Split the dataset 80/20 into training and testing sets; the fixed seed keeps
# the split reproducible.
train_data, test_data = final_output.randomSplit(weights=[0.80, 0.20], seed=13)

In [0]:

# With the final dataset ready, it is time to apply an ML algorithm.
"""
1. Logistic Regression: from pyspark.ml.classification import LogisticRegression
"""

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Logistic regression estimator: features are read from 'vector_of_features'
# and the target from 'Label'.
lr = LogisticRegression(labelCol = "Label",featuresCol="vector_of_features")

In [0]:
# Note: a "user defined function" error sometimes occurs here; it can be resolved by removing NULL values from the table.

In [0]:
# Fit the logistic regression model on the training split.
lr_model = lr.fit(train_data)