In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

In [2]:
# config = SparkConf().setMaster("local").setAppName('Diabetes Data')
# spark = SparkContext(conf=config)

In [3]:
spark = SparkSession.builder.appName('Diabetes Data').getOrCreate()

In [4]:
spark

In [5]:
df = spark.read.csv("diabetes.csv", header=True)

In [8]:
df.toPandas()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.50972593,1.213191354,21,0
1,1147438,8,92,93,47,36,21.24057571,0.158364981,23,0
2,1640031,7,115,47,52,35,41.51152348,0.079018568,23,0
3,1883350,9,103,78,25,304,29.58219193,1.282869847,43,1
4,1424119,1,85,59,27,35,42.60453585,0.549541871,22,0
...,...,...,...,...,...,...,...,...,...,...
14995,1490300,10,65,60,46,177,33.51246773,0.14832658,41,1
14996,1744410,2,73,66,27,168,30.13263576,0.862252262,38,1
14997,1742742,0,93,89,43,57,18.69068305,0.427048955,24,0
14998,1099353,0,132,98,18,161,19.7916451,0.302257208,23,0


In [9]:
# How many rows we have
df.count()

15000

In [10]:
# The names of our columns
df.columns

['PatientID',
 'Pregnancies',
 'PlasmaGlucose',
 'DiastolicBloodPressure',
 'TricepsThickness',
 'SerumInsulin',
 'BMI',
 'DiabetesPedigree',
 'Age',
 'Diabetic']

In [11]:
# Basics stats from our columns
df.describe().toPandas()

Unnamed: 0,summary,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
1,mean,1502922.0028666668,3.224533333333333,107.85686666666666,71.22066666666667,28.814,137.85213333333334,31.509646041017334,0.3989677489566001,30.137733333333333,0.3333333333333333
2,stddev,289253.4434711824,3.3910202078566654,31.981974651810688,16.7587160365316,14.55571578192323,133.0682519590133,9.758999734051889,0.3779435321540775,12.089702515888606,0.4714202350607769
3,min,1000038.0,0.0,100.0,100.0,10.0,124.0,18.20051152,0.078043795,21.0,0.0
4,max,1999997.0,9.0,99.0,99.0,93.0,97.0,56.03462763,2.301594189,77.0,1.0


In [12]:
from pyspark.sql.functions import col
dataset = df.select(
    col('PatientID').cast('int'),
    col('Pregnancies').cast('int'),
    col('PlasmaGlucose').cast('int'),
    col('DiastolicBloodPressure').cast('int'),
    col('TricepsThickness').cast('int'),
    col('SerumInsulin').cast('int'),
    col('BMI').cast('float'),
    col('DiabetesPedigree').cast('float'),
    col('Age').cast('int'),
    col('Diabetic').cast('int')
)
dataset.toPandas()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509727,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511524,0.079019,23,0
3,1883350,9,103,78,25,304,29.582191,1.282870,43,1
4,1424119,1,85,59,27,35,42.604534,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...
14995,1490300,10,65,60,46,177,33.512466,0.148327,41,1
14996,1744410,2,73,66,27,168,30.132635,0.862252,38,1
14997,1742742,0,93,89,43,57,18.690683,0.427049,24,0
14998,1099353,0,132,98,18,161,19.791645,0.302257,23,0


In [13]:
from pyspark.ml.feature import VectorAssembler

# Assemble all the features with VectorAssembler
required_features = [
    'PatientID',
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age'
]

assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

In [14]:
transformed_data.toPandas()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,features
0,1354778,0,171,80,34,23,43.509727,1.213191,21,0,"[1354778.0, 0.0, 171.0, 80.0, 34.0, 23.0, 43.5..."
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0,"[1147438.0, 8.0, 92.0, 93.0, 47.0, 36.0, 21.24..."
2,1640031,7,115,47,52,35,41.511524,0.079019,23,0,"[1640031.0, 7.0, 115.0, 47.0, 52.0, 35.0, 41.5..."
3,1883350,9,103,78,25,304,29.582191,1.282870,43,1,"[1883350.0, 9.0, 103.0, 78.0, 25.0, 304.0, 29...."
4,1424119,1,85,59,27,35,42.604534,0.549542,22,0,"[1424119.0, 1.0, 85.0, 59.0, 27.0, 35.0, 42.60..."
...,...,...,...,...,...,...,...,...,...,...,...
14995,1490300,10,65,60,46,177,33.512466,0.148327,41,1,"[1490300.0, 10.0, 65.0, 60.0, 46.0, 177.0, 33...."
14996,1744410,2,73,66,27,168,30.132635,0.862252,38,1,"[1744410.0, 2.0, 73.0, 66.0, 27.0, 168.0, 30.1..."
14997,1742742,0,93,89,43,57,18.690683,0.427049,24,0,"[1742742.0, 0.0, 93.0, 89.0, 43.0, 57.0, 18.69..."
14998,1099353,0,132,98,18,161,19.791645,0.302257,23,0,"[1099353.0, 0.0, 132.0, 98.0, 18.0, 161.0, 19...."


In [15]:
rddObj = transformed_data.rdd

In [16]:
# (training_data, test_data) = transformed_data.randomSplit([0.7, 0.3])
(training_data, test_data) = rddObj.randomSplit([0.7, 0.3])

In [23]:
# from pyspark.ml import Pipeline
# from pyspark.ml.classification import RandomForestClassifier
# from pyspark.ml.feature import HashingTF, Tokenizer


# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# rf = RandomForestClassifier(labelCol='Diabetic', featuresCol='features', maxDepth=10)
# pipeline = Pipeline(stages=[rf])

# # Fit the pipeline to training documents.
# model = pipeline.fit(training_data)

# # model = rf.fit(training_data)

AttributeError: 'PipelinedRDD' object has no attribute '_jdf'

In [17]:
rddObj.collect()

[Row(PatientID=1354778, Pregnancies=0, PlasmaGlucose=171, DiastolicBloodPressure=80, TricepsThickness=34, SerumInsulin=23, BMI=43.509727478027344, DiabetesPedigree=1.2131913900375366, Age=21, Diabetic=0, features=DenseVector([1354778.0, 0.0, 171.0, 80.0, 34.0, 23.0, 43.5097, 1.2132, 21.0])),
 Row(PatientID=1147438, Pregnancies=8, PlasmaGlucose=92, DiastolicBloodPressure=93, TricepsThickness=47, SerumInsulin=36, BMI=21.240575790405273, DiabetesPedigree=0.15836498141288757, Age=23, Diabetic=0, features=DenseVector([1147438.0, 8.0, 92.0, 93.0, 47.0, 36.0, 21.2406, 0.1584, 23.0])),
 Row(PatientID=1640031, Pregnancies=7, PlasmaGlucose=115, DiastolicBloodPressure=47, TricepsThickness=52, SerumInsulin=35, BMI=41.51152420043945, DiabetesPedigree=0.07901857048273087, Age=23, Diabetic=0, features=DenseVector([1640031.0, 7.0, 115.0, 47.0, 52.0, 35.0, 41.5115, 0.079, 23.0])),
 Row(PatientID=1883350, Pregnancies=9, PlasmaGlucose=103, DiastolicBloodPressure=78, TricepsThickness=25, SerumInsulin=304,

In [63]:
rdd=spark.sparkContext.parallelize(data)

1

In [78]:
training_data

PythonRDD[149] at RDD at PythonRDD.scala:53

In [55]:
data3 = sc.textFile("README.md")

In [61]:
data3

README.md MapPartitionsRDD[147] at textFile at NativeMethodAccessorImpl.java:0

In [20]:
# for f in data3.collect():
#     print("data3 Key:"+ str(data3[0]) +", Value:"+data3[1])

In [None]:
# frddObj.take(5)

In [20]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol='Diabetic', featuresCol='features', maxDepth=10)
model = rf.fit(training_data)
predictions = model.transform(test_data)

In [21]:
sc = spark.sparkContext

In [19]:
# from pyspark.mllib.tree import RandomForest, RandomForestModel
# from time import *

# start_time = time()

# model = RandomForest.trainClassifier(
#     rddObj, 
#     numClasses=2, 
#     categoricalFeaturesInfo={},
#     seed=9,
#     numTrees=9,
#     maxDepth=10
# )

# end_time = time()
# elapsed_time = end_time - start_time
# print("Time to train model: %.3f seconds" % elapsed_time)

In [22]:
# test_data.take(2)

In [24]:
# Evaluate our model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Diabetic', predictionCol='prediction', metricName='accuracy')

In [25]:
accuracy = evaluator.evaluate(predictions)

print('Test Accuracy = ', accuracy)

Test Accuracy =  0.9192615658362989
