In [1]:
from pyspark import SparkContext

In [2]:
sc = SparkContext(master='local[12]')

In [3]:
#spark UI
sc

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('MLwithSpark').getOrCreate()

In [6]:
spark

#### Below is the forkflow of ML project: 
    1. Data Preparation 
    2. Data Engineering 
    3. Build Model 
    4. Model Evaluation 
    

Download and read csv file with source: 
https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv 

This is classification data. 

#### Attribute Information:
    
    All attributes except Category and Sex are numerical. The laboratory data are the attributes 5-14. 
    1) X (Patient ID/No.)
    2) Category (diagnosis) (values: '0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis')
    3) Age (in years)
    4) Sex (f,m)
    5) ALB
    6) ALP
    7) ALT
    8) AST
    9) BIL
    10) CHE
    11) CHOL
    12) CREA
    13) GGT
    14) PROT

    The target attribute for classification is Category (blood donors vs. Hepatitis C (including its progress ('just' Hepatitis C, Fibrosis, Cirrhosis).

In [9]:
# read csv in spark dataframe 
df = spark.read.csv('hcvdat0.csv', header=True, inferSchema=True)

In [10]:
df.show()

+---+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|
|  6|0=Blood Donor| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|  74|
|  7|0=Blood Donor| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79| 70.0|16.9|74.5|
|  8|0=Blood Donor| 32|  m|42.2|41.9|35.8|31.1|16.1| 5.82| 4.6|109.0|21.5|67.1|
|  9|0=Blood Donor| 32|  m|50.9|65.5|23.2|21.2| 6.9| 8.69| 4.1| 83.0|13.7|71.3|
| 10|0=Blood Donor| 32|  m|42.4|86.3|20.

In [14]:
#print columns
print(df.columns)

['_c0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']


In [16]:
#The first column is not useful and we can drop it
d = df.drop('_c0')

In [17]:
d.show()

+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|
+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|
|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|
|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|
|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|
|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|
|0=Blood Donor| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|  74|
|0=Blood Donor| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79| 70.0|16.9|74.5|
|0=Blood Donor| 32|  m|42.2|41.9|35.8|31.1|16.1| 5.82| 4.6|109.0|21.5|67.1|
|0=Blood Donor| 32|  m|50.9|65.5|23.2|21.2| 6.9| 8.69| 4.1| 83.0|13.7|71.3|
|0=Blood Donor| 32|  m|42.4|86.3|20.3|20.0|35.2| 5.46|4.45| 81.0|15.9|69.9|
|0=Blood Don

In [20]:
#check datatype of each column 
d.dtypes

[('Category', 'string'),
 ('Age', 'int'),
 ('Sex', 'string'),
 ('ALB', 'string'),
 ('ALP', 'string'),
 ('ALT', 'string'),
 ('AST', 'double'),
 ('BIL', 'double'),
 ('CHE', 'double'),
 ('CHOL', 'string'),
 ('CREA', 'double'),
 ('GGT', 'double'),
 ('PROT', 'string')]

In [21]:
#d = d['Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CREA', 'GGT', 'Category']
d.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ALB: string (nullable = true)
 |-- ALP: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: string (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: string (nullable = true)



In [27]:
#see the Category column values for each category 
d.groupBy('Category').count().show()

+--------------------+-----+
|            Category|count|
+--------------------+-----+
|       0=Blood Donor|  533|
|         3=Cirrhosis|   30|
|          2=Fibrosis|   21|
|0s=suspect Blood ...|    7|
|         1=Hepatitis|   24|
+--------------------+-----+



We can see that above data is imbalanced 

### Feature Engineering: 
    Numerical values
    Vectorization 
    Scaling

In [29]:
import pyspark.ml

In [30]:
dir(pyspark.ml)

['Estimator',
 'Model',
 'Pipeline',
 'PipelineModel',
 'PredictionModel',
 'Predictor',
 'Transformer',
 'UnaryTransformer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'base',
 'classification',
 'clustering',
 'common',
 'evaluation',
 'feature',
 'fpm',
 'image',
 'linalg',
 'param',
 'pipeline',
 'recommendation',
 'regression',
 'stat',
 'tree',
 'tuning',
 'util',
 'wrapper']

In [31]:
from pyspark.ml.feature import VectorAssembler, StringIndexer


In [32]:
# we will do label encoding for columns: Sex, Category
genderEncoder = StringIndexer(inputCol='Sex', outputCol='Gender').fit(d)

In [33]:
d = genderEncoder.transform(d)

In [34]:
d.show()

+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+------+
|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|Gender|
+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+------+
|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|   0.0|
|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|
|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|
|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|   0.0|
|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|   0.0|
|0=Blood Donor| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|  74|   0.0|
|0=Blood Donor| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79| 70.0|16.9|74.5|   0.0|
|0=Blood Donor| 32|  m|42.2|41.9|35.8|31.1|16.1| 5.82| 4.6|109.0|21.5|67.1|   0.0|
|0=Blood Donor| 32|  m|50.9|65.5|23.2|21.2| 6.9| 8.69| 4.1| 83.0|13.7|71.3|   0.0|
|0=B

In [35]:
categoryEncoder = StringIndexer(inputCol='Category', outputCol='Target').fit(d)
d = categoryEncoder.transform(d)
d.show()

+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+------+------+
|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|Gender|Target|
+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+------+------+
|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|   0.0|   0.0|
|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|   0.0|
|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|   0.0|
|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|   0.0|   0.0|
|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|   0.0|   0.0|
|0=Blood Donor| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|  74|   0.0|   0.0|
|0=Blood Donor| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79| 70.0|16.9|74.5|   0.0|   0.0|
|0=Blood Donor| 32|  m|42.2|41.9|35.8|31.1|16.1| 5.82| 4.6|109.0|21.5|67.1|   0.0|   0.0|
|0=Blood D

In [43]:
#to get back the origional column from the label encoded column 
from pyspark.ml.feature import IndexToString
idexToString = IndexToString(inputCol = 'Target', outputCol = 'Cat')
indexToString.transform(d).show()

+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+------+------+-------------+
|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|Gender|Target|          Cat|
+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+------+------+-------------+
|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|   0.0|   0.0|0=Blood Donor|
|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|   0.0|0=Blood Donor|
|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|   0.0|0=Blood Donor|
|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|   0.0|   0.0|0=Blood Donor|
|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|   0.0|   0.0|0=Blood Donor|
|0=Blood Donor| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|  74|   0.0|   0.0|0=Blood Donor|
|0=Blood Donor| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79| 70.0

In [45]:
#Feature selection 
d.show(3)

+-------------+---+---+----+----+----+----+---+-----+----+-----+----+----+------+------+
|     Category|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|Gender|Target|
+-------------+---+---+----+----+----+----+---+-----+----+-----+----+----+------+------+
|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|  69|   0.0|   0.0|
|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|   0.0|
|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|   0.0|
+-------------+---+---+----+----+----+----+---+-----+----+-----+----+----+------+------+
only showing top 3 rows



In [47]:
print(d.columns)

['Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Gender', 'Target']


In [49]:
d1 = d['Age', 'Gender', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Target']

In [51]:
d1.show(3)

+---+------+----+----+----+----+---+-----+----+-----+----+----+------+
|Age|Gender| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|Target|
+---+------+----+----+----+----+---+-----+----+-----+----+----+------+
| 32|   0.0|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|  69|   0.0|
| 32|   0.0|38.5|70.3|  18|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|
| 32|   0.0|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|
+---+------+----+----+----+----+---+-----+----+-----+----+----+------+
only showing top 3 rows



In [52]:
d1.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: double (nullable = false)
 |-- ALB: string (nullable = true)
 |-- ALP: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: string (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: string (nullable = true)
 |-- Target: double (nullable = false)



In [107]:
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
d1.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in d1.columns]).show()

+---+------+---+---+---+---+---+---+----+----+---+----+------+
|Age|Gender|ALB|ALP|ALT|AST|BIL|CHE|CHOL|CREA|GGT|PROT|Target|
+---+------+---+---+---+---+---+---+----+----+---+----+------+
|  0|     0|  1| 18|  1|  0|  0|  0|  10|   0|  0|   1|     0|
+---+------+---+---+---+---+---+---+----+----+---+----+------+



In [108]:
d1 = d1.na.drop('any')

In [109]:
d1=d1.withColumn("ALB",d1.ALB.cast('double'))
d1=d1.withColumn("ALP",d1.ALP.cast('double'))
d1=d1.withColumn("ALT",d1.ALT.cast('double'))
d1=d1.withColumn("CHOL",d1.CHOL.cast('double'))
d1=d1.withColumn("PROT",d1.PROT.cast('double'))

In [110]:
d1.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: double (nullable = false)
 |-- ALB: double (nullable = true)
 |-- ALP: double (nullable = true)
 |-- ALT: double (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: double (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: double (nullable = true)
 |-- Target: double (nullable = false)



In [111]:
required_columns = ['Age', 'Gender', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Target']

In [112]:
from pyspark.ml.feature import VectorAssembler
vector_assembler = VectorAssembler(inputCols=required_columns, outputCol='features')

In [113]:
new_df = vector_assembler.transform(d1)

In [114]:
new_df.show(3)

+---+------+----+----+----+----+---+-----+----+-----+----+----+------+--------------------+
|Age|Gender| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|Target|            features|
+---+------+----+----+----+----+---+-----+----+-----+----+----+------+--------------------+
| 32|   0.0|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|69.0|   0.0|[32.0,0.0,38.5,52...|
| 32|   0.0|38.5|70.3|18.0|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|[32.0,0.0,38.5,70...|
| 32|   0.0|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|[32.0,0.0,46.9,74...|
+---+------+----+----+----+----+---+-----+----+-----+----+----+------+--------------------+
only showing top 3 rows



In [115]:
train, test = new_df.randomSplit([0.7, 0.3])

In [116]:
train.count()

408

#### Model Building: 
    pyspark.ml - for Dataframes
    pyspark.mllib - for RDD

In [117]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier

In [118]:
log_reg = LogisticRegression(featuresCol='features', labelCol='Target')

In [119]:
lr_model = log_reg.fit(train)

In [120]:
y_pred = lr_model.transform(test) 

In [121]:
y_pred.show() 

+---+------+----+----+----+----+----+-----+----+-----+----+----+------+--------------------+--------------------+--------------------+----------+
|Age|Gender| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|Target|            features|       rawPrediction|         probability|prediction|
+---+------+----+----+----+----+----+-----+----+-----+----+----+------+--------------------+--------------------+--------------------+----------+
| 27|   0.0|45.0|27.5|10.5|37.8|10.0| 8.77| 3.2| 55.2|35.9|74.5|   2.0|[27.0,0.0,45.0,27...|[-53.579351609198...|[1.25343606417086...|       2.0|
| 32|   0.0|38.5|70.3|18.0|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|[32.0,0.0,38.5,70...|[325.203181200540...|[1.0,2.2050682922...|       0.0|
| 32|   0.0|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|   0.0|[32.0,0.0,39.2,74...|[275.533538524718...|[1.0,2.0570071623...|       0.0|
| 32|   0.0|42.4|86.3|20.3|20.0|35.2| 5.46|4.45| 81.0|15.9|69.9|   0.0|[32.0,0.0,42.4,86...|[212.511508073585...|[1.0,1.6241

In [122]:
print(y_pred.columns)

['Age', 'Gender', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Target', 'features', 'rawPrediction', 'probability', 'prediction']


In [125]:
y_pred.select('rawPrediction', 'probability', 'prediction', 'Target').show()

+--------------------+--------------------+----------+------+
|       rawPrediction|         probability|prediction|Target|
+--------------------+--------------------+----------+------+
|[-53.579351609198...|[1.25343606417086...|       2.0|   2.0|
|[325.203181200540...|[1.0,2.2050682922...|       0.0|   0.0|
|[275.533538524718...|[1.0,2.0570071623...|       0.0|   0.0|
|[212.511508073585...|[1.0,1.6241066267...|       0.0|   0.0|
|[267.179211016653...|[1.0,1.4175507125...|       0.0|   0.0|
|[287.492310366083...|[1.0,1.3361008435...|       0.0|   0.0|
|[315.508379575921...|[1.0,1.2786747712...|       0.0|   0.0|
|[301.339504709834...|[1.0,6.2680820276...|       0.0|   0.0|
|[303.090454293382...|[1.0,9.5042641469...|       0.0|   0.0|
|[305.762813523138...|[1.0,1.3062488096...|       0.0|   0.0|
|[323.458649127355...|[1.0,2.0561083922...|       0.0|   0.0|
|[318.992911749301...|[1.0,7.9819661636...|       0.0|   0.0|
|[78.2703008373108...|[0.99999999999996...|       0.0|   2.0|
|[232.29

#### Model Evaluation using:
    Accuracy 
    F1 score
    Precision
    Recall

In [126]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 

In [128]:
#Accuracy matric 
multi_evaluator = MulticlassClassificationEvaluator(labelCol='Target', metricName='accuracy')

In [129]:
multi_evaluator.evaluate(y_pred)

0.9668508287292817

In [134]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [138]:
lr_metrics = MulticlassMetrics(y_pred['Target', 'prediction'].rdd)

Exception ignored in: <function JavaModelWrapper.__del__ at 0x7fa5348ed280>
Traceback (most recent call last):
  File "/Users/heenasharma/opt/anaconda3/lib/python3.8/site-packages/pyspark/mllib/common.py", line 137, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'MulticlassMetrics' object has no attribute '_sc'
Exception ignored in: <function JavaModelWrapper.__del__ at 0x7fa5348ed280>
Traceback (most recent call last):
  File "/Users/heenasharma/opt/anaconda3/lib/python3.8/site-packages/pyspark/mllib/common.py", line 137, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'MulticlassMetrics' object has no attribute '_sc'
Exception ignored in: <function JavaModelWrapper.__del__ at 0x7fa5348ed280>
Traceback (most recent call last):
  File "/Users/heenasharma/opt/anaconda3/lib/python3.8/site-packages/pyspark/mllib/common.py", line 137, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'MulticlassMetrics' object has

In [139]:
dir(lr_metrics)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_java_model',
 '_sc',
 'accuracy',
 'call',
 'confusionMatrix',
 'fMeasure',
 'falsePositiveRate',
 'logLoss',
 'precision',
 'recall',
 'truePositiveRate',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

In [140]:
print(lr_metrics.accuracy)

0.9668508287292817


In [145]:
print(lr_metrics.precision(1.0))
print(lr_metrics.recall(1.0))
print(lr_metrics.fMeasure(1.0))


0.9
1.0
0.9473684210526316
