# Machine Learning With PySpark

In [2]:
from pyspark import SparkContext

In [6]:
# Run in the terminal
# import psutil
# psutil.spu_count()

In [3]:
sc = SparkContext(master="local[2]") # 2 is the number of process I want to use

In [7]:
# Using Spark UI
sc

#### Basic /PySpark Crash Course

In [10]:
import pyspark

In [11]:
# Methods/Attrib
dir(pyspark)

['Accumulator',
 'AccumulatorParam',
 'BarrierTaskContext',
 'BarrierTaskInfo',
 'BasicProfiler',
 'Broadcast',
 'HiveContext',
 'InheritableThread',
 'MarshalSerializer',
 'PickleSerializer',
 'Profiler',
 'RDD',
 'RDDBarrier',
 'Row',
 'SQLContext',
 'SparkConf',
 'SparkContext',
 'SparkFiles',
 'SparkJobInfo',
 'SparkStageInfo',
 'StatusTracker',
 'StorageLevel',
 'TaskContext',
 '_NoValue',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_globals',
 'accumulators',
 'broadcast',
 'cloudpickle',
 'conf',
 'context',
 'copy_func',
 'files',
 'find_spark_home',
 'java_gateway',
 'join',
 'keyword_only',
 'profiler',
 'rdd',
 'rddsampler',
 'resource',
 'resultiterable',
 'serializers',
 'shuffle',
 'since',
 'sql',
 'statcounter',
 'status',
 'storagelevel',
 'taskcontext',
 'traceback_utils',
 'types',
 'util',
 'version',
 'wraps']

In [12]:
from pyspark.sql import SparkSession

In [13]:
# Spark Session used for DF
# spark = SparkSession.builder.getOrCreate()
spark = SparkSession.builder.appName("MLwithSpark").getOrCreate() # To choose a app name

In [18]:
# read CSV with header/schema
df = spark.read.csv("data/hcvdata.csv")

In [19]:
# Preview Data
df.show(5)

+----+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
| _c0|          _c1|_c2|_c3| _c4| _c5| _c6| _c7| _c8|  _c9|_c10|_c11|_c12|_c13|
+----+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|null|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|
|   1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|
|   2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|
|   3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|
|   4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|
+----+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
only showing top 5 rows



In [61]:
# read CSV with header
df = spark.read.csv("data/hcvdata.csv", header=True)

In [23]:
df.show(5)

+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
only showing top 5 rows



In [24]:
# Get first row
df.first()

Row(_c0='1', Category='0=Blood Donor', Age='32', Sex='m', ALB='38.5', ALP='52.5', ALT='7.7', AST='22.1', BIL='7.5', CHE='6.93', CHOL='3.23', CREA='106', GGT='12.1', PROT='69')

In [25]:
# Get Head
df.head(5)

[Row(_c0='1', Category='0=Blood Donor', Age='32', Sex='m', ALB='38.5', ALP='52.5', ALT='7.7', AST='22.1', BIL='7.5', CHE='6.93', CHOL='3.23', CREA='106', GGT='12.1', PROT='69'),
 Row(_c0='2', Category='0=Blood Donor', Age='32', Sex='m', ALB='38.5', ALP='70.3', ALT='18', AST='24.7', BIL='3.9', CHE='11.17', CHOL='4.8', CREA='74', GGT='15.6', PROT='76.5'),
 Row(_c0='3', Category='0=Blood Donor', Age='32', Sex='m', ALB='46.9', ALP='74.7', ALT='36.2', AST='52.6', BIL='6.1', CHE='8.84', CHOL='5.2', CREA='86', GGT='33.2', PROT='79.3'),
 Row(_c0='4', Category='0=Blood Donor', Age='32', Sex='m', ALB='43.2', ALP='52', ALT='30.6', AST='22.6', BIL='18.9', CHE='7.33', CHOL='4.74', CREA='80', GGT='33.8', PROT='75.7'),
 Row(_c0='5', Category='0=Blood Donor', Age='32', Sex='m', ALB='39.2', ALP='74.1', ALT='32.6', AST='24.8', BIL='9.6', CHE='9.15', CHOL='4.32', CREA='76', GGT='29.9', PROT='68.7')]

In [27]:
# Check for the column names
df.columns

['_c0',
 'Category',
 'Age',
 'Sex',
 'ALB',
 'ALP',
 'ALT',
 'AST',
 'BIL',
 'CHE',
 'CHOL',
 'CREA',
 'GGT',
 'PROT']

In [29]:
# Check fo the Dtype
df.dtypes

[('_c0', 'string'),
 ('Category', 'string'),
 ('Age', 'string'),
 ('Sex', 'string'),
 ('ALB', 'string'),
 ('ALP', 'string'),
 ('ALT', 'string'),
 ('AST', 'string'),
 ('BIL', 'string'),
 ('CHE', 'string'),
 ('CHOL', 'string'),
 ('CREA', 'string'),
 ('GGT', 'string'),
 ('PROT', 'string')]

In [30]:
# Get the Schema
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ALB: string (nullable = true)
 |-- ALP: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- AST: string (nullable = true)
 |-- BIL: string (nullable = true)
 |-- CHE: string (nullable = true)
 |-- CHOL: string (nullable = true)
 |-- CREA: string (nullable = true)
 |-- GGT: string (nullable = true)
 |-- PROT: string (nullable = true)



In [31]:
# Check the number of rowa
df.count()

615

In [32]:
# Check the namber of columns
len(df.columns)

14

In [33]:
# Get the shape (rows, cols)
print(df.count(), len(df.columns))

615 14


In [39]:
#### Descriptive sumary

In [35]:
df.describe()

DataFrame[summary: string, _c0: string, Category: string, Age: string, Sex: string, ALB: string, ALP: string, ALT: string, AST: string, BIL: string, CHE: string, CHOL: string, CREA: string, GGT: string, PROT: string]

In [36]:
df.describe().show()

+-------+------------------+-------------+------------------+----+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|summary|               _c0|     Category|               Age| Sex|              ALB|               ALP|               ALT|              AST|               BIL|               CHE|              CHOL|             CREA|              GGT|             PROT|
+-------+------------------+-------------+------------------+----+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|  count|               615|          615|               615| 615|              615|               615|               615|              615|               615|               615|               615|              615|              615|           

In [38]:
# Get descriptive sumary of a column
df.describe('age').show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|               615|
|   mean| 47.40813008130081|
| stddev|10.055105445519239|
|    min|                19|
|    max|                77|
+-------+------------------+



#### Selection of Columns
* Dont bring out the entire column
   + Bracket Notation df[col]
   + Dot Notation     df.col

In [40]:
# Select a column
df.columns

['_c0',
 'Category',
 'Age',
 'Sex',
 'ALB',
 'ALP',
 'ALT',
 'AST',
 'BIL',
 'CHE',
 'CHOL',
 'CREA',
 'GGT',
 'PROT']

In [41]:
# Selection
df.select('Age').show()

+---+
|Age|
+---+
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
+---+
only showing top 20 rows



In [42]:
# Irrespective of case of column
df.select('AGE').show()

+---+
|AGE|
+---+
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
+---+
only showing top 20 rows



In [43]:
# Irrespective of case of column
df.select('age').show()

+---+
|age|
+---+
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 32|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
| 33|
+---+
only showing top 20 rows



In [44]:
# Select multiple columns
df.select('Age','Category').show()

+---+-------------+
|Age|     Category|
+---+-------------+
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 32|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
| 33|0=Blood Donor|
+---+-------------+
only showing top 20 rows



In [46]:
# Bracket Notation. Bring the comlun name, not the entire column *
df['Age']

Column<'Age'>

In [47]:
# Dot Notation *
df.Age

Column<'Age'>

#### Conditions & Filter
+ df.filter
+ df.where

In [45]:
df.show(5)

+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
only showing top 5 rows



In [49]:
# Filter : Method 1
df.filter(df['Age'] == 25).show()

+---+-----------+---+---+---+----+----+-----+---+---+----+----+----+----+
|_c0|   Category|Age|Sex|ALB| ALP| ALT|  AST|BIL|CHE|CHOL|CREA| GGT|PROT|
+---+-----------+---+---+---+----+----+-----+---+---+----+----+----+----+
|544|1=Hepatitis| 25|  m| 42|38.2|63.3|187.7| 14|  6|4.28|66.9|40.2|70.5|
+---+-----------+---+---+---+----+----+-----+---+---+----+----+----+----+



In [50]:
# Filter : Method 1
df.filter(df.Age == 25).show()

+---+-----------+---+---+---+----+----+-----+---+---+----+----+----+----+
|_c0|   Category|Age|Sex|ALB| ALP| ALT|  AST|BIL|CHE|CHOL|CREA| GGT|PROT|
+---+-----------+---+---+---+----+----+-----+---+---+----+----+----+----+
|544|1=Hepatitis| 25|  m| 42|38.2|63.3|187.7| 14|  6|4.28|66.9|40.2|70.5|
+---+-----------+---+---+---+----+----+-----+---+---+----+----+----+----+



In [51]:
# Where : Method 1
df.where(df['Sex'] == 'f').show()

+---+-------------+---+---+----+-----+----+----+----+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB|  ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+-----+----+----+----+-----+----+----+----+----+
|319|0=Blood Donor| 32|  f|39.9| 35.2|  22|29.8| 6.3| 8.16|4.37|  60| 4.5|72.5|
|320|0=Blood Donor| 32|  f|47.4| 52.5|19.1|17.1| 4.6|10.19|  NA|  63|  23|72.2|
|321|0=Blood Donor| 32|  f|41.1| 42.8|10.1|14.1|23.2| 6.08|3.75|  53| 9.3|68.9|
|322|0=Blood Donor| 32|  f|43.5| 66.2| 9.2|17.8| 5.7| 7.14|4.38|  71|44.6|76.1|
|323|0=Blood Donor| 33|  f|  36| 77.5|14.8|  22| 4.4| 8.61|5.26|  66|13.1|66.1|
|324|0=Blood Donor| 33|  f|36.9| 51.7|17.4|  22| 8.3|    7|5.02|  52|19.1|  72|
|325|0=Blood Donor| 33|  f|44.3|   74|49.7|52.3| 8.5| 6.49|3.34|  73|44.7|73.8|
|326|0=Blood Donor| 33|  f|38.1| 35.2|11.9|18.3|   3| 6.09|5.22|  76|15.4|  72|
|327|0=Blood Donor| 33|  f|  41| 61.1|  27|  28|   6| 8.36|4.93|  70|24.7|70.5|
|328|0=Blood Donor| 33|  f|38.2| 54.4|17

In [53]:
# Where : Method 1
df.where(df['Sex'] == 'f').select('Age','Sex','Category').show(5)

+---+---+-------------+
|Age|Sex|     Category|
+---+---+-------------+
| 32|  f|0=Blood Donor|
| 32|  f|0=Blood Donor|
| 32|  f|0=Blood Donor|
| 32|  f|0=Blood Donor|
| 33|  f|0=Blood Donor|
+---+---+-------------+
only showing top 5 rows



In [54]:
df.show(5)

+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
only showing top 5 rows



In [62]:
# Add column
df.withColumn('Alb_by_10', df['ALB'] * 10).show()

+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+---------+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|Alb_by_10|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+---------+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|    385.0|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|    385.0|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|    469.0|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|    432.0|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|    392.0|
|  6|0=Blood Donor| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05| 111|  91|  74|    416.0|
|  7|0=Blood Donor| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79|  70|16.9|74.5|    463.0|
|  8|0=Blood Donor| 32|  m|42.2|41.9|35.8|31.1|16.1| 5.82| 4.6| 109|21.5|67.1|    422.0|
|  9|0=Blood Donor| 3

In [63]:
df2 = df.withColumn('Alb_by_10', df['ALB'] * 10)

In [64]:
df2.show(5)

+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+---------+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|Alb_by_10|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+---------+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|    385.0|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|    385.0|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|    469.0|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|    432.0|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|    392.0|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+---------+
only showing top 5 rows



In [67]:
# Drop a column
df2.drop('Alb_by_10').show(5)

+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|
+---+-------------+---+---+----+----+----+----+----+-----+----+----+----+----+
only showing top 5 rows



In [68]:
df2.show(3)

+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+---------+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL|CREA| GGT|PROT|Alb_by_10|
+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+---------+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23| 106|12.1|  69|    385.0|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7|3.9|11.17| 4.8|  74|15.6|76.5|    385.0|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2|  86|33.2|79.3|    469.0|
+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+---------+
only showing top 3 rows



In [69]:
# Drop a column
df2 = df2.drop('Alb_by_10')

In [70]:
df.show(3)

+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23| 106|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7|3.9|11.17| 4.8|  74|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2|  86|33.2|79.3|
+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+
only showing top 3 rows



In [71]:
# GroupBy
# df['category'].value_counts()  :pandas
df.groupBy('Category').count().show()

+--------------------+-----+
|            Category|count|
+--------------------+-----+
|       0=Blood Donor|  533|
|         3=Cirrhosis|   30|
|          2=Fibrosis|   21|
|0s=suspect Blood ...|    7|
|         1=Hepatitis|   24|
+--------------------+-----+



In [73]:
df.groupBy('Age').mean().show()

+---+
|Age|
+---+
| 51|
| 54|
| 29|
| 42|
| 64|
| 30|
| 34|
| 59|
| 35|
| 52|
| 71|
| 47|
| 43|
| 70|
| 61|
| 27|
| 75|
| 46|
| 77|
| 60|
+---+
only showing top 20 rows



In [None]:
# Aggregation
# df.groupBy('category').agg('col':'sum')

In [74]:
# Unique values
df.select('category').distinct().show()

+--------------------+
|            category|
+--------------------+
|       0=Blood Donor|
|         3=Cirrhosis|
|          2=Fibrosis|
|0s=suspect Blood ...|
|         1=Hepatitis|
+--------------------+



In [76]:
# Unique values
# We do not need to put SHOW() because it is an action
df.select('category').distinct().count()

5

In [77]:
# Unique Values
df.selectExpr('count(distinct(category))').show()

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                       5|
+------------------------+



#### Save Dataset
+ CSV
+ parquet
+ etc

In [78]:
df.show(3)

+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL|CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23| 106|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7|3.9|11.17| 4.8|  74|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2|  86|33.2|79.3|
+---+-------------+---+---+----+----+----+----+---+-----+----+----+----+----+
only showing top 3 rows



In [79]:
print(df.columns)

['_c0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']


In [80]:
df.select('Age','Sex','ALB','ALP','ALT','AST','BIL','CHE','CHOL','CREA','GGT','PROT','Category').show()

+---+---+----+----+----+----+----+-----+----+----+----+----+-------------+
|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL|CREA| GGT|PROT|     Category|
+---+---+----+----+----+----+----+-----+----+----+----+----+-------------+
| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23| 106|12.1|  69|0=Blood Donor|
| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8|  74|15.6|76.5|0=Blood Donor|
| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2|  86|33.2|79.3|0=Blood Donor|
| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74|  80|33.8|75.7|0=Blood Donor|
| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32|  76|29.9|68.7|0=Blood Donor|
| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05| 111|  91|  74|0=Blood Donor|
| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79|  70|16.9|74.5|0=Blood Donor|
| 32|  m|42.2|41.9|35.8|31.1|16.1| 5.82| 4.6| 109|21.5|67.1|0=Blood Donor|
| 32|  m|50.9|65.5|23.2|21.2| 6.9| 8.69| 4.1|  83|13.7|71.3|0=Blood Donor|
| 32|  m|42.4|86.3|20.3|  20|35.2| 5.46|4.45|  81|15.9|69.9|0=Blood Donor|
| 32|  m|44.3|52.3|21.7|2

In [81]:
df = df.select('Age','Sex','ALB','ALP','ALT','AST','BIL','CHE','CHOL','CREA','GGT','PROT','Category')

In [82]:
# Save
df.write.format('CSV').option('header','true').save('data/hcvdata_new.csv')