In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('SparkSQLGooglePlaystore').getOrCreate()
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print(f'You are working with {cores} cores')
spark

You are working with 1 cores


In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

In [3]:
df_schema = StructType([
        StructField('_c0', IntegerType(), True),
        StructField('Category', StringType(), True),
        StructField('Age', IntegerType(), True),
        StructField('Sex', StringType(), True),
        StructField('ALB', DoubleType(), True),
        StructField('ALP', DoubleType(), True),
        StructField('ALT', DoubleType(), True),
        StructField('AST', DoubleType(), True),
        StructField('BIL', DoubleType(), True),
        StructField('CHE', DoubleType(), True),
        StructField('CHOL', DoubleType(), True),
        StructField('CREA', DoubleType(), True),
        StructField('GGT', DoubleType(), True),
        StructField('PROT', DoubleType(), True),
    ])

In [4]:
df = spark.read.csv("./data/hcvdat0.csv", schema= df_schema, header= True, inferSchema= True)

In [5]:
df.limit(5).toPandas()

Unnamed: 0,_c0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [6]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Category: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ALB: double (nullable = true)
 |-- ALP: double (nullable = true)
 |-- ALT: double (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: double (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: double (nullable = true)



In [7]:
print(df.columns)

['_c0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']


In [8]:
df = df.select('Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Category' )

In [9]:
df.show(3)

+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+
|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|     Category|
+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+
| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|69.0|0=Blood Donor|
| 32|  m|38.5|70.3|18.0|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|0=Blood Donor|
| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|0=Blood Donor|
+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+
only showing top 3 rows



In [10]:
df.fillna(0)

DataFrame[Age: int, Sex: string, ALB: double, ALP: double, ALT: double, AST: double, BIL: double, CHE: double, CHOL: double, CREA: double, GGT: double, PROT: double, Category: string]

In [11]:
print(f'shape: {(df.count(), len(df.columns))}')

shape: (615, 13)


In [12]:
df.groupBy(df['Category']).count().show()

+--------------------+-----+
|            Category|count|
+--------------------+-----+
|       0=Blood Donor|  533|
|         3=Cirrhosis|   30|
|          2=Fibrosis|   21|
|0s=suspect Blood ...|    7|
|         1=Hepatitis|   24|
+--------------------+-----+



In [13]:
df.select(df['Sex']).distinct().show()

+---+
|Sex|
+---+
|  m|
|  f|
+---+



In [14]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [15]:
df = (StringIndexer(inputCol= 'Sex', outputCol= 'Gender')
      .fit(df)
      .transform(df))

df = (StringIndexer(inputCol= 'Category', outputCol= 'Target')
      .fit(df)
      .transform(df))

In [16]:
col_to_del = ['Sex', 'Category']
df = df.drop(*col_to_del)

In [17]:
df.show(5)

+---+----+----+----+----+----+-----+----+-----+----+----+------+------+
|Age| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|Gender|Target|
+---+----+----+----+----+----+-----+----+-----+----+----+------+------+
| 32|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|69.0|   0.0|   0.0|
| 32|38.5|70.3|18.0|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|   0.0|
| 32|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|   0.0|
| 32|43.2|52.0|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|   0.0|   0.0|
| 32|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|   0.0|   0.0|
+---+----+----+----+----+----+-----+----+-----+----+----+------+------+
only showing top 5 rows



In [18]:
print(df.columns)

['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Gender', 'Target']


In [19]:
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- ALB: double (nullable = true)
 |-- ALP: double (nullable = true)
 |-- ALT: double (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: double (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: double (nullable = true)
 |-- Gender: double (nullable = false)
 |-- Target: double (nullable = false)



In [20]:
cols = ['Age', 'Gender', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Target']

In [21]:
vec_as_df = VectorAssembler(inputCols= cols, outputCol= 'Features').transform(df)

In [22]:
vec_as_df.show(3)

+---+----+----+----+----+---+-----+----+-----+----+----+------+------+--------------------+
|Age| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|Gender|Target|            Features|
+---+----+----+----+----+---+-----+----+-----+----+----+------+------+--------------------+
| 32|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|69.0|   0.0|   0.0|[32.0,0.0,38.5,52...|
| 32|38.5|70.3|18.0|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|   0.0|   0.0|[32.0,0.0,38.5,70...|
| 32|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|   0.0|   0.0|[32.0,0.0,46.9,74...|
+---+----+----+----+----+---+-----+----+-----+----+----+------+------+--------------------+
only showing top 3 rows



In [23]:
X_train, X_test = vec_as_df.randomSplit([0.7, 0.3])

In [25]:
# X_test.count()