In [None]:
!pip install pyspark

In [None]:
!pip install ibm-cos-sdk
!pip install ibmos2spark

In [None]:
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet

In [14]:
import os
from pyspark.sql import SparkSession

# Set environment variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/usr/local/spark"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars /usr/local/spark/jars/stocator-1.1.5.jar pyspark-shell"

spark = SparkSession.builder \
    .appName("Parquet File Example") \
    .config("spark.jars", "/usr/local/spark/jars/stocator-1.1.5.jar") \
    .config("spark.hadoop.fs.stocator.scheme.list", "cos") \
    .config("spark.hadoop.fs.cos.impl", "com.ibm.stocator.fs.ObjectStoreFileSystem") \
    .config("spark.hadoop.fs.stocator.cos.impl", "com.ibm.stocator.fs.cos.COSAPIClient") \
    .config("spark.hadoop.fs.stocator.cos.scheme", "cos") \
    .getOrCreate()


In [15]:
# Read the Parquet file
try:
    df = spark.read.parquet('hmp.parquet')
    df.show()
except Exception as e:
    print("Error during reading parquet file:", e)


+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [16]:
splits = df.randomSplit([0.7, 0.3], seed=42)
train_df = splits[0]
test_df = splits[1]

In [17]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler,Normalizer
from pyspark.ml.linalg import Vectors

indexer = StringIndexer(inputCol = 'class', outputCol='label')

vectorAssembler = VectorAssembler(inputCols=['x', 'y', 'z'], outputCol='features')

normalizer = Normalizer(inputCol='features', outputCol='norm_features', p=1.0)

In [18]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [19]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer, lr])

In [21]:
model = pipeline.fit(train_df)

In [22]:
prediction = model.transform(train_df)

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator().setMetricName("accuracy").setLabelCol("label").setPredictionCol("prediction")
accuracy = evaluator.evaluate(prediction)
print("Accuracy:", accuracy)

Accuracy: 0.20656587002740848


In [25]:
prediction = model.transform(test_df)
accuracy = evaluator.evaluate(prediction)
print("Accuracy:", accuracy)

Accuracy: 0.20668767741791277
