**Data wrangling comparison between PySpark and pandas**

**Note: starting with Spark 2.4, the Koalas API (a pandas-style DataFrame API for Spark) can also be used.**

In [None]:
#Import a few libraries

!pip install wget
import wget
import sys
import os
import boto3

**ENTER VARIABLE VALUES FOR THE TUTORIAL HERE**

In [None]:
# Endpoint URL of the Ceph object store, ex. 'https://my.ceph.cluster'.
# It is stored in the process environment so that later cells can read it
# back through os.environ['S3_ENDPOINT'].
os.environ['S3_ENDPOINT'] = 'ENTER LOCATION OF CEPH HERE'

# Name of the Ceph bucket used by the tutorial, ex. 'TUTORIAL'.
ceph_bucket = 'ENTER CEPH BUCKET HERE'

In [None]:


# Create an S3 client that talks to Ceph through its S3-compatible endpoint.
# The credentials are read from the environment so they never appear in the notebook.
s3 = boto3.client('s3', 'us-east-1', endpoint_url=os.environ['S3_ENDPOINT'],
                  aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                  aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'])

# The bucket name is normalised to upper case before it is used anywhere else.
ceph_bucket = ceph_bucket.upper()

# Make sure the bucket that will hold the tutorial data exists.
s3.create_bucket(Bucket=ceph_bucket)

# Every object written by this notebook lives under a per-user key prefix.
ceph_base_location = os.environ['JUPYTERHUB_USER']

# Upload a sample object and verify that it is visible.
sample_key = ceph_base_location + '/my_created_object'
s3.put_object(Bucket=ceph_bucket, Key=sample_key, Body='data')

# Only list this user's prefix: the bucket may be shared with other users, and
# a single list_objects call returns a limited page of keys, so an unfiltered
# listing could fail to show the object that was just written.
# 'Contents' is absent from the response when nothing matches, hence .get().
listing = s3.list_objects(Bucket=ceph_bucket, Prefix=ceph_base_location + '/')
for obj in listing.get('Contents', []):
    print(obj['Key'])

In [None]:
#Prepare the data for the example by uploading it to Ceph Object Storage
# The URL is pinned to a release tag so the download does not depend on the
# file layout of a moving branch.
iris_url = "https://raw.githubusercontent.com/pandas-dev/pandas/v0.23.4/pandas/tests/data/iris.csv"

# wget.download() does not overwrite an existing file; it saves a renamed copy
# ("iris (1).csv", ...) instead. Skip the download when the file is already
# present so re-running this cell does not pile up duplicates.
filename = 'iris.csv'
if not os.path.exists(filename):
    filename = wget.download(url=iris_url, out=filename)

# Store the CSV under this user's prefix in the tutorial bucket.
s3.upload_file(filename, ceph_bucket, ceph_base_location + "/iris.csv")

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# The application is named after the JupyterHub user and attaches to the
# Spark master at spark://<SPARK_CLUSTER>:7077.
app_name = os.environ['JUPYTERHUB_USER'] + ' Iris Example'
master_url = 'spark://' + os.environ['SPARK_CLUSTER'] + ':7077'
spark = SparkSession.builder.appName(app_name).master(master_url).getOrCreate()

#Configure Spark to access data from Ceph
hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()

# s3a options whose values are taken from environment variables.
for option, env_var in (
    ("fs.s3a.endpoint", 'S3_ENDPOINT'),
    ("fs.s3a.access.key", 'AWS_ACCESS_KEY_ID'),
    ("fs.s3a.secret.key", 'AWS_SECRET_ACCESS_KEY'),
):
    hadoopConf.set(option, os.environ[env_var])

# s3a options with fixed values.
for option, value in (
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.connection.ssl.enabled", "false"),
):
    hadoopConf.set(option, value)

# Read the uploaded CSV through s3a, treating the first line as the header
# and letting Spark infer the column types.
iris_path = "s3a://" + ceph_bucket + "/" + ceph_base_location + "/iris.csv"
iris_spark_df = spark.read.csv(iris_path, header="true", inferSchema="true")

In [None]:
# Print the first rows of the Spark DataFrame as a plain-text table.
iris_spark_df.show()

In [None]:
# Print each column with the type Spark inferred for it (inferSchema was
# requested when the CSV was read).
iris_spark_df.printSchema()

In [None]:
# Column names of the Spark DataFrame.
iris_spark_df.columns

In [None]:
# (column name, type name) pairs of the Spark DataFrame.
iris_spark_df.dtypes

In [None]:
# First three rows; unlike show(), head(n) returns the rows themselves
# (a list of Row objects) instead of printing a table.
iris_spark_df.head(3)

Now we will run similar commands in pandas to read the same data into a DataFrame and inspect its contents.

In [None]:
!pip install s3fs

import pandas as pd
import s3fs

fs = s3fs.S3FileSystem(anon=False, key=os.environ['AWS_ACCESS_KEY_ID'], secret=os.environ['AWS_SECRET_ACCESS_KEY'], client_kwargs={'endpoint_url': 
    os.environ['S3_ENDPOINT']})

iris_pandas_df = pd.read_csv(fs.open("s3://" + ceph_bucket + "/" + ceph_base_location + "/iris.csv"))
iris_pandas_df



In [None]:
# Column labels of the pandas DataFrame.
iris_pandas_df.columns

In [None]:
# dtype of each column of the pandas DataFrame.
iris_pandas_df.dtypes

In [None]:
# Last ten rows of the pandas DataFrame.
iris_pandas_df.tail(10)

In [None]:
# First three rows of the pandas DataFrame.
iris_pandas_df.head(3)

GroupBy in Pyspark 

In [None]:
# Count the rows for each distinct value of the Name column and print the result.
iris_spark_df.groupBy('Name').count().show()

GroupBy in Pandas

In [None]:
# Count how often each value of the Name column occurs; value_counts() is used
# here as a shortcut instead of an explicit groupby.
iris_pandas_df['Name'].value_counts()

Rename the columns in PySpark (`toDF` assigns new names to all columns at once, by position)

In [None]:
# toDF() assigns the new names by position, so the list has to follow the
# order of the existing columns.
renamed_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
col_rename_iris_df = iris_spark_df.toDF(*renamed_columns)
col_rename_iris_df.show(3)


Rename the columns in pandas (assigning to `.columns` replaces all column names at once, in place)

In [None]:
# Assigning to .columns relabels every column of iris_pandas_df in place, by
# position. The cells below rely on these new names being present.
snake_case_names = ['sepal_length','sepal_width','petal_length','petal_width','species']
iris_pandas_df.columns = snake_case_names
iris_pandas_df.head(3)

Drop a column in Pyspark

In [None]:
# Drop from the renamed frame: 'petal_width' only exists after the rename.
# iris_spark_df still carries the original CSV header names, and Spark's
# drop() silently ignores a column name that is not in the schema, so
# dropping from iris_spark_df would remove nothing.
drop_col_iris_df = col_rename_iris_df.drop('petal_width')
drop_col_iris_df.head(3)

Drop a column in Pandas

In [None]:
# drop() returns a new DataFrame without the column; iris_pandas_df itself
# is left untouched.
drop_col_iris_df2 = iris_pandas_df.drop(labels='petal_width', axis='columns')
drop_col_iris_df2.head(3)

Apply a filter to a dataframe in Pyspark

In [None]:
# Build the boolean column expression first, then use it to select rows with
# the same bracket syntax pandas uses.
short_sepal = col_rename_iris_df['sepal_length'] < 5
spark_df = col_rename_iris_df[short_sepal]
spark_df.show(10)

Apply a filter to a dataframe in Pandas

In [None]:
# Boolean mask marking the rows whose sepal_length is below 5, then applied
# as a row selector.
is_short_sepal = iris_pandas_df['sepal_length'] < 5
pandas_df = iris_pandas_df[is_short_sepal]
pandas_df.head(10)

Use PySpark's built-in column functions from `pyspark.sql.functions` (note: in pandas the equivalent is done with NumPy, e.g. `np.log`)

In [None]:
import pyspark.sql.functions as F

# Add a column holding the natural logarithm of sepal_length (F.log with a
# single argument); withColumn returns a new DataFrame.
sepal_length_col = col_rename_iris_df['sepal_length']
df_log = col_rename_iris_df.withColumn('log_sep_len', F.log(sepal_length_col))
df_log.show(3)