# Spark S3 Connection

We need to make sure the `hadoop-aws` package (and the AWS Java SDK jar it depends on) is available on the classpath when Spark starts.

In [1]:
# import os
# os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.4 pyspark-shell"
# import pyspark
# sc=pyspark.SparkContext()
# sql=pyspark.sql.SparkSession(sc)

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


# Memory settings depend on where the application runs:
# - locally: keep the driver memory within what your machine can handle
# - on a cluster: also set the executor memory if needed (depends on how the cluster is configured)
conf = (
    SparkConf()
    .set('spark.executor.memory', '8g')
    .set('spark.driver.memory', '8g')
    # hadoop-aws and the AWS SDK jar, loaded from the local Spark installation
    .set('spark.jars', '/usr/local/spark/jars/aws-java-sdk-1.7.4.jar,/usr/local/spark/jars/hadoop-aws-2.7.4.jar')
)

# Build (or reuse) the session with the configuration above.
spark_session = (
    SparkSession.builder
    .config(conf=conf)
    .appName('s3-write')
    .getOrCreate()
)
spark_session.sparkContext.setLogLevel("INFO")

### Reading config from file

We can use the `configparser` package to read the credentials from the standard AWS credentials file, `~/.aws/credentials` (https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). The file is INI-formatted, with one section per profile.

In [2]:
import configparser
import os  # needed for os.path.expanduser below; the earlier `import os` is commented out

# Read the access key pair for one profile from the AWS credentials file.
aws_profile = "lijo"  # user profile name (section header in the credentials file)
config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")

### Configure hadoop

In [3]:
# Configure the S3A connector on the Hadoop configuration of the running session.
# The context is taken from `spark_session`; no separate `sc` variable is created in this notebook.
# NOTE: `_jsc` is a private PySpark attribute exposing the underlying JavaSparkContext.
hadoop_conf = spark_session.sparkContext._jsc.hadoopConfiguration()

# s3a:// URLs must be served by S3AFileSystem (NativeS3FileSystem is the s3n:// implementation).
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# S3A reads its credentials from `fs.s3a.access.key` / `fs.s3a.secret.key`;
# the `awsAccessKeyId` / `awsSecretAccessKey` names belong to the s3/s3n connectors.
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)

### Reading data from S3

In [None]:
# Read the CSV from S3 through the session created in the first cell
# (the notebook never defines a `sql` variable; its creation is commented out).
# header=True uses the first row as column names; inferSchema=True lets Spark infer column types.
df = spark_session.read.csv("s3a://pyspark-lijo-test/auction.csv", header=True, inferSchema=True)
df.show(2)

### For writing data to S3

In [None]:
# Three alternative ways to write `df` back to S3 as CSV; uncomment ONE to run it.
# All of them use mode="overwrite", which replaces any existing output at the target path.
#
# 1) CSV writer shortcut:
# df.write.csv("s3a://pyspark-lijo-test/output/auction",mode="overwrite")
# 2) Coalesce to 20 partitions first, then write with the generic format/save API:
# df.coalesce(20).write.mode("overwrite").format("csv").save("s3a://pyspark-lijo-test/output/auction")
# 3) Generic save with a header row (note: this one targets the parent `output` path):
# df.write.save("s3a://pyspark-lijo-test/output", format='csv', header=True, mode="overwrite")