In [1]:
import os
import pyspark
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
import seaborn as sns
import matplotlib.pyplot as plt
import yaml

In [2]:
def read_yaml_config():
    """Load ``configuration.yml`` into the module-level ``cfg`` global.

    The path is relative, so the file is looked up in the kernel's current
    working directory. Nothing is returned; the parsed document is stored
    in the global name ``cfg``.

    Raises whatever ``open`` raises when the file cannot be read, and
    whatever the YAML parser raises on malformed input.
    """
    global cfg
    with open("configuration.yml", 'r') as ymlfile:
        # safe_load only constructs plain Python types; bare yaml.load can
        # instantiate arbitrary objects from tagged input and is deprecated
        # when called without an explicit Loader.
        cfg = yaml.safe_load(ymlfile)

In [3]:
def get_amazon_access_key_id():
    """Return the ``access_key_id`` entry of the ``amazon`` config section.

    The configuration is loaded lazily: ``read_yaml_config()`` is called
    only when no global named ``cfg`` exists yet.
    """
    config_loaded = 'cfg' in globals()
    if not config_loaded:
        read_yaml_config()
    amazon_section = cfg["amazon"]
    return amazon_section["access_key_id"]

In [4]:
def get_amazon_secret_access_key():
    """Return the ``secret_access_key`` entry of the ``amazon`` config section.

    The configuration is loaded lazily: ``read_yaml_config()`` is called
    only when no global named ``cfg`` exists yet.
    """
    config_loaded = 'cfg' in globals()
    if not config_loaded:
        read_yaml_config()
    amazon_section = cfg["amazon"]
    return amazon_section["secret_access_key"]

In [5]:
# Cassandra keyspace that the tables are written to; also reused as the
# Spark application name.
KEYSPACE = "sparkassandra"
APP_NAME = KEYSPACE
# Cassandra contact point. Can be overridden with the CASSANDRA_IP
# environment variable so the notebook runs against another cluster without
# editing this cell; the default is the original hard-coded address.
CASSANDRA_IP = os.environ.get("CASSANDRA_IP", "192.168.5.131")

## Init PySpark

In [67]:
# Package coordinates handed to PySpark via --packages, one per line so that
# versions are easy to review and bump.
_SPARK_PACKAGES = (
    "com.databricks:spark-csv_2.10:1.2.0",
    "com.amazonaws:aws-java-sdk:1.10.34",
    "org.apache.hadoop:hadoop-aws:2.6.0",
    "com.datastax.spark:spark-cassandra-connector_2.10:1.6.0-M1",
)
SUBMIT_ARGS = "--packages " + ",".join(_SPARK_PACKAGES) + " pyspark-shell"
# Must be in the environment before the SparkContext is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

In [7]:
# Point PySpark at the Python 2 conda interpreter so that the driver and the
# workers run the same Python version.
os.environ.update({'PYSPARK_PYTHON': '/opt/conda/envs/python2/bin/python'})

In [8]:
# Build the Spark configuration: application name plus the Cassandra host
# that the connector should contact.
conf = pyspark.SparkConf()
conf.setAppName(APP_NAME)
# Kept as the last expression: set() returns the SparkConf, which the cell echoes.
conf.set("spark.cassandra.connection.host", CASSANDRA_IP)

<pyspark.conf.SparkConf at 0x7f0c3fd28810>

In [9]:
# Init Spark Context (local mode, using every available core).
# getOrCreate makes this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises, whereas getOrCreate returns the
# context that is already active. NOTE: when a context already exists its
# original configuration is kept; restart the kernel to apply changes
# made to `conf`.
sc = pyspark.SparkContext.getOrCreate(conf.setMaster('local[*]'))

In [10]:
# Wrap the SparkContext in a SQLContext; all DataFrame reads in this
# notebook go through sql_ctx.
sql_ctx = SQLContext(sparkContext=sc)

In [11]:
# Configure the s3n:// filesystem on the context's Hadoop configuration:
# the implementation class first, then the credentials read from the YAML
# configuration. Values are produced lazily, in order, so each getter runs
# only when its property is about to be set.
hadoopConf = sc._jsc.hadoopConfiguration()
_s3n_settings = (
    ("fs.s3n.impl", lambda: "org.apache.hadoop.fs.s3native.NativeS3FileSystem"),
    ("fs.s3n.awsAccessKeyId", get_amazon_access_key_id),
    ("fs.s3n.awsSecretAccessKey", get_amazon_secret_access_key),
)
for _property_name, _value_supplier in _s3n_settings:
    hadoopConf.set(_property_name, _value_supplier())

## Create Cassandra Keyspace

https://github.com/dkoepke/cassandra-python-driver

## Create Table us_flights

## Create Table us_airports

## Load Airport file and store in Cassandra

In [13]:
# Read the airports CSV from S3 with the spark-csv data source, taking
# column names from the header row. Despite the "RDD" suffix, the result is
# used as a DataFrame (printSchema / write) in the following cells.
airportRDD = (
    sql_ctx.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load('s3n://sparkassandra/airports.csv')
)

In [14]:
# Show the column names and types as loaded from the CSV.
airportRDD.printSchema()

root
 |-- iata: string (nullable = true)
 |-- airport: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- long: string (nullable = true)



In [15]:
# Append the airport rows to the Cassandra table `airports` in KEYSPACE
# through the Spark Cassandra connector data source.
_airports_target = dict(table="airports", keyspace=KEYSPACE)
(
    airportRDD.write
    .format("org.apache.spark.sql.cassandra")
    .mode('append')
    .options(**_airports_target)
    .save()
)

## Load US Flights files and store in Cassandra

In [57]:
# Read the 1987 flights CSV from S3 with the spark-csv data source, taking
# column names from the header row. Despite the "RDD" suffix, the result is
# used as a DataFrame in the following cells.
flightsRDD = (
    sql_ctx.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load('s3n://sparkassandra/1987.csv')
)

In [58]:
# Add an "Id" column built by joining these source columns with "-".
# NOTE(review): presumably this Id serves as the Cassandra row key; if so,
# confirm that the combination is unique per flight (the carrier is not part
# of it), otherwise rows with equal Ids would overwrite each other on write.
_id_source_columns = ("Year", "Month", "DayofMonth", "DepTime", "FlightNum")
_flight_id = F.concat_ws("-", *(flightsRDD[_name] for _name in _id_source_columns))
flightsRDD = flightsRDD.withColumn("Id", _flight_id)

In [59]:
# Rename every column to its lower-case form, one rename at a time.
for original_name in flightsRDD.columns:
    lowered_name = original_name.lower()
    flightsRDD = flightsRDD.withColumnRenamed(original_name, lowered_name)

In [60]:
# Register the flights DataFrame under the temporary table name "us_flights"
# (no query against it is visible in this part of the notebook).
flightsRDD.registerTempTable("us_flights")

In [61]:
# Show the schema after the Id column was added and the names lower-cased.
flightsRDD.printSchema()

root
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- dayofmonth: string (nullable = true)
 |-- dayofweek: string (nullable = true)
 |-- deptime: string (nullable = true)
 |-- crsdeptime: string (nullable = true)
 |-- arrtime: string (nullable = true)
 |-- crsarrtime: string (nullable = true)
 |-- uniquecarrier: string (nullable = true)
 |-- flightnum: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- actualelapsedtime: string (nullable = true)
 |-- crselapsedtime: string (nullable = true)
 |-- airtime: string (nullable = true)
 |-- arrdelay: string (nullable = true)
 |-- depdelay: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- taxiin: string (nullable = true)
 |-- taxiout: string (nullable = true)
 |-- cancelled: string (nullable = true)
 |-- cancellationcode: string (nullable = true)
 |-- diverted: string (nullable = true)
 |-- carrierdelay:

In [63]:
# Append the flight rows to the Cassandra table `us_flights` in KEYSPACE
# through the Spark Cassandra connector data source.
_flights_target = dict(table="us_flights", keyspace=KEYSPACE)
(
    flightsRDD.write
    .format("org.apache.spark.sql.cassandra")
    .mode('append')
    .options(**_flights_target)
    .save()
)

In [66]:
#sql_ctx.read.format("org.apache.spark.sql.cassandra").options(table="us_flights", keyspace=KEYSPACE).load().show()

## Test (scratch cells, currently commented out)

In [65]:
#RES = sql_ctx.read.format('com.databricks.spark.csv').options(header='true', delimiter=';').load('s3n://sparkassandra/file1.csv')
#RES.show()

In [64]:
#RES.write\
#    .format("org.apache.spark.sql.cassandra")\
#    .mode('append')\
#    .options(table="bonhomme", keyspace=KEYSPACE)\
#    .save()