In [1]:
# initialize spark
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Build the session used by the rest of the notebook; getOrCreate() hands back
# an already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('pro')
    .getOrCreate()
)

In [2]:
# Echo the session object as the cell output to confirm it was created.
spark

In [3]:
import pandas as pd

In [4]:
# source:
# http://apache-spark-developers-list.1001551.n3.nabble.com/Faster-and-Lower-memory-implementation-toPandas-td22869.html
def _map_to_pandas(rdds):
    """ Needs to be here due to pickling issues """
    return [pd.DataFrame(list(rdds))]

def toPandas(df, n_partitions=None):
    """
    Returns the contents of `df` as a local `pandas.DataFrame`.

    Each partition is converted to a pandas frame by `_map_to_pandas`, the
    per-partition frames are collected, and they are concatenated into one
    frame. The DataFrame is repartitioned first if `n_partitions` is passed.

    The result carries a fresh 0..n-1 index; per-partition indices are not
    kept, because every partition frame starts counting at 0 and keeping them
    would produce duplicate labels. If `df` has no rows, an empty frame with
    `df`'s column names is returned.

    :param df:              pyspark.sql.DataFrame
    :param n_partitions:    int or None
    :return:                pandas.DataFrame
    """
    if n_partitions is not None:
        df = df.repartition(n_partitions)

    column_names = df.columns
    partition_frames = df.rdd.mapPartitions(_map_to_pandas).collect()

    # Empty partitions yield column-less frames; drop them so they cannot
    # interfere with the concat or with the column assignment below.
    partition_frames = [frame for frame in partition_frames if not frame.empty]
    if not partition_frames:
        # Nothing to concatenate: assigning column names to a zero-column
        # frame would raise a length-mismatch ValueError.
        return pd.DataFrame(columns=column_names)

    df_pand = pd.concat(partition_frames, ignore_index=True)
    df_pand.columns = column_names
    return df_pand

In [5]:
# read csv files from s3, convert to pandas df and save as local csv
def saveToCsv(stockname, source_prefix='s3://hzhang502/', output_dir='.'):
    """
    Read one CSV dataset with Spark, convert it to pandas and save it locally.

    The dataset is loaded from `source_prefix + stockname` with a header row
    and schema inference enabled, converted with `toPandas`, and written to
    `<output_dir>/<stockname>.csv` without the index column.

    :param stockname:       str, name of the dataset under `source_prefix`;
                            also used as the local file's base name
    :param source_prefix:   str, location prefix prepended to `stockname`
                            (defaults to the bucket this notebook was written for)
    :param output_dir:      str, existing directory that receives the CSV file
                            (defaults to the current working directory)
    :return:                None
    """
    import os  # local import so the function does not rely on another cell

    path = source_prefix + stockname
    sparkdf = (
        spark.read.format('csv')
        .option('inferSchema', 'true')
        .option('header', 'true')
        .load(path)
    )
    pddf = toPandas(sparkdf)
    csvname = os.path.join(output_dir, stockname + '.csv')
    pddf.to_csv(csvname, index=False)

In [7]:
# Dataset names to export, one local CSV each (presumably stock ticker symbols).
stocks = [
    'AMZ', 'EBA', 'NFC', 'FB2A', 'MSF',
    'TWR', 'DBK', 'DAI', 'CBK', 'ALV',
    'BMW', 'AIR', 'VOW3', 'SIE', 'PHIA',
    'ADS', 'CON', 'BAS', 'BAYN', '1COV',
]

In [8]:
# Export every listed dataset in turn; each call writes one local CSV file.
for s in stocks:
    saveToCsv(s)