In [1]:
# coding=utf-8
# Author: Chu Chu, Ke Zhou
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions, types
import sys
import os
import json
import csv
import requests
import threading
import time
import datetime
from subprocess import PIPE, Popen

# Local file names for the CSV snapshots written by task() before upload.
STATION_INFO_CSV = 'station_info.csv'
STATION_STATUS_CSV = 'station_status.csv'
# HDFS directory prefixes (trailing slash included because callers build
# paths with plain string concatenation): DATA_PATH is where task() reads
# the uploaded CSVs from, OUTPUT_PATH is where it saves the joined result.
DATA_PATH='/user/chowkec/capitalbikeshare/data/'
OUTPUT_PATH='/user/chowkec/capitalbikeshare/output/'

# Module-level Spark session shared by every call to task(); it is created
# (or reused) as a side effect of importing/running this module.
# NOTE(review): 'captialbikeshare' looks like a typo of 'capitalbikeshare';
# left as-is because the application name is a runtime string.
spark = SparkSession.builder.appName('captialbikeshare').getOrCreate()

def write_to_csv(json_dict, csv_path):
    """Write a sequence of flat dict records to ``csv_path`` as CSV.

    The header row is the union of all record keys, in first-seen order, and
    every data row is aligned to that header. A record that lacks a key gets
    an empty cell for it, so records with differing key sets or key orders
    can no longer shift values into the wrong column.

    Args:
        json_dict: Iterable of dicts (e.g. the ``stations`` list of a GBFS
            feed). Despite the name it is a sequence of records, not a dict.
        csv_path: Path of the file to create or overwrite.

    An empty ``json_dict`` produces an empty file with no header row.
    """
    # Materialise once: the records are scanned for keys and then written.
    rows = list(json_dict)
    # dict.fromkeys de-duplicates while preserving first-seen key order.
    header = list(dict.fromkeys(key for row in rows for key in row))

    # newline='' is what the csv module requires to avoid doubled line
    # endings; the context manager closes the file even if a write fails.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        if not rows:
            return
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)
        csv_writer.writerows(
            [row.get(key, '') for key in header] for row in rows
        )


def get_json_text(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely, which would stall
            the polling loop on a hung connection.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never handed on as if it were JSON.
        requests.exceptions.RequestException: On connection errors/timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def upload_to_hdfs(file_name, path_name):
    """Copy a local file into HDFS with ``hadoop fs -put -f``.

    The destination is
    ``/user/chowkec/capitalbikeshare/data/<path_name>/<file_name>``; ``-f``
    overwrites an existing file at that location.

    Args:
        file_name: Local file to upload; also used as the HDFS file name.
        path_name: Sub-directory under the HDFS data directory.

    Raises:
        subprocess.CalledProcessError: If the ``hadoop`` command exits with
            a non-zero status, so a failed upload is not silently ignored.
    """
    # Function-scope imports keep this block self-contained.
    import posixpath
    from subprocess import CalledProcessError

    # HDFS paths always use '/', regardless of the local OS separator.
    hdfs_path = posixpath.join('/', 'user', 'chowkec',
                               'capitalbikeshare', 'data', path_name, file_name)
    command = ["hadoop", "fs", "-put", '-f', file_name, hdfs_path]
    put = Popen(command, stdin=PIPE, bufsize=-1)
    # communicate() closes the unused stdin pipe and waits for the process.
    put.communicate()
    if put.returncode != 0:
        raise CalledProcessError(put.returncode, command)


def task():
    """Run one refresh cycle of the station data pipeline.

    Steps, in order:
      1. Download the station_information and station_status feeds and write
         each feed's ``data.stations`` list to a local CSV file.
      2. Upload both CSV files to HDFS.
      3. Read them back with Spark and register them as temp views
         ``i`` (information) and ``s`` (status).
      4. Inner-join the views on ``station_id``, print the result with
         ``show()`` and overwrite the CSV output under
         ``OUTPUT_PATH + "stationstatus"`` as a single partition.
    """
    # 1. Fetch each feed and snapshot its station list locally.
    info_stations = json.loads(get_json_text(
        'https://gbfs.capitalbikeshare.com/gbfs/en/station_information.json'
    ))["data"]["stations"]
    write_to_csv(info_stations, STATION_INFO_CSV)

    status_stations = json.loads(get_json_text(
        'https://gbfs.capitalbikeshare.com/gbfs/en/station_status.json'
    ))["data"]["stations"]
    write_to_csv(status_stations, STATION_STATUS_CSV)

    # 2. Push both snapshots to their HDFS sub-directories.
    uploads = (
        (STATION_INFO_CSV, 'stationinfo'),
        (STATION_STATUS_CSV, 'stationstatus'),
    )
    for local_csv, hdfs_dir in uploads:
        upload_to_hdfs(local_csv, hdfs_dir)

    # 3. Expose the uploaded files to Spark SQL as views "i" and "s".
    spark.read.csv(DATA_PATH+'stationinfo', header=True) \
        .createOrReplaceTempView('i')
    spark.read.csv(DATA_PATH+'stationstatus', header=True) \
        .createOrReplaceTempView('s')

    # 4. Join static station info with the live availability counts.
    join_query = """
    select i.station_id, i.name, i.lon, i.lat,i.capacity, s.num_bikes_available, s.num_docks_available from i
    inner join s on
    i.station_id=s.station_id
    """
    joined = spark.sql(join_query)
    joined.show()

    # repartition(1) so the result is saved as a single CSV part file.
    single_part_writer = joined.repartition(1).write.format('csv').mode("overwrite")
    single_part_writer.save(OUTPUT_PATH+"stationstatus", header=True)



def main():
    """Poll forever: run task(), print a local timestamp, then wait 10 s.

    The loop has no exit condition; it ends only when interrupted or when
    task() raises.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    pause_seconds = 10
    while True:
        task()
        # time.strftime formats the current local time when no tuple is given.
        print(time.strftime(timestamp_format))
        time.sleep(pause_seconds)


# Start the polling loop only when executed as a script, not when imported.
if __name__ == "__main__":
    main()



+----------+--------------------+------------------+-----------------+--------+-------------------+-------------------+
|station_id|                name|               lon|              lat|capacity|num_bikes_available|num_docks_available|
+----------+--------------------+------------------+-----------------+--------+-------------------+-------------------+
|         1| Eads St & 15th St S|         -77.05323|        38.858971|      15|                  5|                 10|
|         2| 18th St & S Eads St|         -77.05332|         38.85725|      11|                  3|                  8|
|         3|Crystal Dr & 20th...|        -77.049232|        38.856425|      17|                  9|                  8|
|         4|Crystal Dr & 15th...|        -77.049593|         38.86017|      11|                 11|                  0|
|         5|Aurora Hills Cmty...|         -77.05949|        38.857866|      11|                 10|                  0|
|         6|Pentagon City Met...|       

2019-12-05 01:41:45


KeyboardInterrupt: 