In [97]:

import os
import sys

# Tell PySpark where the local Spark installation lives and where its bundled
# Python libraries are (PYLIB is derived from SPARK_HOME).
os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = f'{os.environ["SPARK_HOME"]}/python/lib'

# Python interpreters PySpark should launch.
# NOTE(review): the two paths differ ("python3.6" vs "python3"); presumably both
# resolve to the same Python version on this machine -- confirm, since PySpark
# expects matching versions on the driver and the workers.
os.environ.update(
    PYSPARK_PYTHON="/usr/bin/python3.6",
    PYSPARK_DRIVER_PYTHON="/usr/bin/python3",
)

# Put the bundled archives at the very front of the import path:
# pyspark.zip first, immediately followed by the py4j archive.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

# Extra JVM packages (spark-xml and spark-avro) requested when the PySpark
# shell/session is launched.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'


In [211]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import json
import s3fs

In [212]:
# Create the notebook's Spark session (or reuse one that is already running)
# and keep a handle to its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Loading data from s3")
    .getOrCreate()
)
sc = spark.sparkContext

In [213]:
# File-system style handle for S3, constructed with default arguments.
s3 = s3fs.S3FileSystem()

# Listing the empty path shows the top-level entries, i.e. the available buckets.
bucket_listing = s3.ls("")
print(bucket_listing)

['kafka-project']


In [214]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# Defining the schema for my Spark DataFrame
def get_schema():
    """Build the Spark schema used for the flattened weather records.

    Returns:
        StructType: one nullable ``StructField`` per entry of ``column_types``
        below, in exactly that order.
    """
    # (column name, Spark type class) pairs; the order here is the column order.
    column_types = (
        ("name", StringType),
        ("region", StringType),
        ("country", StringType),
        ("lat", DoubleType),
        ("lon", DoubleType),
        ("tz_id", StringType),
        ("localtime_epoch", IntegerType),
        ("localtime", StringType),
        ("last_updated_epoch", IntegerType),
        ("last_updated", StringType),
        ("temp_c", DoubleType),
        ("temp_f", DoubleType),
        ("is_day", IntegerType),
        ("wind_mph", DoubleType),
        ("wind_kph", DoubleType),
        ("wind_degree", IntegerType),
        ("wind_dir", StringType),
        ("pressure_mb", DoubleType),
        ("pressure_in", DoubleType),
        ("precip_mm", DoubleType),
        ("precip_in", DoubleType),
        ("humidity", IntegerType),
        ("cloud", IntegerType),
        ("feelslike_c", DoubleType),
        ("feelslike_f", DoubleType),
        ("vis_km", DoubleType),
        ("vis_miles", DoubleType),
        ("uv", DoubleType),
        ("gust_mph", DoubleType),
        ("gust_kph", DoubleType),
    )
    # Every field is declared nullable (third argument True).
    fields = [
        StructField(column, spark_type(), True)
        for column, spark_type in column_types
    ]
    return StructType(fields)

In [215]:


def _flatten_weather_record(payload):
    """Merge the ``location`` and ``current`` sections of one payload into a flat dict.

    The nested ``condition`` entry of ``current`` is discarded (if present).
    On a key collision the value from ``current`` wins. The input is not mutated.

    Raises:
        KeyError: if ``payload`` has no ``"location"`` or no ``"current"`` key.
    """
    current = dict(payload["current"])
    # "condition" is dropped before merging; tolerate payloads that lack it.
    current.pop("condition", None)
    record = dict(payload["location"])
    record.update(current)
    return record


def get_processed_data(cities):
    """Load every JSON file stored for each city and return one DataFrame per city.

    For each city, all objects listed under
    ``kafka-project/city_wise_all_data/<city>`` are read through the module-level
    ``s3`` filesystem, flattened, and turned into a single Spark DataFrame using
    the schema from ``get_schema()``.

    Args:
        cities: iterable of city folder names.

    Returns:
        list: DataFrames in the same order as ``cities``. A city with no files
        yields an empty DataFrame that still carries the schema.

    Raises:
        KeyError: if a file's JSON lacks the ``"location"`` or ``"current"`` key.
    """
    bucket_name = "kafka-project"
    folder_name = "city_wise_all_data"
    # The schema is the same for every city, so build it only once.
    schema = get_schema()

    list_df = []
    for city in cities:
        records = []
        for key in s3.ls(f"{bucket_name}/{folder_name}/{city}"):
            with s3.open(key, "r") as handle:
                records.append(_flatten_weather_record(json.load(handle)))
        # One createDataFrame call per city instead of a one-row DataFrame and
        # a union() per file, which made the query plan grow with every file.
        list_df.append(spark.createDataFrame(records, schema=schema))
    return list_df
    
            
def put_data_into_hdfs(cities, output_root="hdfs:///user/talentum/weather_data"):
    """Write each city's processed weather data as CSV under ``output_root``.

    For every city the DataFrame returned by ``get_processed_data`` has its
    timestamp/position columns dropped, ``name`` renamed to ``city`` and
    ``tz_id`` renamed to ``timezone``. It is then coalesced to one partition and
    written with a header to ``<output_root>/<city>``, overwriting existing output.

    Args:
        cities: iterable of city folder names.
        output_root: destination directory prefix; defaults to the HDFS
            location used by this notebook.

    Returns:
        None. A confirmation line is printed after the loop finishes.
    """
    # `cities` is consumed twice (loading, then pairing with the results), so
    # materialise it first; a one-shot iterator would otherwise be exhausted
    # after loading and nothing would be written.
    cities = list(cities)
    dropped_columns = ["last_updated_epoch", "last_updated", "localtime", "localtime_epoch", "is_day", "lat", "lon"]

    list_df = get_processed_data(cities)

    for df, city in zip(list_df, cities):
        export_df = (
            df.drop(*dropped_columns)
            .withColumnRenamed("name", "city")
            .withColumnRenamed("tz_id", "timezone")
        )
        export_df.coalesce(1).write.csv(f"{output_root}/{city}", mode="overwrite", header=True)
    print("Data Uploaded Succesfully into HDFS.....")

In [216]:
# Run the S3 -> HDFS export for the cities of interest.
target_cities = ['pune', 'indore', 'mumbai']
put_data_into_hdfs(target_cities)

Data Uploaded Succesfully into HDFS.....
