In [52]:
import json
from pyspark.sql import Row
from datetime import datetime

##function to get default string value to avoid exception of casting empty string to float
def getString(v):
    """Return ``v`` unchanged, or the string '0' when ``v`` holds no value.

    The callers in this notebook pass the result straight to ``float()``;
    an empty value would make that conversion raise, so it is replaced by
    '0' first.

    Treated as "no value":
      * ``None`` (what ``json.loads`` produces for a JSON ``null``)
      * the empty string
      * a string made only of whitespace

    Anything else (numeric strings, numbers, ...) is returned as-is.

    NOTE: substituting 0 for a missing reading is a default, not a
    measurement -- it still takes part in any aggregate computed later.
    """
    if v is None:
        return '0'
    ## hasattr keeps this working for every string type and leaves
    ## non-string values (e.g. numbers already parsed by json) untouched
    if hasattr(v,'strip') and v.strip()=='':
        return '0'
    return v

## function to get Row objects for each json key value pair as timestamp and temperature
def createTRows(jsn):
    """Turn one parsed JSON object into a list of temperature rows.

    ``jsn`` maps timestamp strings in the format ``%Y-%m-%dT%H:%M:%S`` to
    temperature values. For every key a ``Row`` with the fields
    ``timestamp`` (parsed into a ``datetime``) and ``temperature`` (a float,
    after passing the raw value through ``getString``) is produced.

    Returns the rows as a list, in the iteration order of ``jsn``.
    """
    stampFormat='%Y-%m-%dT%H:%M:%S'
    return [
        Row(timestamp=datetime.strptime(stamp,stampFormat),temperature=float(getString(jsn[stamp])))
        for stamp in jsn
    ]

## function to get Row objects for each json key value pair as timestamp and humidity
def createHRows(jsn):
    """Turn one parsed JSON object into a list of humidity rows.

    ``jsn`` maps timestamp strings in the format ``%Y-%m-%dT%H:%M:%S`` to
    humidity values. For every key a ``Row`` with the fields ``timestamp``
    (parsed into a ``datetime``) and ``humidity`` (a float, after passing
    the raw value through ``getString``) is produced.

    Returns the rows as a list, in the iteration order of ``jsn``.
    """
    stampFormat='%Y-%m-%dT%H:%M:%S'
    return [
        Row(timestamp=datetime.strptime(stamp,stampFormat),humidity=float(getString(jsn[stamp])))
        for stamp in jsn
    ]

## Load the raw readings from the files tempm.txt and hum.txt.
## Each line of a file is a valid JSON object in the format
## {"timestamp1": "value1", "timestamp2": "value2", "timestampN": "valueN"}
## so every line is parsed on its own, giving one RDD of parsed objects per file.
temperatureRdd=sc.textFile("file:///C:/datapath/spark files/tempm.txt")
humidityRdd=sc.textFile("file:///C:/datapath/spark files/hum.txt")
tempJsonRdd=temperatureRdd.map(json.loads)
humidityJsonRdd=humidityRdd.map(json.loads)

## Flatten every timestamp:value pair into its own row, giving one DF with the
## columns timestamp, temperature and one DF with the columns timestamp, humidity
temperatureDF=tempJsonRdd.flatMap(createTRows).toDF()
humidityDF=humidityJsonRdd.flatMap(createHRows).toDF()

## Inner join of the two DFs on their shared timestamp column.
## The final DF has the columns timestamp, temperature, humidity
joinedDF=temperatureDF.join(humidityDF,on='timestamp')
## persist the joined DF, since the queries in the following cells all read from it
joinedDF.persist()

DataFrame[timestamp: timestamp, temperature: double, humidity: double]

In [56]:
## On how many days did the temperature reach or exceed 20 °C?
from pyspark.sql.functions import to_date
## Filter on temperature first and only then project the calendar date, so the
## filter does not refer to a column that the select has already dropped.
## distinct() collapses the many readings of one day into a single row.
query1DF=joinedDF.where(joinedDF.temperature>=20).select(to_date(joinedDF.timestamp)).distinct()
## list the qualifying days (show() prints at most 20 rows by default)
query1DF.show()
## the question asks for a number, so report the count explicitly instead of
## leaving the reader to count the rows of the table above
print(query1DF.count())

+------------------+
|to_date(timestamp)|
+------------------+
|        2014-06-03|
|        2014-04-29|
|        2014-05-24|
|        2014-06-05|
|        2014-05-30|
|        2014-05-25|
|        2014-05-26|
|        2014-05-21|
|        2014-06-02|
|        2014-05-17|
|        2014-05-22|
|        2014-06-04|
|        2014-04-28|
|        2014-05-23|
|        2014-06-01|
|        2014-06-07|
|        2014-06-08|
+------------------+



In [55]:
## Which were the 20 hottest days?
## A day is ranked by the average of all temperature readings taken on that day.
## NOTE: empty readings were loaded as 0.0 (see getString), so they take part
## in these averages.
## to_date is imported here as well so that this cell does not depend on
## another cell having been executed first.
from pyspark.sql.functions import desc,to_date
## 'avg(temperature)' is the column name Spark generates for avg('temperature')
query2DF=joinedDF.groupBy(to_date(joinedDF.timestamp)).avg('temperature').orderBy(desc('avg(temperature)'))
query2DF.show(20)

+------------------+------------------+
|to_date(timestamp)|  avg(temperature)|
+------------------+------------------+
|        2014-05-22|              20.0|
|        2014-05-21| 17.63888888888889|
|        2014-06-08| 17.63888888888889|
|        2014-05-23|16.257142857142856|
|        2014-06-02|16.056338028169016|
|        2014-05-24|15.833333333333334|
|        2014-06-07|15.666666666666666|
|        2014-05-25|15.619718309859154|
|        2014-06-03| 15.61111111111111|
|        2014-06-04|15.485714285714286|
|        2014-06-05|15.430555555555555|
|        2014-05-30|15.112676056338028|
|        2014-06-01|15.027777777777779|
|        2014-05-27| 14.48611111111111|
|        2014-05-26| 14.46376811594203|
|        2014-05-20|14.083333333333334|
|        2014-05-19|13.808823529411764|
|        2014-05-18|             13.75|
|        2014-04-22|13.722222222222221|
|        2014-05-17|13.541666666666666|
+------------------+------------------+
only showing top 20 rows



In [54]:
## Which month had the highest standard deviation in temperature and in humidity values?
## desc is imported here as well so that this cell does not depend on another
## cell having been executed first.
from pyspark.sql.functions import desc,stddev,month
## NOTE(review): month() yields only the month number, so the same month of
## different years would be merged into one group -- confirm the data covers a single year.
## month with the maximum (sample) standard deviation of temperature
query3TDF=joinedDF.groupBy(month(joinedDF.timestamp)).agg(stddev('temperature')).orderBy(desc('stddev_samp(temperature)'))
query3TDF.show(1)
## month with the maximum (sample) standard deviation of humidity
query3HDF=joinedDF.groupBy(month(joinedDF.timestamp)).agg(stddev('humidity')).orderBy(desc('stddev_samp(humidity)'))
query3HDF.show(1)

+----------------+------------------------+
|month(timestamp)|stddev_samp(temperature)|
+----------------+------------------------+
|               5|       4.426954778558993|
+----------------+------------------------+
only showing top 1 row

+----------------+---------------------+
|month(timestamp)|stddev_samp(humidity)|
+----------------+---------------------+
|               4|    17.83959640684249|
+----------------+---------------------+
only showing top 1 row



In [53]:
## Import the functions module under an alias instead of importing max and min
## by name: the bare names would replace Python's builtin max()/min() for every
## cell executed afterwards in this kernel.
from pyspark.sql import functions as F
## Calculating the index DI for each reading (row):
##   DI = T - 0.55 * (1 - 0.01 * H) * (T - 14.5)
## with T = temperature and H = humidity.
## NOTE(review): this formula matches what is usually called Thom's discomfort
## index; the existing name distortionDF is kept so other cells keep working -- confirm.
distortionDF=joinedDF.withColumn('DI',joinedDF.temperature-0.55*(1-0.01*joinedDF.humidity)*(joinedDF.temperature-14.5))
## maximum and minimum DI over all readings, returned as a one-row DF
query4DF=distortionDF.select(F.max('DI'),F.min('DI'))
query4DF.show()

+-------+-------------------+
|max(DI)|            min(DI)|
+-------+-------------------+
|22.1125|-2.3262500000000004|
+-------+-------------------+

