In [1]:
# imports
from ftplib import FTP
import gzip
import string
import os
import csv
from collections import Counter
import math

In [2]:
# Local storage: '<home>/ZurichWeather/data/'. Both directory levels are created on the
# first run and silently reused afterwards.
homePath = os.path.expanduser("~")
# The trailing '/' is kept on purpose: later cells build file names by plain string
# concatenation onto localPath.
localPath = homePath + '/ZurichWeather/data/'
# exist_ok=True only tolerates "directory already exists"; any other failure
# (permissions, a regular file in the way, ...) is raised instead of being hidden.
os.makedirs(localPath, exist_ok=True)

In [3]:
# Open an FTP session (login() is called without credentials) and remember the
# remote base directory that the download cell prefixes to every year folder.
ftpOject = FTP()
ftpOject.connect('ftp.ncdc.noaa.gov')
ftpOject.login()
filePath = '/pub/data/gsod/'

In [4]:
# Pull down one compressed weather file per year (2015 back to 1973), then convert each
# into a CSV file with a fixed header row. The '.op.gz' download is kept next to the CSV.
header = ['station', 'wban', 'yearmoda', 'temp', 'tempNum', 'dewp', 'dewpNum', 'slp', 'slpNum', 'stp', 'stpNum', 'visib', 'visibNum', 'wdsp', 'wdspNum', 'mxspd', 'gust', 'max', 'min', 'prcp', 'sndp', 'frshtt']
for year in range(2015,1972,-1):
    filePathFull = filePath + str(year) + '/'
    fileName = '066700-99999-' + str(year) + '.op.gz'
    fileNameFull = localPath + fileName
    ftpCmd = 'RETR ' + filePathFull + fileName
    # Close the download before it is re-opened for reading, so every byte is
    # guaranteed to be flushed to disk first.
    with open(fileNameFull, 'wb') as downloadFile:
        ftpOject.retrbinary(ftpCmd, downloadFile.write)
    # '...op.gz' -> '...op.csv'
    fileNameText = fileNameFull[:-2] + 'csv'
    # newline='' is what the csv module expects for files it writes (otherwise extra
    # blank lines appear on some platforms); utf-8 matches how the files are read back.
    with gzip.open(fileNameFull, 'rb') as gzFile, \
            open(fileNameText, 'w', newline='', encoding='utf-8') as txtFile:
        # The file's own first line is discarded and replaced by 'header' above.
        gzFile.readline()
        CSVWriter = csv.writer(txtFile, delimiter=',')
        CSVWriter.writerow(header)
        for line in gzFile:
            # split() with no argument already collapses any run of whitespace,
            # so no manual squeezing of double spaces is needed.
            CSVWriter.writerow(line.decode("utf-8").split())

In [5]:
# Read every CSV file in localPath into the list 'weatherData' (one dict per data row).
# The 'yearmoda' field is replaced by separate 'year', 'month' and 'day' string fields
# taken from its characters 0-3, 4-5 and 6-7.
weatherData = list()
# Sorting by name makes the row order reproducible (os.scandir order is arbitrary)
# and also exhausts the scandir iterator so it does not stay open.
for CSVFile in sorted(os.scandir(localPath), key=lambda entry: entry.name):
    if CSVFile.name.startswith('.') or not CSVFile.is_file() or not CSVFile.name.endswith('.csv'):
        continue
    with open(CSVFile.path, encoding='utf-8', mode='r', newline='') as inputCSVFile:
        # DictReader consumes the header row itself, so every row yielded here is data.
        for inputRow in csv.DictReader(inputCSVFile):
            yearmoda = inputRow.pop('yearmoda')
            inputRow['year'] = yearmoda[0:4]
            inputRow['month'] = yearmoda[4:6]
            inputRow['day'] = yearmoda[6:8]
            weatherData.append(inputRow)

In [6]:
def selectElements(inList,key1,value1,key2):
    """Return the sorted 'key2' values of every item whose 'key1' equals 'value1'.

    inList  -- iterable of dict-like records
    key1    -- field used for filtering
    value1  -- value that 'key1' must equal for a record to be kept
    key2    -- field whose values are collected

    Returns a new list in ascending order (empty when nothing matches).
    """
    matches = [record[key2] for record in inList if record[key1] == value1]
    matches.sort()
    return matches

In [7]:
def selectBinnedElements(inList,key1,value1,key2,binSize):
    """Return the sorted, binned 'key2' values of every item whose 'key1' equals 'value1'.

    Each selected value is converted with float(), divided by 'binSize', rounded to the
    nearest integer and multiplied by 'binSize' again, i.e. snapped to a multiple of
    'binSize'.

    inList  -- iterable of dict-like records
    key1    -- field used for filtering
    value1  -- value that 'key1' must equal for a record to be kept
    key2    -- field holding the values to bin (anything float() accepts)
    binSize -- non-zero bin width

    Returns a new list of binned values in ascending order.
    """
    # The raw values are not sorted before binning: the final sort determines the
    # result, and sorting raw values fails when strings and numbers are mixed.
    return sorted(
        int(round(float(listItem[key2]) / binSize, 0)) * binSize
        for listItem in inList
        if listItem[key1] == value1
    )

In [8]:
def selectWeightedElements(inList,key1,value1,key2,binSize):
    """Return '[bin, frequency]' pairs for the items of 'inList' whose 'key1' equals 'value1'.

    'bin' is the binned value of 'key2' (see selectBinnedElements) and 'frequency' is the
    number of selected items that fall into that bin.

    Returns a list of two-element lists, one per distinct bin, in ascending bin order
    (empty when nothing matches).
    """
    midList = selectBinnedElements(inList,key1,value1,key2,binSize)
    # Counter tallies all bins in a single pass instead of re-scanning the list once
    # per distinct bin; midList is sorted, so the pairs come out in ascending order.
    return [[binValue, frequency] for binValue, frequency in Counter(midList).items()]

In [9]:
def entropy(inList):
    """Return the entropy, in bits, of a list of '[bin, frequency]' pairs.

    Only the second element of each pair (the frequency) is used. Each frequency is
    divided by the sum of all frequencies to obtain a probability p, and the result is
    the sum of -p * log2(p).

    inList -- sequence of two-element sequences '[bin, frequency]'

    Returns 0.0 for an empty list or when all frequencies are zero.
    """
    total = 0.
    for pair in inList:
        total = total + pair[1]
    if total == 0:
        # No observations at all: nothing to be uncertain about.
        return 0.
    info = 0.
    for pair in inList:
        if pair[1] == 0:
            # By convention 0 * log(0) contributes nothing; log2(0) itself would raise.
            continue
        p = pair[1] / total
        info = info - p * math.log2(p)
    return info

In [10]:
def avgEntropy(inList,key1,valueRange,key2,binSize):
    """Return the mean entropy of 'key2' over the groups selected by 'key1'.

    For every value in 'valueRange' the value is turned into a string, left-padded with
    zeros to at least two characters (1 -> '01'), and used to select the items of
    'inList' whose 'key1' equals it. The entropy of the binned 'key2' values of each
    group is computed and the results are averaged.

    The mean is unweighted: every value in 'valueRange' counts equally, however many
    items match it, and a value with no matching items contributes an entropy of 0.

    inList     -- iterable of dict-like records
    key1       -- field that defines the groups
    valueRange -- sized collection of group values (len() is taken of it)
    key2       -- field whose binned values the entropy is computed on
    binSize    -- bin width passed on to the binning step
    """
    groupEntropies = [
        entropy(selectWeightedElements(inList, key1, str(groupValue).zfill(2), key2, binSize))
        for groupValue in valueRange
    ]
    running = 0.
    for groupEntropy in groupEntropies:
        running += groupEntropy
    return running / len(valueRange)

In [11]:
def binnedElements(inList,key2,binSize):
    """Return the sorted, binned 'key2' values of every item in 'inList'.

    Each value is converted with float(), divided by 'binSize', rounded to the nearest
    integer and multiplied by 'binSize' again, i.e. snapped to a multiple of 'binSize'.

    inList  -- iterable of dict-like records
    key2    -- field holding the values to bin (anything float() accepts)
    binSize -- non-zero bin width

    Returns a new list of binned values in ascending order.
    """
    # The raw values are not sorted before binning: the final sort determines the
    # result, and sorting raw values fails when strings and numbers are mixed.
    return sorted(
        int(round(float(listItem[key2]) / binSize, 0)) * binSize
        for listItem in inList
    )

In [12]:
def weightedElements(inList,key2,binSize):
    """Return '[bin, frequency]' pairs for all items of 'inList'.

    'bin' is the binned value of 'key2' (see binnedElements) and 'frequency' is the
    number of items that fall into that bin.

    Returns a list of two-element lists, one per distinct bin, in ascending bin order
    (empty when 'inList' is empty).
    """
    midList = binnedElements(inList,key2,binSize)
    # Counter tallies all bins in a single pass instead of re-scanning the list once
    # per distinct bin; midList is sorted, so the pairs come out in ascending order.
    return [[binValue, frequency] for binValue, frequency in Counter(midList).items()]

In [13]:
def totalEntropy(inList,key2,binSize):
    """Return the entropy of the binned 'key2' values over all items of 'inList'.

    The values are grouped into bins of width 'binSize', the bins are counted, and the
    entropy of the resulting '[bin, frequency]' pairs is returned.
    """
    binFrequencies = weightedElements(inList, key2, binSize)
    return entropy(binFrequencies)

In [14]:
def avgInfo(inList,key1,valueRange,key2,binSize):
    """Return the average information that 'key1' gives about 'key2'.

    This is the total entropy of the binned 'key2' values minus their average entropy
    once the items are grouped by the values of 'key1' listed in 'valueRange'.

    inList     -- iterable of dict-like records
    key1       -- field that defines the groups
    valueRange -- sized collection of group values
    key2       -- field whose binned values the entropies are computed on
    binSize    -- bin width used for 'key2'
    """
    overallEntropy = totalEntropy(inList, key2, binSize)
    groupedEntropy = avgEntropy(inList, key1, valueRange, key2, binSize)
    return overallEntropy - groupedEntropy

In [15]:
# Knowing only the day of the month (1-31) gives about 0.1 bit of information
# about the average daily temperature (1-unit bins).
avgInfo(inList=weatherData, key1='day', valueRange=range(1, 32), key2='temp', binSize=1)

0.09330545022148495

In [16]:
# Knowing only the month of the year (1-12) gives about 1.0 bit of information
# about the average daily temperature (1-unit bins).
avgInfo(inList=weatherData, key1='month', valueRange=range(1, 13), key2='temp', binSize=1)

1.020457267029256

In [17]:
# Knowing only the year (1973-2015) gives about 0.2 bit of information
# about the average daily temperature (1-unit bins).
avgInfo(inList=weatherData, key1='year', valueRange=range(1973, 2016), key2='temp', binSize=1)

0.18960866217339767