In [1]:
# imports
import ftplib
import gzip
import string
import os
import csv
import math

In [2]:
# local path (make directories 'ZurichWeather/data' in home directory, if not already existing)
homePath = os.path.expanduser("~")
localPath = homePath + '/ZurichWeather/'
dir = os.path.dirname(localPath)
try:
    os.stat(dir)
except:
    os.mkdir(dir)
localPath = localPath + 'data/'
dir = os.path.dirname(localPath)
try:
    os.stat(dir)
except:
    os.mkdir(dir)

In [3]:
# ftp login
ftpOject = ftplib.FTP('ftp.ncdc.noaa.gov')
ftpOject.login()
filePath = '/pub/data/gsod/'

In [4]:
# pull down compressed weather data, uncompress, and write into CSV files
for year in range(2015,1954,-1):
    filePathFull = filePath + str(year) +'/'
    fileName = '066700-99999-' + str(year) + '.op.gz'
    fileNameFull = localPath + fileName
    ftpCmd = 'RETR ' + filePathFull + fileName
    try:
        ftpOject.retrbinary(ftpCmd, open(fileNameFull, 'wb').write)
        gzFile = gzip.open(fileNameFull, 'rb')
        header = gzFile.readline()
        fileNameText = fileNameFull[:len(fileNameFull)-2] + 'csv'
        txtFile = open(fileNameText,'w')
        header = ['station', 'wban', 'yearmoda', 'temp', 'tempNum', 'dewp', 'dewpNum', 'slp', 'slpNum', 'stp', 'stpNum', 'visib', 'visibNum', 'wdsp', 'wdspNum', 'mxspd', 'gust', 'max', 'min', 'prcp', 'sndp', 'frshtt']
        CSVWriter = csv.writer(txtFile, delimiter=',')
        CSVWriter.writerow(header)
        for line in gzFile:
            lineString = line.decode("utf-8")
            shortString = lineString.replace('  ',' ')
            shortString = shortString.replace('  ',' ')
            CSVWriter.writerow(shortString.split())
    except ftplib.all_errors as e:
        errorcode_string = str(e).split(None, 1)
        print(errorcode_string)

['550', '/pub/data/gsod/1972/066700-99999-1972.op.gz: No such file or directory']
['550', '/pub/data/gsod/1971/066700-99999-1971.op.gz: No such file or directory']
['550', '/pub/data/gsod/1970/066700-99999-1970.op.gz: No such file or directory']


In [5]:
# read contents of all CSV files into list, 'weatherData', containing all weather data
weatherData = list()
for CSVFile in os.scandir(localPath):
    if not CSVFile.name.startswith('.') and CSVFile.is_file() and CSVFile.name.endswith('.csv'):
        inputCSVFile = open(localPath + CSVFile.name,encoding='utf-8', mode='r')
        CSVReader = csv.DictReader(inputCSVFile)
        for inputRow in CSVReader:
            year = inputRow['yearmoda'][0:4]
            month = inputRow['yearmoda'][4:6]
            day = inputRow['yearmoda'][6:8]
            del inputRow['yearmoda']
            inputRow['year'] = str(year)
            inputRow['month'] = str(month)
            inputRow['day'] = str(day)
            if CSVReader.line_num > 1:
                weatherData.append(inputRow)

In [6]:
# select values of 'key2' from inList, when 'key1' has value 'value1', and sort output list
def selectElements(inList,key1,value1,key2):
    outList = list()
    for listItem in inList:
        if listItem[key1] == value1:
            outList.append(listItem[key2])
    return sorted(outList)

In [7]:
# select all values of 'key2', from list 'inList', and group into bins of size, 'binSize', centered on 0,
# when 'key1' has value 'value1'
def selectBinnedElements(inList,key1,value1,key2,binSize):
    outList = list()
    for listItem in inList:
        if listItem[key1] == value1:
            outList.append(listItem[key2])
    outList = sorted(outList)
    binnedElements = [int(round(float(outElement)/binSize,0)) * binSize for outElement in outList]
    return sorted(binnedElements)

In [8]:
# produce list of '[bin, frequency]' pairs, from list 'inList', when 'key1' has value 'value1', 
# where 'bin' is the binned value of 'key2',
# and 'frequency' is the number of times the bin is observed in the list, 'inList'
def selectWeightedElements(inList,key1,value1,key2,binSize):
    midList = selectBinnedElements(inList,key1,value1,key2,binSize)
    outList = [[x,midList.count(x)] for x in set(midList)]
    return outList

In [9]:
# calculate the entropy of list, 'inList', of '[bin, frequency]' pairs
def entropy(inList):
    sum = 0.
    for pair in inList:
        sum = sum + pair[1]
    info = 0.
    for pair in inList:
        p = pair[1] / sum
        info = info - p * math.log(p,2)
    return info

In [10]:
# calculate average entropy when 'key1' takes on values in range 'valueRange', 
# and values of 'key2' are binned by 'binSize'
def avgEntropy(inList,key1,valueRange,key2,binSize):
    avg = 0.
    count = 0
    for value in valueRange:
        valueStr = str(value).zfill(2)
        entropyValue = entropy(selectWeightedElements(inList,key1,valueStr,key2,binSize))
        avg = avg + entropyValue
        if entropyValue > 0:
            count = count + 1
    return avg / count

In [11]:
# select all values of 'key2', from list 'inList', and group into bins of size, 'binSize'
def binnedElements(inList,key2,binSize):
    outList = list()
    for listItem in inList:
        outList.append(listItem[key2])
    outList = sorted(outList)
    binnedElements = [int(round(float(outElement)/binSize,0)) * binSize for outElement in outList]
    return sorted(binnedElements)

In [12]:
# produce list of '[bin, frequency]' pairs, from list 'inList', where 'bin' is the binned value of 'key2',
# and 'frequency' is the number of times the bin is observed in the list, 'inList'
def weightedElements(inList,key2,binSize):
    midList = binnedElements(inList,key2,binSize)
    outList = [[x,midList.count(x)] for x in set(midList)]
    return outList

In [13]:
# calculate the total entropy in list, 'inList', when values of 'key2' are binned into bins of size, 'binSize'
def totalEntropy(inList,key2,binSize):
    return entropy(weightedElements(inList,key2,binSize))

In [14]:
# calculate the average information, which is the difference between total entropy and average entropy, 
# when 'key1' takes on values in range 'valueRange', and values of 'key2' are binned by 'binSize'
def avgInfo(inList,key1,valueRange,key2,binSize):
    return totalEntropy(inList,key2,binSize) - avgEntropy(inList,key1,valueRange,key2,binSize)

In [21]:
# calculate the pairs '[temperature, count]', for all entries in month '01', from 1955 through 2015
# 1 day had average temperature 2degreesF, 4 days had average temperature 3degreesF, etc.
selectWeightedElements(weatherData,'month','01','temp',1)

[[2, 1],
 [3, 4],
 [4, 2],
 [5, 2],
 [6, 3],
 [7, 2],
 [8, 3],
 [9, 3],
 [10, 1],
 [11, 3],
 [12, 11],
 [13, 8],
 [14, 7],
 [15, 9],
 [16, 9],
 [17, 6],
 [18, 22],
 [19, 17],
 [20, 24],
 [21, 23],
 [22, 30],
 [23, 26],
 [24, 40],
 [25, 48],
 [26, 52],
 [27, 60],
 [28, 83],
 [29, 79],
 [30, 88],
 [31, 71],
 [32, 104],
 [33, 92],
 [34, 141],
 [35, 85],
 [36, 118],
 [37, 70],
 [38, 72],
 [39, 47],
 [40, 70],
 [41, 34],
 [42, 47],
 [43, 31],
 [44, 28],
 [45, 18],
 [46, 20],
 [47, 16],
 [48, 12],
 [49, 6],
 [50, 6],
 [51, 5],
 [52, 3],
 [55, 1],
 [56, 1],
 [-4, 1],
 [-3, 1]]

In [22]:
# the entropy in this list is slightly less than 5 bits
entropy(selectWeightedElements(weatherData,'month','01','temp',1))

4.973962828825725

In [15]:
# the average information gained about the average daily temperature, with a resolution (bin) of 1 degree, 
# by knowing the day of month, is less than 0.1 bit
avgInfo(weatherData,'day',range(1, 32),'temp',1)

0.07426720551416821

In [16]:
# the average information gained about the average daily temperature, with a resolution (bin) of 1 degree, 
# by knowing the month, is 1.0 bit
avgInfo(weatherData,'month',range(1, 13),'temp',1)

1.0090716183731745

In [17]:
# the average information gained about the average daily temperature, with a resolution (bin) of 1 degree, 
# by knowing the year, is 0.2 bit
avgInfo(weatherData,'year',range(1955, 2016),'temp',1)

0.20249583423455242

In [23]:
weatherData[0]

{'day': '01',
 'dewp': '19.6',
 'dewpNum': '8',
 'frshtt': '001000',
 'gust': '999.9',
 'max': '32.0',
 'min': '27.0*',
 'month': '01',
 'mxspd': '15.9',
 'prcp': '99.99',
 'slp': '1025.1',
 'slpNum': '8',
 'sndp': '999.9',
 'station': '066700',
 'stp': '9999.9',
 'stpNum': '0',
 'temp': '28.4',
 'tempNum': '8',
 'visib': '7.6',
 'visibNum': '8',
 'wban': '99999',
 'wdsp': '11.6',
 'wdspNum': '8',
 'year': '1955'}

In [34]:
janDays = 0
janMonths = 0
totDays = 0
oldYear = '1955'
for weatherDict in weatherData:
    if weatherDict['month'] == '01':
        newYear = weatherDict['year']
        if newYear == oldYear:
            janDays = janDays + 1
            totDays = totDays + 1
        else:
            print(janDays)
            janDays = 0
            janMonths = janMonths + 1
        oldYear = newYear
print(janMonths)
print(totDays)

31
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
29
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
56
1710


In [33]:
56*30

1680