In [1]:
import sys
import os
import io
import re
import db
import wrangling as wr
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
# from __future__ import statistcs as stats

# Input CSV files, opened through the project-local `db` helper.
databaseName = "database.csv"
fundedProjectsFile = "FundedProjects.csv"
dbreaderHandle = db.createReader(databaseName)
fundedProjectsHandle = db.createReader(fundedProjectsFile)
# Element 0 of each handle is the object the later cells iterate row by row.
# NOTE(review): the other elements of the handle are never used in the cells
# shown here -- presumably one of them is the open file; confirm and close it.
dbreader = dbreaderHandle[0]
fprojectsreader = fundedProjectsHandle[0]

# Accumulators filled while the CSV rows are read.
# locationLatAndLong is not touched by any cell visible in this export.
locationLatAndLong = []

# county name -> number of proposed projects naming that county
countiesDict = {}
# sponsoring agency name -> number of proposed projects it sponsors
sponsorsOfProposedProjects = {}
# county name -> list of 8 counters, one per entry of areasOfInterest
areasOfInterestByCounty = {}

# Parallel lists, one entry per row of the funded-projects file
# (columns 6, 7 and 0 respectively).
projFundingAmounts =  []
projTotalCosts = []
fundedProjectsTitles = []

# Vectors used during classification: one entry per proposed project,
# copied from columns 0, 1, 6, 7, 8, 9, 10 and 11 of the database rows.
proposedProjectsTitles = []
proposedProjectsAbstracts = []
proposedProjectsLocationDescriptions = []
proposedProjectsProjectTypeDescription = []
proposedProjectsDetailedDescription = []
proposedProjectsProjectNeed = []
proposedProjectsCriticalImpacts = []
proposedProjectsBenefits = []

# One string per proposed project: the eight text columns above joined with
# single spaces. This is the text that gets clustered.
essence = []

# Bitvectors: one 0/1 flag per proposed project, taken from columns 13-20.
drinkingWater = []
waterQualityImprovement = []
waterReuseAndRecycling = []
stormwaterImprovements = []
groundwaterBenefits = []
infiltration = []
habitatProtection = []
floodProtection = []

# Constants
# Display names for the eight flag columns, in the same order as the
# bitvectors above and as the counters stored in areasOfInterestByCounty.
areasOfInterest = [
    'Drinking Water Supply',
    'Water Quality Improvement',
    'Water Reuse/Recycling',
    'Stormwater Improvements',
    'Groundwater Benefits',
    'Infiltration',
    'Habitat Protection and Restoration',
    'Flood Protection'
]

In [2]:
def fillSponsorsDict(row, counts=None):
    """Tally the sponsoring agencies listed in column 12 of ``row``.

    The cell is treated as a list of agency names separated by "/" or by
    newlines. Each name is stripped of surrounding whitespace; empty names
    are ignored; any name containing "zone 7" is counted under the single
    key "zone 7 water agency".

    Parameters:
        row: indexable record; ``row[12]`` must be a string.
        counts: dict mapping agency name -> count, updated in place.
            Defaults to the notebook-level ``sponsorsOfProposedProjects``.

    Returns:
        None.
    """
    if counts is None:
        counts = sponsorsOfProposedProjects

    sponsorsStr = row[12].strip().replace("\n", "/")
    for eachAgency in sponsorsStr.split("/"):
        eachAgency = eachAgency.strip()
        # Test the individual token (not the whole cell) so that blanks
        # produced by "a//b" or a trailing "/" are really skipped.
        if not eachAgency:
            continue
        # Normalise the token itself; rewriting the whole cell string here
        # would have no effect on the key that gets counted.
        if "zone 7" in eachAgency:
            eachAgency = "zone 7 water agency"
        counts[eachAgency] = counts.get(eachAgency, 0) + 1

In [3]:
def appendBitvectors(row):
    """Append one 0/1 flag from ``row`` to each area-of-interest bitvector.

    Columns 13 through 20 of ``row`` feed, in order, drinkingWater,
    waterQualityImprovement, waterReuseAndRecycling, stormwaterImprovements,
    groundwaterBenefits, infiltration, habitatProtection and floodProtection.
    """
    columnTargets = (
        (13, drinkingWater),
        (14, waterQualityImprovement),
        (15, waterReuseAndRecycling),
        (16, stormwaterImprovements),
        (17, groundwaterBenefits),
        (18, infiltration),
        (19, habitatProtection),
        (20, floodProtection),
    )
    for column, bitvector in columnTargets:
        appendToV(row, bitvector, column)
    
def appendToV(row, vector, index):
    """Append 1 to ``vector`` if the character "1" occurs in ``row[index]``,
    otherwise append 0."""
    flag = 1 if "1" in row[index] else 0
    vector.append(flag)

In [4]:
def incrementAreasOfInterestCounts(ls, row):
    """Add this row's area-of-interest flags to the running counters in ``ls``.

    ``ls`` holds eight counters; counter ``k`` is incremented when the
    character "1" occurs in ``row[13 + k]``. ``ls`` is modified in place.
    """
    firstColumn = 13
    columnCount = 8  # columns 13..20 inclusive
    for offset in range(columnCount):
        if "1" not in row[firstColumn + offset]:
            continue
        ls[offset] += 1

def initialize(row):
    """Build the initial list of eight area-of-interest counters for a county.

    Entry ``k`` is 1 when the character "1" occurs in ``row[13 + k]`` and 0
    otherwise, for columns 13 through 20 inclusive.
    """
    return [1 if "1" in row[column] else 0 for column in range(13, 21)]


def appendCounties(row):
    """Record which counties a proposed project names (column 2 of ``row``).

    The cell is cleaned of "and", "?", "." and "Counties", split on ";" and
    each piece is normalised to the form "<name> county". For every
    resulting county the function

    * increments its entry in ``countiesDict``, and
    * adds the row's area-of-interest flags (columns 13-20) to its counters
      in ``areasOfInterestByCounty``, creating them on first sight.

    Pieces that are empty, or that start with "all" or "9", are ignored.
    A piece that already contains "county" is counted as-is rather than
    being dropped.
    """
    countyStr = row[2]
    # Same removals, in the same order, as a chained str.replace().
    for fragment in ("and ", "and", "?", ".", "Counties"):
        countyStr = countyStr.replace(fragment, "")

    normalized = []
    for name in countyStr.split(";"):
        name = name.strip()
        if "sf" in name:
            name = "san francisco"
        # Two counties that ended up in one piece once "and" was removed.
        if "mateo santa" in name:
            normalized.extend(["san mateo county", "santa clara county"])
            continue
        if "mateo san fr" in name:
            normalized.extend(["san mateo county", "san francisco county"])
            continue
        if not name or name.startswith("all") or name.startswith("9"):
            continue
        if "county" not in name:
            name += " county"
        normalized.append(name)

    for eachName in normalized:
        countiesDict[eachName] = countiesDict.get(eachName, 0) + 1

        if eachName in areasOfInterestByCounty:
            incrementAreasOfInterestCounts(areasOfInterestByCounty[eachName], row)
        else:
            areasOfInterestByCounty[eachName] = initialize(row)

In [5]:
def append(to, from_, at):
    """Append the element ``from_[at]`` to the list ``to``."""
    picked = from_[at]
    to.append(picked)

def appendToClassificationVectors(row):
    """Copy the text columns of ``row`` into the classification vectors.

    Columns 0, 1, 6, 7, 8, 9, 10 and 11 are appended to their respective
    ``proposedProjects*`` lists, and the same eight columns joined with single
    spaces are appended to ``essence``.
    """
    columnTargets = (
        (0, proposedProjectsTitles),
        (1, proposedProjectsAbstracts),
        (6, proposedProjectsLocationDescriptions),
        (7, proposedProjectsProjectTypeDescription),
        (8, proposedProjectsDetailedDescription),
        (9, proposedProjectsProjectNeed),
        (10, proposedProjectsCriticalImpacts),
        (11, proposedProjectsBenefits),
    )
    for column, target in columnTargets:
        append(target, row, column)

    textColumns = [column for column, target in columnTargets]
    essence.append(" ".join(row[column] for column in textColumns))

In [6]:
def execute():
    """Read every data row of ``dbreader`` and feed it to each accumulator.

    The first row is consumed and discarded as the header. Every following
    row is passed, in this order, to appendCounties, appendBitvectors,
    fillSponsorsDict and appendToClassificationVectors.
    """
    rowHandlers = (
        appendCounties,
        appendBitvectors,
        fillSponsorsDict,
        appendToClassificationVectors,
    )
    next(dbreader)  # Skip header
    for record in dbreader:
        for handle in rowHandlers:
            handle(record)

execute()

In [None]:
# countiesDict (county name -> project count) is filled by execute().
# presumably iplotPlotPie renders it as a pie chart with the given title --
# the helper lives in the project-local `wrangling` module; confirm there.
wr.iplotPlotPie(countiesDict, "Counties of Proposed Projects")

In [209]:
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
def createTrace(yvals, i):
    """Build the plotly bar-trace dict for area of interest number ``i``.

    The x values are the county names (keys of ``areasOfInterestByCounty``),
    the y values are ``yvals`` and the trace is named ``areasOfInterest[i]``.
    """
    return dict(
        x=areasOfInterestByCounty.keys(),
        y=yvals,
        name=areasOfInterest[i],
        type='bar'
    )
    
def gatherValuesAt(index):
    """Return, for every county, its counter at position ``index``.

    The result follows the iteration order of ``areasOfInterestByCounty``,
    which is the same order its ``keys()`` are produced in, so it lines up
    with the x values used by createTrace as long as the dict is not
    modified in between.
    """
    # Only the values are needed; .values() works on Python 2 and 3 alike,
    # whereas .iteritems() exists on Python 2 only.
    return [counters[index] for counters in areasOfInterestByCounty.values()]

# Build one bar trace per area of interest; each trace carries that area's
# counter for every county.
cdata = []
for i in range(0, len(areasOfInterest)):
    yvals = gatherValuesAt(i)
    cdata.append(createTrace(yvals, i))

layout = {
  'xaxis': {'title': 'Bay Area Counties'},
  'yaxis': {'title': 'Areas of interest'},
  'barmode': 'relative',
  'title': 'Areas Of Interest By County.',
  # NOTE(review): 'orientation' is normally an attribute of a bar trace, not
  # of the layout -- confirm whether it has any effect here.
 'orientation' : 'h'
};
# Sends the figure to the plotly service (network I/O) via plotly.plotly.
py.plot({'data': cdata, 'layout': layout})

u'https://plot.ly/~jlikhuva/96'

In [126]:
def produceBubbleChart(data, maxN):
    """Plot ``data`` (label -> size) as a bubble chart.

    One marker is drawn per key of ``data``. Its coordinates come from
    ``wr.getRandomIntList(0, maxN, n)`` (one call for x, one for y), its
    hover text is the key and its marker size is the corresponding value.
    The figure is handed to ``py.plot``.
    """
    pointCount = len(data)
    xCoords = wr.getRandomIntList(0, maxN, pointCount)
    yCoords = wr.getRandomIntList(0, maxN, pointCount)
    bubbles = go.Scatter(
        x=xCoords,
        y=yCoords,
        text=data.keys(),
        mode="markers",
        marker=dict(size=data.values())
    )
    py.plot([bubbles])

In [98]:
def produceAreasOfInterestCharts():
    """Plot how many proposed projects flag each area of interest.

    Bar labels are entries 13 through 20 of ``db.kHeadingNames``; bar heights
    are the sums of the corresponding bitvectors. The figure is handed to
    ``py.plot``.
    """
    titles = db.kHeadingNames
    bitvectors = (
        drinkingWater,
        waterQualityImprovement,
        waterReuseAndRecycling,
        stormwaterImprovements,
        groundwaterBenefits,
        infiltration,
        habitatProtection,
        floodProtection,
    )
    bars = go.Bar(
        x=[titles[column] for column in range(13, 21)],
        y=[sum(vector) for vector in bitvectors],
        marker=dict(color="rgb(193, 7, 184)")
    )
    py.plot([bars])

produceAreasOfInterestCharts()

In [127]:
def produceHistogram(data):
    """Plot ``data`` (label -> count) as a bar chart, one bar per key, and
    hand the figure to ``py.plot``."""
    barStyle = dict(color="rgb(30, 188, 75)")
    bars = go.Bar(x=data.keys(), y=data.values(), marker=barStyle)
    py.plot([bars])

In [107]:
# Chart the number of proposed projects per sponsoring agency. The two
# commented-out calls are alternative renderings of the same dict.
# wr.iplotPlotPie(sponsorsOfProposedProjects, "Sponsors of proposed projects.")
# produceBubbleChart(sponsorsOfProposedProjects, 500)
produceHistogram(sponsorsOfProposedProjects)

In [5]:
def readInFundingAmounts(row):
    """Append one funded-project row to the three parallel funding lists.

    Column 0 goes to ``fundedProjectsTitles``, column 6 to
    ``projFundingAmounts`` and column 7 to ``projTotalCosts``. Values are
    stored exactly as they appear in ``row``.
    """
    columnTargets = (
        (0, fundedProjectsTitles),
        (6, projFundingAmounts),
        (7, projTotalCosts),
    )
    for column, target in columnTargets:
        target.append(row[column])
    
# Load every row of the funded-projects file into the funding lists.
# NOTE(review): unlike execute(), no header row is skipped here -- confirm
# that FundedProjects.csv has no header line.
# NOTE(review): the amounts are stored as read; if the reader yields strings
# they must be converted to numbers before mean()/getStdev() can be used.
for row in fprojectsreader:
    readInFundingAmounts(row)

In [226]:
'''
Code in this snippet courtesy of Stack Overflow user
http://stackoverflow.com/users/3923281/ajcr
'''
def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    Raises:
        ValueError: if ``data`` is empty.
    """
    n = len(data)
    if n < 1:
        raise ValueError('mean requires at least one data point')
    # float(n) forces true division: on Python 2 (which this notebook runs
    # on) int / int floors, so mean([1, 2]) would otherwise be 1.
    return sum(data) / float(n)

def _ss(data):
    """Return the sum of squared deviations of ``data`` from its mean."""
    c = mean(data)
    return sum((x - c) ** 2 for x in data)

def getStdev(data):
    """Return the population standard deviation of ``data``.

    Raises:
        ValueError: if ``data`` has fewer than two elements.
    """
    n = len(data)
    if n < 2:
        raise ValueError('variance requires at least two data points')
    # Population variance: divide by n (not n - 1); float(n) avoids
    # floor division on Python 2.
    pvar = _ss(data) / float(n)
    return pvar ** 0.5

In [230]:
def getTrace(x, y, title, color):
    """Return a markers-only ``go.Scatter`` trace of ``y`` against ``x``,
    named ``title`` and drawn in ``color``."""
    markerStyle = dict(color=color)
    return go.Scatter(x=x, y=y, mode="markers", name=title, marker=markerStyle)

def drawFundingCharts():
    """Plot two scatter charts over the funded projects' titles: the funding
    amounts first, then the total project costs. Each chart is handed to
    ``py.plot`` separately."""
    fundingTrace = getTrace(fundedProjectsTitles, projFundingAmounts,
                            "Amount of DWR Funding.", "rgb(193, 7, 184)")
    costTrace = getTrace(fundedProjectsTitles, projTotalCosts,
                         "Total Project Costs.", "rgb(30, 188, 75)")
    for trace in (fundingTrace, costTrace):
        py.plot([trace])

drawFundingCharts()
# print "Mean Funding is: " + str(mean(projFundingAmounts))
# print "Standard Deviation of Funding: " + str(getStdev(projFundingAmounts))


In [None]:
'''
The following snippets deal with the
classification of the data that we have.

They are based on the amazing tutorial found at
https://github.com/brandomr/document_cluster/blob/master/cluster_analysis.ipynb
'''

In [7]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
import mpld3
from __future__ import print_function
from sklearn import feature_extraction
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib

# English stop-word list from NLTK. It is not referenced by any cell visible
# in this export (the vectorizer is configured with stop_words='english').
stopwords = nltk.corpus.stopwords.words("english")
# Shared stemmer used by tokenizeAndStem().
stemmer = SnowballStemmer("english")

In [8]:
'''
Code courtesy of Brandon Rose
https://github.com/brandomr
'''
def tokenizeAndStem(text):
    """Split ``text`` into word tokens and return their stems.

    The text is split into sentences and then words with NLTK. Tokens that
    contain no ASCII letter (pure numbers, punctuation) are dropped, and the
    rest are passed through the shared ``stemmer``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems

def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    The text is split into sentences and then words with NLTK; every token is
    lower-cased and tokens that contain no ASCII letter are dropped.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

def printTopTermsPerCluster(N, referenceDF, modelDF, model, terms, wordsPerCluster=6):
    """Print the heaviest terms and the member titles of each cluster.

    Parameters:
        N: number of clusters to report (clusters 0 .. N-1).
        referenceDF: DataFrame indexed by stem whose first column holds a
            surface word for that stem; used to turn a stem back into a word.
        modelDF: DataFrame indexed by cluster label with a 'title' column.
        model: fitted clustering model exposing ``cluster_centers_``.
        terms: feature names, positioned like the columns of
            ``model.cluster_centers_``.
        wordsPerCluster: how many top terms to print per cluster (default 6).

    For a multi-word term only the word for its first stem is printed.
    """
    print("Top terms per cluster:")
    print()
    # For each cluster, term indices ordered from the largest to the smallest
    # centroid weight (argsort ascending, then reverse each row).
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]

    for i in range(N):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :wordsPerCluster]:
            stems = terms[ind].split(' ')
            # .loc is the label-based replacement for the removed .ix indexer.
            word = referenceDF.loc[stems].values.tolist()[0][0]
            # Python 2: encode unicode to UTF-8 bytes so print cannot raise
            # UnicodeEncodeError. A native str is printed unchanged (which
            # also avoids the b'...' repr on Python 3).
            if not isinstance(word, str):
                word = word.encode('utf-8', 'ignore')
            print(' %s' % word, end=',')
        print() #add whitespace
        print() #add whitespace

        print("Cluster %d titles:" % i, end='')
        # A boolean mask always yields a Series, even when the cluster has
        # exactly one member (a plain label lookup would return a scalar).
        clusterTitles = modelDF.loc[modelDF.index == i, 'title']
        for title in clusterTitles.values.tolist():
            print(' %s,' % title, end='')
        print() #add whitespace
        print() #add whitespace
    print()
    print()

In [9]:
def get_TfidfMatrix_Terms_Dist(textList):
    """Vectorise ``textList`` with TF-IDF.

    Returns a 3-tuple:
        * the TF-IDF matrix produced by ``fit_transform``,
        * the vectorizer's feature names,
        * ``1 - cosine_similarity(matrix)``, a pairwise document distance.

    The vectorizer tokenises with ``tokenizeAndStem``, uses 1- to 3-grams,
    and is configured with max_df=0.8 and min_df=0.2.
    """
    vectorizerOptions = dict(
        max_df=0.8,
        max_features=200000,
        min_df=0.2,
        stop_words='english',
        use_idf=True,
        tokenizer=tokenizeAndStem,
        ngram_range=(1,3)
    )
    tfidf = TfidfVectorizer(**vectorizerOptions)
    weights = tfidf.fit_transform(textList)
    featureNames = tfidf.get_feature_names()
    pairwiseDistance = 1 - cosine_similarity(weights)
    return weights, featureNames, pairwiseDistance

In [10]:
def runKmeans(matrix, N, random_state=None):
    """Fit a KMeans model with ``N`` clusters on ``matrix`` and return it.

    Parameters:
        matrix: feature matrix passed to ``KMeans.fit``.
        N: number of clusters.
        random_state: optional seed forwarded to ``KMeans``. KMeans starts
            from random centroids, so pass a fixed integer to get the same
            clusters on every run; the default ``None`` keeps the previous
            unseeded behaviour.
    """
    kmeans = KMeans(N, random_state=random_state)
    kmeans.fit(matrix)
    return kmeans
    
def openModel(filename):
    """Load a model previously stored with joblib.

    Returns the loaded object, or ``None`` (after printing a message) when it
    cannot be loaded.

    NOTE: joblib files are pickle-based; only load files you trust.
    """
    try:
        return joblib.load(filename)
    # Exception, not a bare except: Ctrl-C (KeyboardInterrupt) and SystemExit
    # must still propagate.
    except Exception:
        print ("could not open " + str(filename))
        return None
        
def saveModel(model, name):
    """Store ``model`` with joblib under the file name ``name``.

    Failures are reported with a printed message instead of being raised.
    """
    try:
        joblib.dump(model, name)
    # Exception, not a bare except: Ctrl-C (KeyboardInterrupt) and SystemExit
    # must still propagate.
    except Exception:
        # str() keeps the handler itself from failing on a non-string name.
        print ("Error saving model under " + str(name))
        

In [11]:
def createDataFrameFromDict(dict_, indexList):
    """Build a DataFrame whose columns are the keys of ``dict_`` (each value
    being that column's data) and whose row index is ``indexList``."""
    # may have bug here.[indexList]
    columnNames = list(dict_.keys())
    return pd.DataFrame(dict_, index=indexList, columns=columnNames)


In [12]:
'''
name is the name you want this model
to be saved as

byList is expected to be a list of
text.

structure is a dict describing the structure
of the data
'''
def classify(byList, name, structure, N):
    """Cluster the texts in ``byList`` with KMeans and print a report.

    Parameters:
        byList: list of text strings, one per document.
        name: name the model is meant to be saved under. Currently unused
            because saving/loading is not wired in yet (see the TODO below).
        structure: dict of column name -> list with one entry per document;
            must contain a "title" column. A "clusters" entry holding each
            document's cluster label is added to this dict in place.
        N: number of clusters.

    Returns:
        The DataFrame built from ``structure`` (including "clusters"),
        indexed by cluster label.
    """
    totalvocabStemmed = []
    totalvocabTokenized = []
    for each in byList:
        totalvocabStemmed.extend(tokenizeAndStem(each))
        totalvocabTokenized.extend(tokenize(each))

    # Lookup table stem -> surface word, used to print readable terms.
    vocabsDataFrame = createDataFrameFromDict({"words": totalvocabTokenized},
                                              totalvocabStemmed)

    # The pairwise distance matrix is not needed for the report.
    tfidfmatrix, terms, _dist = get_TfidfMatrix_Terms_Dist(byList)

    # TODO: check whether a model called `name` already exists; if it does,
    # load it with openModel(name) instead of re-running KMeans, otherwise
    # store the fresh model with saveModel(kmeansModel, name).
    kmeansModel = runKmeans(tfidfmatrix, N)
    clusterLabels = kmeansModel.labels_.tolist()
    structure["clusters"] = clusterLabels

    byListDataFrame = createDataFrameFromDict(structure, clusterLabels)
    printTopTermsPerCluster(N, vocabsDataFrame, byListDataFrame, kmeansModel, terms)
    return byListDataFrame
    
def runClassification():
    """Cluster the proposed projects into 8 groups using their ``essence``
    text, reporting each cluster's top terms and project titles."""
    # presumably 8 mirrors the eight areas of interest -- TODO confirm
    clusterCount = 8
    projectColumns = {
        "title" : proposedProjectsTitles,
        "abstract" : proposedProjectsAbstracts
    }
    classify(essence, "classification by title", projectColumns, clusterCount)
    
    

In [14]:
# Run the clustering and print the per-cluster report. Requires execute() to
# have filled `essence` and the proposedProjects* lists first.
runClassification()