Loading baseline poll data using the interpolator.

j3camero · Sep 30, 2015 · 188a87c · 188a87c
1 parent bc6bff7
commit 188a87c
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 52 deletions.
diff --git a/regional_baseline.csv b/regional_baseline.csv
@@ -1,6 +1,35 @@
-,ATL,QC,ON,SK_MB,AB,BC,Canada
-cpc,37.46139295,16.5,44.4,54.7875,66.8,45.5,39.6
-lpc,29.58301806,14.2,25.3,12.92142857,9.3,13.4,18.9
-ndp,29.75,42.9,25.6,28.78883929,16.8,32.5,30.6
-gpc,2.960361135,2.1,3.8,3.186160714,5.3,7.7,3.9
-bq,0,23.4,0,0,0,0,0
+2011-05-02,Canada,ndp,30.6
+2011-05-02,Canada,cpc,39.6
+2011-05-02,Canada,lpc,18.9
+2011-05-02,Canada,bq,0.0
+2011-05-02,Canada,gpc,3.9
+2011-05-02,ON,ndp,25.6
+2011-05-02,ON,cpc,44.4
+2011-05-02,ON,lpc,25.3
+2011-05-02,ON,bq,0.0
+2011-05-02,ON,gpc,3.8
+2011-05-02,AB,ndp,16.8
+2011-05-02,AB,cpc,66.8
+2011-05-02,AB,lpc,9.3
+2011-05-02,AB,bq,0.0
+2011-05-02,AB,gpc,5.3
+2011-05-02,SK_MB,ndp,28.78883929
+2011-05-02,SK_MB,cpc,54.7875
+2011-05-02,SK_MB,lpc,12.92142857
+2011-05-02,SK_MB,bq,0.0
+2011-05-02,SK_MB,gpc,3.186160714
+2011-05-02,BC,ndp,32.5
+2011-05-02,BC,cpc,45.5
+2011-05-02,BC,lpc,13.4
+2011-05-02,BC,bq,0.0
+2011-05-02,BC,gpc,7.7
+2011-05-02,ATL,ndp,29.75
+2011-05-02,ATL,cpc,37.46139295
+2011-05-02,ATL,lpc,29.58301806
+2011-05-02,ATL,bq,0.0
+2011-05-02,ATL,gpc,2.960361135
+2011-05-02,QC,ndp,42.9
+2011-05-02,QC,cpc,16.5
+2011-05-02,QC,lpc,14.2
+2011-05-02,QC,bq,23.4
+2011-05-02,QC,gpc,2.1
diff --git a/regional_poll_interpolator.py b/regional_poll_interpolator.py
@@ -2,10 +2,13 @@
 import datetime
 
 class RegionalPollInterpolator(object):
+    """Routines for interpolating a series of poll data."""
+
     def __init__(self):
         self.series_by_region_then_party = dict()
 
     def LoadFromCsv(self, csv_filename):
+        """Adds data from a csv file. Can be called multiple times."""
         with open(csv_filename) as csv_file:
             csv_reader = csv.reader(csv_file)
             for row in csv_reader:
@@ -24,6 +27,7 @@ def LoadFromCsv(self, csv_filename):
                 series.sort()
 
     def Interpolate(self, region, party, date):
+        """Interpolate the loaded poll data."""
         try:
             series = self.series_by_region_then_party[region][party]
         except:
@@ -41,7 +45,34 @@ def Interpolate(self, region, party, date):
         return 0
 
     def GetMostRecent(self, region, party):
+        """Returns the most recent data point for a region and party.
+
+        Reads element [1] of the last entry in the series stored for the
+        given region and party. Presumably entries are (date, value) pairs
+        kept in date order by LoadFromCsv -- confirm against the loader.
+
+        Returns 0 when the lookup raises for any reason, for example an
+        unknown region or party, or an empty series.
+        """
         try:
             return self.series_by_region_then_party[region][party][-1][1]
         except:
             return 0
+
+    def UniformSwingProjection(self, region, begin_date, begin_vector):
+        """Projects a vector of popular votes forward by uniform swing.
+
+        For every party in begin_vector, the difference between the most
+        recent poll value and the poll value interpolated at begin_date is
+        added to that party's starting value. The result is returned as a
+        new dict keyed by party; it is not normalized.
+        """
+        def shifted(name):
+            # Poll level at the starting date, then the latest poll level.
+            then = self.Interpolate(region, name, begin_date)
+            latest = self.GetMostRecent(region, name)
+            return begin_vector[name] + latest - then
+
+        return dict((name, shifted(name)) for name in begin_vector)
+
+    def ProportionalSwingProjection(self, region, begin_date, begin_vector):
+        """Projects a vector of popular votes forward by proportional swing.
+
+        Each party's starting value is multiplied by the ratio of its most
+        recent poll value to its poll value interpolated at begin_date. A
+        party whose interpolated starting poll is not positive keeps its
+        starting value unchanged (gain of 1).
+
+        Returns a new dict keyed by party. The values are rescaled to sum
+        to 1, except when they sum to zero (for example an all-zero
+        begin_vector), in which case they are returned unscaled rather than
+        dividing by zero.
+        """
+        projection = {}
+        for party in begin_vector:
+            old_poll = self.Interpolate(region, party, begin_date)
+            new_poll = self.GetMostRecent(region, party)
+            # Without a positive starting poll there is no ratio to apply.
+            gain = new_poll / old_poll if old_poll > 0 else 1
+            projection[party] = begin_vector[party] * gain
+        divisor = sum(projection.values())
+        if not divisor:
+            # Nothing to rescale; avoids a ZeroDivisionError below.
+            return projection
+        # Normalize so the projections sum to 1. Dividing by a float keeps
+        # integer inputs from being floor-divided where "/" truncates.
+        divisor = float(divisor)
+        return dict((party, value / divisor)
+                    for party, value in projection.items())
diff --git a/riding_forecast.py b/riding_forecast.py
@@ -1,6 +1,5 @@
 import csv
-import sys
-import unicodedata
+import datetime
 
 from scipy.stats import norm
 
@@ -83,44 +82,6 @@ def WhichProvince(s):
             return abbr
     return None
 
-def LoadMatrix(filename):
-    """Loads a table of numbers from a CSV file.
-
-    The table of numbers should have labeled columns and rows. The first row
-    of the CSV file will contain column labels. The first cell in each row
-    thereafter will be a label for that row. The first column of the first
-    row must be blank. All other cells in the CSV file should contain numbers.
-
-    The returned table is indexed first by column label then by row label.
-
-    Example file format:
-    ,ColumnOne,ColumnTwo
-    RowOne,1,2
-    RowTwo,3,4
-
-    Example usage:
-    m = LoadMatrix('example.csv')
-    print m['ColumnTwo']['RowOne']
-    # Prints 2
-    """
-    matrix = {}
-    with open(filename) as csv_file:
-        reader = csv.DictReader(csv_file)
-        for row in reader:
-            row_label = row['']
-            for column_label, value in row.items():
-                if not column_label:
-                    continue
-                if column_label not in matrix:
-                    matrix[column_label] = {}
-                try:
-                    value = float(value)
-                except:
-                    # Blank values default to zero.
-                    value = 0
-                matrix[column_label][row_label] = value
-    return matrix
-
 def NormalizeDictVector(d):
     """Adjusts numerical values so they add up to 1."""
     normalized = {}
@@ -146,14 +107,11 @@ def KeyWithHighestValue(d, forbidden_keys=[]):
             mv = v
     return mk
 
-def RemoveAccentsFromText(s):
-    return ''.join((c for c in unicodedata.normalize('NFD', s)
-                    if unicodedata.category(c) != 'Mn'))
-
 # Load regional polling data.
-regional_support_before = LoadMatrix('regional_baseline.csv')
 interpolator = RegionalPollInterpolator()
 interpolator.LoadFromCsv('regional_poll_averages.csv')
+# The baseline rows are added to the same interpolator as the poll averages.
+interpolator.LoadFromCsv('regional_baseline.csv')
+# Date carried by every row of regional_baseline.csv (2011-05-02).
+baseline_date = datetime.datetime(2011, 5, 2)
 
 # Load and process per-riding election results from 2011.
 old_ridings = {}
@@ -169,7 +127,7 @@ def RemoveAccentsFromText(s):
         province = WhichProvince(row['Province'])
         region = WhichRegion(row['Province'])
         assert region
-        before = regional_support_before[region][party]
+        before = interpolator.Interpolate(region, party, baseline_date)
         after = interpolator.GetMostRecent(region, party)
         projected_gain = after / before
         projection = popular_vote * projected_gain