In [1]:
import tabula
import pandas as pd
import re
import math
from gdpPDFCleaning import *

# Show every row and every column when a DataFrame is displayed (no truncation),
# so the full parsed tables can be inspected by eye.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# GDP 95-00 Data

In [2]:
# Create the YearlyGDP instance for the first PDF.
# Arguments: source PDF, then 8, 1995 and 2000 -- presumably the PDF page holding the
# table and the first/last year to keep; TODO confirm against YearlyGDP in gdpPDFCleaning.
# NOTE(review): the PDF and the variable name run through 2001, but the end year passed
# here is 2000 -- confirm 2001 is intentionally taken from the 2001-2003 file instead.
GDP95_01 = YearlyGDP('GDP1995-2001.pdf', 8, 1995, 2000)

In [3]:
# Sanity check before any PDF is read: a freshly created instance is expected to
# hold an empty DataFrame in .df.
GDP95_01.df

In [4]:
# initializeDataFrame() reads in the PDF, assigns the first DataFrame in the list as
# the df (only one table per page), renames the columns, and returns the DataFrame
# for viewing.
# In the output below, 'Area' still carries dot leaders, row 0 holds only the
# "2000 2001" sub-header, and both percentage years share a single column.
GDP95_01.initializeDataFrame()

Unnamed: 0,Area,1995,1996,1997,1998,1999,2000,2001,Percentage of U.S. total
0,,,,,,,,,2000 2001
1,United States.......................,7309516.0,7715901.0,8224960.0,8750174.0,9251541.0,9891187.0,10137190.0,100 100
2,New England....................,416166.0,439596.0,471336.0,503940.0,533324.0,582874.0,594686.0,5.9 5.9
3,Connecticut......................,118645.0,124157.0,134968.0,142701.0,149010.0,161929.0,166165.0,1.6 1.6
4,Maine...............................,27987.0,28925.0,30409.0,32208.0,34102.0,36276.0,37449.0,0.4 0.4
5,Massachusetts..................,197469.0,210127.0,223571.0,241369.0,257802.0,283072.0,287802.0,2.9 2.8
6,New Hampshire...............,32388.0,35068.0,37470.0,40529.0,43360.0,47385.0,47183.0,0.5 0.5
7,Rhode Island....................,25703.0,26656.0,29409.0,30838.0,31895.0,36086.0,36939.0,0.4 0.4
8,Vermont...........................,13974.0,14662.0,15510.0,16294.0,17155.0,18124.0,19149.0,0.2 0.2
9,Mideast..............................,1403270.0,1471796.0,1547124.0,1649536.0,1720155.0,1837583.0,1900223.0,18.6 18.7


In [5]:
# Strip the dot leaders from the 'Area' labels. The cleaned names are added as a new
# 'Cleaned Area' column; the original 'Area' column is kept.
GDP95_01.cleanStates('Area')

Unnamed: 0,Area,1995,1996,1997,1998,1999,2000,2001,Percentage of U.S. total,Cleaned Area
0,,,,,,,,,2000 2001,
1,United States.......................,7309516.0,7715901.0,8224960.0,8750174.0,9251541.0,9891187.0,10137190.0,100 100,United States
2,New England....................,416166.0,439596.0,471336.0,503940.0,533324.0,582874.0,594686.0,5.9 5.9,New England
3,Connecticut......................,118645.0,124157.0,134968.0,142701.0,149010.0,161929.0,166165.0,1.6 1.6,Connecticut
4,Maine...............................,27987.0,28925.0,30409.0,32208.0,34102.0,36276.0,37449.0,0.4 0.4,Maine
5,Massachusetts..................,197469.0,210127.0,223571.0,241369.0,257802.0,283072.0,287802.0,2.9 2.8,Massachusetts
6,New Hampshire...............,32388.0,35068.0,37470.0,40529.0,43360.0,47385.0,47183.0,0.5 0.5,New Hampshire
7,Rhode Island....................,25703.0,26656.0,29409.0,30838.0,31895.0,36086.0,36939.0,0.4 0.4,Rhode Island
8,Vermont...........................,13974.0,14662.0,15510.0,16294.0,17155.0,18124.0,19149.0,0.2 0.2,Vermont
9,Mideast..............................,1403270.0,1471796.0,1547124.0,1649536.0,1720155.0,1837583.0,1900223.0,18.6 18.7,Mideast


In [6]:
# Add a 'Geo Loc' column giving the region each state belongs to
# (e.g. Connecticut -> New England). Aggregate rows such as United States,
# New England and Mideast are left without a value.
GDP95_01.addGeoLocColumn('Cleaned Area')

Unnamed: 0,Area,1995,1996,1997,1998,1999,2000,2001,Percentage of U.S. total,Cleaned Area,Geo Loc
0,,,,,,,,,2000 2001,,
1,United States.......................,7309516.0,7715901.0,8224960.0,8750174.0,9251541.0,9891187.0,10137190.0,100 100,United States,
2,New England....................,416166.0,439596.0,471336.0,503940.0,533324.0,582874.0,594686.0,5.9 5.9,New England,
3,Connecticut......................,118645.0,124157.0,134968.0,142701.0,149010.0,161929.0,166165.0,1.6 1.6,Connecticut,New England
4,Maine...............................,27987.0,28925.0,30409.0,32208.0,34102.0,36276.0,37449.0,0.4 0.4,Maine,New England
5,Massachusetts..................,197469.0,210127.0,223571.0,241369.0,257802.0,283072.0,287802.0,2.9 2.8,Massachusetts,New England
6,New Hampshire...............,32388.0,35068.0,37470.0,40529.0,43360.0,47385.0,47183.0,0.5 0.5,New Hampshire,New England
7,Rhode Island....................,25703.0,26656.0,29409.0,30838.0,31895.0,36086.0,36939.0,0.4 0.4,Rhode Island,New England
8,Vermont...........................,13974.0,14662.0,15510.0,16294.0,17155.0,18124.0,19149.0,0.2 0.2,Vermont,New England
9,Mideast..............................,1403270.0,1471796.0,1547124.0,1649536.0,1720155.0,1837583.0,1900223.0,18.6 18.7,Mideast,


In [7]:
# Reshape the wide table into long format: one row per year and state with the
# columns year, state, GDP and GDP_area (the state's region).
# NOTE(review): only individual states appear in the rows shown -- presumably the
# national and regional aggregate rows are dropped here; confirm.
GDP95_01.restructureDataFrame('Cleaned Area')

Unnamed: 0,year,state,GDP,GDP_area
0,1995,Alaska,24791,Far West
1,1995,Alabama,95514,Southeast
2,1995,Arkansas,53809,Southeast
3,1995,Arizona,104586,Southwest
4,1995,California,925931,Far West
5,1995,Colorado,109021,Rocky Mountain
6,1995,Connecticut,118645,New England
7,1995,District of Columbia,48408,Mideast
8,1995,Delaware,27575,Mideast
9,1995,Florida,344771,Southeast


In [8]:
# Confirm that the restructured long-format table is what is now stored on the instance.
GDP95_01.df

Unnamed: 0,year,state,GDP,GDP_area
0,1995,Alaska,24791,Far West
1,1995,Alabama,95514,Southeast
2,1995,Arkansas,53809,Southeast
3,1995,Arizona,104586,Southwest
4,1995,California,925931,Far West
5,1995,Colorado,109021,Rocky Mountain
6,1995,Connecticut,118645,New England
7,1995,District of Columbia,48408,Mideast
8,1995,Delaware,27575,Mideast
9,1995,Florida,344771,Southeast


# GDP 01 - 03 Data 

In [9]:
# Create the YearlyGDP instance for the second PDF.
# Arguments: source PDF, then 5, 2001 and 2003 -- presumably the PDF page holding the
# table and the first/last year to keep; TODO confirm against YearlyGDP in gdpPDFCleaning.
GDP01_03 = YearlyGDP('GDP2001-2003.pdf', 5, 2001, 2003)

In [10]:
# Sanity check before any PDF is read: a freshly created instance is expected to
# hold an empty DataFrame in .df.
GDP01_03.df

In [11]:
# initializeDataFrame() reads in the PDF, assigns the first DataFrame in the list as
# the df (only one table per page), renames the columns, and returns the DataFrame
# for viewing.
# In this PDF two adjacent years come back merged into one column: '2002 2003'
# (GDP, as comma-formatted text) and '2002 2003.1' (percentages). They are split
# apart in the following cells.
GDP01_03.initializeDataFrame()

Unnamed: 0,Area,2001,2002 2003,2004,2001.0,2002 2003.1,Percentage of U.S. total
0,United States .........,10058156,"10,412,244 10,923,849",11649827,100.0,100.0 100.0,100.0
1,New England ............,584487,"596,017 620,136",664181,5.8,5.7 5.7,5.7
2,Connecticut .............,165434,"167,235 174,085",187086,1.6,1.6 1.6,1.6
3,Maine .....................,37094,"39,027 40,829",43279,0.4,0.4 0.4,0.4
4,Massachusetts .........,283422,"287,191 297,113",317684,2.8,2.8 2.7,2.7
5,New Hampshire .........,44394,"46,106 48,202",52097,0.4,0.4 0.4,0.4
6,Rhode Island ............,35489,"37,040 39,363",41921,0.4,0.4 0.4,0.4
7,Vermont ..................,18656,"19,419 20,544",22114,0.2,0.2 0.2,0.2
8,Mideast .....................,1868057,"1,922,516 2,010,011",2140662,18.6,18.5 18.4,18.4
9,Delaware ..................,45049,"46,991 50,486",54500,0.4,0.5 0.5,0.5


In [12]:
# Column position 2 (0-based, with 'Area' as 0) is the merged '2002 2003' GDP column.
# Split it into new '2002' and '2003' columns; the thousands separators are gone in
# the new columns and the merged column itself is left in place.
GDP01_03.splitColumns(2, '2002', '2003')

Unnamed: 0,Area,2001,2002 2003,2004,2001.0,2002 2003.1,Percentage of U.S. total,2002,2003
0,United States .........,10058156,"10,412,244 10,923,849",11649827,100.0,100.0 100.0,100.0,10412244,10923849
1,New England ............,584487,"596,017 620,136",664181,5.8,5.7 5.7,5.7,596017,620136
2,Connecticut .............,165434,"167,235 174,085",187086,1.6,1.6 1.6,1.6,167235,174085
3,Maine .....................,37094,"39,027 40,829",43279,0.4,0.4 0.4,0.4,39027,40829
4,Massachusetts .........,283422,"287,191 297,113",317684,2.8,2.8 2.7,2.7,287191,297113
5,New Hampshire .........,44394,"46,106 48,202",52097,0.4,0.4 0.4,0.4,46106,48202
6,Rhode Island ............,35489,"37,040 39,363",41921,0.4,0.4 0.4,0.4,37040,39363
7,Vermont ..................,18656,"19,419 20,544",22114,0.2,0.2 0.2,0.2,19419,20544
8,Mideast .....................,1868057,"1,922,516 2,010,011",2140662,18.6,18.5 18.4,18.4,1922516,2010011
9,Delaware ..................,45049,"46,991 50,486",54500,0.4,0.5 0.5,0.5,46991,50486


In [13]:
# Column position 5 (0-based, with 'Area' as 0) is the merged '2002 2003.1'
# percentage column. Split it into new '2002 %' and '2003 %' columns.
GDP01_03.splitColumns(5, '2002 %', '2003 %')

Unnamed: 0,Area,2001,2002 2003,2004,2001.0,2002 2003.1,Percentage of U.S. total,2002,2003,2002 %,2003 %
0,United States .........,10058156,"10,412,244 10,923,849",11649827,100.0,100.0 100.0,100.0,10412244,10923849,100.0,100.0
1,New England ............,584487,"596,017 620,136",664181,5.8,5.7 5.7,5.7,596017,620136,5.7,5.7
2,Connecticut .............,165434,"167,235 174,085",187086,1.6,1.6 1.6,1.6,167235,174085,1.6,1.6
3,Maine .....................,37094,"39,027 40,829",43279,0.4,0.4 0.4,0.4,39027,40829,0.4,0.4
4,Massachusetts .........,283422,"287,191 297,113",317684,2.8,2.8 2.7,2.7,287191,297113,2.8,2.7
5,New Hampshire .........,44394,"46,106 48,202",52097,0.4,0.4 0.4,0.4,46106,48202,0.4,0.4
6,Rhode Island ............,35489,"37,040 39,363",41921,0.4,0.4 0.4,0.4,37040,39363,0.4,0.4
7,Vermont ..................,18656,"19,419 20,544",22114,0.2,0.2 0.2,0.2,19419,20544,0.2,0.2
8,Mideast .....................,1868057,"1,922,516 2,010,011",2140662,18.6,18.5 18.4,18.4,1922516,2010011,18.5,18.4
9,Delaware ..................,45049,"46,991 50,486",54500,0.4,0.5 0.5,0.5,46991,50486,0.5,0.5


In [14]:
# Strip the dot leaders from the 'Area' labels. The cleaned names are added as a new
# 'Cleaned Area' column; the original 'Area' column is kept.
GDP01_03.cleanStates('Area')

Unnamed: 0,Area,2001,2002 2003,2004,2001.0,2002 2003.1,Percentage of U.S. total,2002,2003,2002 %,2003 %,Cleaned Area
0,United States .........,10058156,"10,412,244 10,923,849",11649827,100.0,100.0 100.0,100.0,10412244,10923849,100.0,100.0,United States
1,New England ............,584487,"596,017 620,136",664181,5.8,5.7 5.7,5.7,596017,620136,5.7,5.7,New England
2,Connecticut .............,165434,"167,235 174,085",187086,1.6,1.6 1.6,1.6,167235,174085,1.6,1.6,Connecticut
3,Maine .....................,37094,"39,027 40,829",43279,0.4,0.4 0.4,0.4,39027,40829,0.4,0.4,Maine
4,Massachusetts .........,283422,"287,191 297,113",317684,2.8,2.8 2.7,2.7,287191,297113,2.8,2.7,Massachusetts
5,New Hampshire .........,44394,"46,106 48,202",52097,0.4,0.4 0.4,0.4,46106,48202,0.4,0.4,New Hampshire
6,Rhode Island ............,35489,"37,040 39,363",41921,0.4,0.4 0.4,0.4,37040,39363,0.4,0.4,Rhode Island
7,Vermont ..................,18656,"19,419 20,544",22114,0.2,0.2 0.2,0.2,19419,20544,0.2,0.2,Vermont
8,Mideast .....................,1868057,"1,922,516 2,010,011",2140662,18.6,18.5 18.4,18.4,1922516,2010011,18.5,18.4,Mideast
9,Delaware ..................,45049,"46,991 50,486",54500,0.4,0.5 0.5,0.5,46991,50486,0.5,0.5,Delaware


In [15]:
# Add a 'Geo Loc' column giving the region each state belongs to
# (e.g. Delaware -> Mideast). Aggregate rows such as United States,
# New England and Mideast are left without a value.
GDP01_03.addGeoLocColumn('Cleaned Area')

Unnamed: 0,Area,2001,2002 2003,2004,2001.0,2002 2003.1,Percentage of U.S. total,2002,2003,2002 %,2003 %,Cleaned Area,Geo Loc
0,United States .........,10058156,"10,412,244 10,923,849",11649827,100.0,100.0 100.0,100.0,10412244,10923849,100.0,100.0,United States,
1,New England ............,584487,"596,017 620,136",664181,5.8,5.7 5.7,5.7,596017,620136,5.7,5.7,New England,
2,Connecticut .............,165434,"167,235 174,085",187086,1.6,1.6 1.6,1.6,167235,174085,1.6,1.6,Connecticut,New England
3,Maine .....................,37094,"39,027 40,829",43279,0.4,0.4 0.4,0.4,39027,40829,0.4,0.4,Maine,New England
4,Massachusetts .........,283422,"287,191 297,113",317684,2.8,2.8 2.7,2.7,287191,297113,2.8,2.7,Massachusetts,New England
5,New Hampshire .........,44394,"46,106 48,202",52097,0.4,0.4 0.4,0.4,46106,48202,0.4,0.4,New Hampshire,New England
6,Rhode Island ............,35489,"37,040 39,363",41921,0.4,0.4 0.4,0.4,37040,39363,0.4,0.4,Rhode Island,New England
7,Vermont ..................,18656,"19,419 20,544",22114,0.2,0.2 0.2,0.2,19419,20544,0.2,0.2,Vermont,New England
8,Mideast .....................,1868057,"1,922,516 2,010,011",2140662,18.6,18.5 18.4,18.4,1922516,2010011,18.5,18.4,Mideast,
9,Delaware ..................,45049,"46,991 50,486",54500,0.4,0.5 0.5,0.5,46991,50486,0.5,0.5,Delaware,Mideast


In [16]:
# Reshape the wide table into long format: one row per year and state with the
# columns year, state, GDP and GDP_area (the state's region).
# NOTE(review): only individual states appear in the rows shown -- presumably the
# national and regional aggregate rows are dropped here; confirm.
GDP01_03.restructureDataFrame('Cleaned Area')

Unnamed: 0,year,state,GDP,GDP_area
0,2001,Alaska,27358,Far West
1,2001,Alabama,118263,Southeast
2,2001,Arkansas,68574,Southeast
3,2001,Arizona,164263,Southwest
4,2001,California,1307880,Far West
5,2001,Colorado,177526,Rocky Mountain
6,2001,Connecticut,165434,New England
7,2001,District of Columbia,63223,Mideast
8,2001,Delaware,45049,Mideast
9,2001,Florida,496861,Southeast


In [17]:
# Confirm that the restructured long-format table is what is now stored on the instance.
GDP01_03.df

Unnamed: 0,year,state,GDP,GDP_area
0,2001,Alaska,27358,Far West
1,2001,Alabama,118263,Southeast
2,2001,Arkansas,68574,Southeast
3,2001,Arizona,164263,Southwest
4,2001,California,1307880,Far West
5,2001,Colorado,177526,Rocky Mountain
6,2001,Connecticut,165434,New England
7,2001,District of Columbia,63223,Mideast
8,2001,Delaware,45049,Mideast
9,2001,Florida,496861,Southeast


# GDP 04 - 06 Data

In [18]:
# Create the YearlyGDP instance for the third PDF.
# Arguments: source PDF, then 11, 2004 and 2006 -- presumably the PDF page holding the
# table and the first/last year to keep; TODO confirm against YearlyGDP in gdpPDFCleaning.
GDP04_06 = YearlyGDP('GDP2004-2006.pdf', 11, 2004, 2006)

In [19]:
# Sanity check before any PDF is read: a freshly created instance is expected to
# hold an empty DataFrame in .df.
GDP04_06.df

In [20]:
# initializeDataFrame() reads in the PDF, assigns the first DataFrame in the list as
# the df (only one table per page), renames the columns, and returns the DataFrame
# for viewing.
# In this PDF two adjacent years come back merged into one column: '2005 2006'
# (GDP, as comma-formatted text) and '2005 2006.1' (percentages). They are split
# apart in the following cells.
GDP04_06.initializeDataFrame()

Unnamed: 0,Area,2004,2005 2006,2007,2004.0,2005 2006.1,Percentage of U.S. total
0,United States...............,11607041,"12,346,871 13,119,938",13743021,100.0,100.0 100.0,100.0
1,New England..................,647473,"674,562 712,051",744672,5.6,5.5 5.4,5.4
2,Connecticut.....................,182112,"193,281 204,964",216266,1.6,1.6 1.6,1.6
3,Maine............................,43191,"44,364 46,340",48108,0.4,0.4 0.4,0.4
4,Massachusetts..................,306827,"317,626 335,313",351514,2.6,2.6 2.6,2.6
5,New Hampshire...............,51432,"53,468 56,073",57341,0.4,0.4 0.4,0.4
6,Rhode Island..................,42073,"43,078 45,733",46900,0.4,0.3 0.3,0.3
7,Vermont........................,21839,"22,745 23,628",24543,0.2,0.2 0.2,0.2
8,Mideast...........................,2124891,"2,245,718 2,390,856",2522240,18.3,18.2 18.2,18.4
9,Delaware........................,52305,"57,334 59,589",60118,0.5,0.5 0.5,0.4


In [21]:
# Column position 2 (0-based, with 'Area' as 0) is the merged '2005 2006' GDP column.
# Split it into new '2005' and '2006' columns; the thousands separators are gone in
# the new columns and the merged column itself is left in place.
GDP04_06.splitColumns(2, '2005', '2006')

Unnamed: 0,Area,2004,2005 2006,2007,2004.0,2005 2006.1,Percentage of U.S. total,2005,2006
0,United States...............,11607041,"12,346,871 13,119,938",13743021,100.0,100.0 100.0,100.0,12346871,13119938
1,New England..................,647473,"674,562 712,051",744672,5.6,5.5 5.4,5.4,674562,712051
2,Connecticut.....................,182112,"193,281 204,964",216266,1.6,1.6 1.6,1.6,193281,204964
3,Maine............................,43191,"44,364 46,340",48108,0.4,0.4 0.4,0.4,44364,46340
4,Massachusetts..................,306827,"317,626 335,313",351514,2.6,2.6 2.6,2.6,317626,335313
5,New Hampshire...............,51432,"53,468 56,073",57341,0.4,0.4 0.4,0.4,53468,56073
6,Rhode Island..................,42073,"43,078 45,733",46900,0.4,0.3 0.3,0.3,43078,45733
7,Vermont........................,21839,"22,745 23,628",24543,0.2,0.2 0.2,0.2,22745,23628
8,Mideast...........................,2124891,"2,245,718 2,390,856",2522240,18.3,18.2 18.2,18.4,2245718,2390856
9,Delaware........................,52305,"57,334 59,589",60118,0.5,0.5 0.5,0.4,57334,59589


In [22]:
# Column position 5 (0-based, with 'Area' as 0) is the merged '2005 2006.1'
# percentage column. Split it into new '2005 %' and '2006 %' columns.
GDP04_06.splitColumns(5, '2005 %', '2006 %')

Unnamed: 0,Area,2004,2005 2006,2007,2004.0,2005 2006.1,Percentage of U.S. total,2005,2006,2005 %,2006 %
0,United States...............,11607041,"12,346,871 13,119,938",13743021,100.0,100.0 100.0,100.0,12346871,13119938,100.0,100.0
1,New England..................,647473,"674,562 712,051",744672,5.6,5.5 5.4,5.4,674562,712051,5.5,5.4
2,Connecticut.....................,182112,"193,281 204,964",216266,1.6,1.6 1.6,1.6,193281,204964,1.6,1.6
3,Maine............................,43191,"44,364 46,340",48108,0.4,0.4 0.4,0.4,44364,46340,0.4,0.4
4,Massachusetts..................,306827,"317,626 335,313",351514,2.6,2.6 2.6,2.6,317626,335313,2.6,2.6
5,New Hampshire...............,51432,"53,468 56,073",57341,0.4,0.4 0.4,0.4,53468,56073,0.4,0.4
6,Rhode Island..................,42073,"43,078 45,733",46900,0.4,0.3 0.3,0.3,43078,45733,0.3,0.3
7,Vermont........................,21839,"22,745 23,628",24543,0.2,0.2 0.2,0.2,22745,23628,0.2,0.2
8,Mideast...........................,2124891,"2,245,718 2,390,856",2522240,18.3,18.2 18.2,18.4,2245718,2390856,18.2,18.2
9,Delaware........................,52305,"57,334 59,589",60118,0.5,0.5 0.5,0.4,57334,59589,0.5,0.5


In [23]:
# Strip the dot leaders from the 'Area' labels. The cleaned names are added as a new
# 'Cleaned Area' column; the original 'Area' column is kept.
GDP04_06.cleanStates('Area')

Unnamed: 0,Area,2004,2005 2006,2007,2004.0,2005 2006.1,Percentage of U.S. total,2005,2006,2005 %,2006 %,Cleaned Area
0,United States...............,11607041,"12,346,871 13,119,938",13743021,100.0,100.0 100.0,100.0,12346871,13119938,100.0,100.0,United States
1,New England..................,647473,"674,562 712,051",744672,5.6,5.5 5.4,5.4,674562,712051,5.5,5.4,New England
2,Connecticut.....................,182112,"193,281 204,964",216266,1.6,1.6 1.6,1.6,193281,204964,1.6,1.6,Connecticut
3,Maine............................,43191,"44,364 46,340",48108,0.4,0.4 0.4,0.4,44364,46340,0.4,0.4,Maine
4,Massachusetts..................,306827,"317,626 335,313",351514,2.6,2.6 2.6,2.6,317626,335313,2.6,2.6,Massachusetts
5,New Hampshire...............,51432,"53,468 56,073",57341,0.4,0.4 0.4,0.4,53468,56073,0.4,0.4,New Hampshire
6,Rhode Island..................,42073,"43,078 45,733",46900,0.4,0.3 0.3,0.3,43078,45733,0.3,0.3,Rhode Island
7,Vermont........................,21839,"22,745 23,628",24543,0.2,0.2 0.2,0.2,22745,23628,0.2,0.2,Vermont
8,Mideast...........................,2124891,"2,245,718 2,390,856",2522240,18.3,18.2 18.2,18.4,2245718,2390856,18.2,18.2,Mideast
9,Delaware........................,52305,"57,334 59,589",60118,0.5,0.5 0.5,0.4,57334,59589,0.5,0.5,Delaware


In [24]:
# Add a 'Geo Loc' column giving the region each state belongs to
# (e.g. Delaware -> Mideast). Aggregate rows such as United States,
# New England and Mideast are left without a value.
GDP04_06.addGeoLocColumn('Cleaned Area')

Unnamed: 0,Area,2004,2005 2006,2007,2004.0,2005 2006.1,Percentage of U.S. total,2005,2006,2005 %,2006 %,Cleaned Area,Geo Loc
0,United States...............,11607041,"12,346,871 13,119,938",13743021,100.0,100.0 100.0,100.0,12346871,13119938,100.0,100.0,United States,
1,New England..................,647473,"674,562 712,051",744672,5.6,5.5 5.4,5.4,674562,712051,5.5,5.4,New England,
2,Connecticut.....................,182112,"193,281 204,964",216266,1.6,1.6 1.6,1.6,193281,204964,1.6,1.6,Connecticut,New England
3,Maine............................,43191,"44,364 46,340",48108,0.4,0.4 0.4,0.4,44364,46340,0.4,0.4,Maine,New England
4,Massachusetts..................,306827,"317,626 335,313",351514,2.6,2.6 2.6,2.6,317626,335313,2.6,2.6,Massachusetts,New England
5,New Hampshire...............,51432,"53,468 56,073",57341,0.4,0.4 0.4,0.4,53468,56073,0.4,0.4,New Hampshire,New England
6,Rhode Island..................,42073,"43,078 45,733",46900,0.4,0.3 0.3,0.3,43078,45733,0.3,0.3,Rhode Island,New England
7,Vermont........................,21839,"22,745 23,628",24543,0.2,0.2 0.2,0.2,22745,23628,0.2,0.2,Vermont,New England
8,Mideast...........................,2124891,"2,245,718 2,390,856",2522240,18.3,18.2 18.2,18.4,2245718,2390856,18.2,18.2,Mideast,
9,Delaware........................,52305,"57,334 59,589",60118,0.5,0.5 0.5,0.4,57334,59589,0.5,0.5,Delaware,Mideast


In [25]:
# Reshape the wide table into long format: one row per year and state with the
# columns year, state, GDP and GDP_area (the state's region).
# NOTE(review): only individual states appear in the rows shown -- presumably the
# national and regional aggregate rows are dropped here; confirm.
GDP04_06.restructureDataFrame('Cleaned Area')

Unnamed: 0,year,state,GDP,GDP_area
0,2004,Alaska,35102,Far West
1,2004,Alabama,141527,Southeast
2,2004,Arkansas,82137,Southeast
3,2004,Arizona,193448,Southwest
4,2004,California,1519443,Far West
5,2004,Colorado,197329,Rocky Mountain
6,2004,Connecticut,182112,New England
7,2004,District of Columbia,77913,Mideast
8,2004,Delaware,52305,Mideast
9,2004,Florida,607284,Southeast


In [26]:
# Confirm that the restructured long-format table is what is now stored on the instance.
GDP04_06.df

Unnamed: 0,year,state,GDP,GDP_area
0,2004,Alaska,35102,Far West
1,2004,Alabama,141527,Southeast
2,2004,Arkansas,82137,Southeast
3,2004,Arizona,193448,Southwest
4,2004,California,1519443,Far West
5,2004,Colorado,197329,Rocky Mountain
6,2004,Connecticut,182112,New England
7,2004,District of Columbia,77913,Mideast
8,2004,Delaware,52305,Mideast
9,2004,Florida,607284,Southeast
