In [59]:
import os 
import pandas as pd
from pandas._testing import assert_frame_equal

In [60]:
# Test that the state-level gdp data matches a manually created csv representation of pdf data for 1995-2001

# Import manually created csv
gdpTestingData = pd.read_csv('gdp_test_data.csv')

# Import Katie's output
yearlyGDPbyState = pd.read_csv('../DataSet/yearlyGDPbyState.csv')

In [61]:
# Filter out years after 2001, drop the area column, sort by year then state, and drop added index 
yearlyGDPbyState_toTest = yearlyGDPbyState[yearlyGDPbyState['year'] < 2002].drop(columns = ['GDP_area']).sort_values(['year','state']).reset_index(drop=True).drop(columns = ['Unnamed: 0'])

yearlyGDPbyState_toTest.tail(100)

Unnamed: 0,year,state,current dollars,GDP
257,2000,Arizona,2001,153469.0
258,2000,Arkansas,2001,66793.0
259,2000,California,2001,1330025.0
260,2000,Colorado,2001,169341.0
261,2000,Connecticut,2001,161929.0
...,...,...,...,...
352,2001,Virginia,2004,277214.0
353,2001,Washington,2004,225656.0
354,2001,West Virginia,2004,43512.0
355,2001,Wisconsin,2004,182373.0


In [62]:
# Testing found that spaces need to be stripped in manual data 
gdpTestingData['state'] = gdpTestingData['state'].str.strip()

gdpTestingData.head()

Unnamed: 0,year,state,current dollars,GDP
0,1995,Alabama,2001,95514
1,1995,Alaska,2001,24791
2,1995,Arizona,2001,104586
3,1995,Arkansas,2001,53809
4,1995,California,2001,925931


In [63]:
# Testing found that the scraping script was overwriting preliminary dollar values with updated values from later years. 
# Reflect that change in the dataframes by dropping 2001 from the comparison.
yearlyGDPbyState_toTest = yearlyGDPbyState_toTest[yearlyGDPbyState_toTest['year'] < 2001]

# Check the last values to confirm it was dropped correctly
#yearlyGDPbyState_toTest.tail()

# Filter out 2001 data from testing dataset
gdpTestingData = gdpTestingData[gdpTestingData['year'] < 2001]
gdpTestingData.tail()

# Drop commas and convert to float
gdpTestingData['GDP'] = gdpTestingData['GDP'].str.replace(',', '').astype(float)

In [64]:
# Use the assert datatrame here
# Documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html

assert_frame_equal(yearlyGDPbyState_toTest, gdpTestingData)

In [65]:
# No differences indicate that the dataframes match, and that the scraping provided correct output.