# Population forecasting

1. Clean population dataframe
    - append missing data to population dataframe (24 cities)
2. Transform dataframe
3. Forecast population using **fb prophet**
4. Clean predictions
5. Create pickle
6. Test

## 1. Clean population dataframe

In [1]:
import pandas as pd
import numpy as np



In [2]:
population = pd.read_csv('https://raw.githubusercontent.com/Lambda-School-Labs/PT17_cityspire-a-ds/main/notebooks/datasets/data/population2010-2019/metropop_2010_2019.csv')

### Check population csv

In [3]:
print(population.shape)
population.head()

(415, 14)


Unnamed: 0,Metro-Area,State,Census,Estimate Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Abilene,TX,165252.0,165252.0,165585.0,166634.0,167442.0,167473.0,168342.0,169688.0,170017.0,170429.0,171150.0,172060.0
1,Akron,OH,703200.0,703196.0,703031.0,703200.0,702109.0,703621.0,704908.0,704382.0,703524.0,703987.0,703855.0,703479.0
2,Albany,GA,153857.0,154033.0,154145.0,154545.0,153976.0,152667.0,151949.0,150387.0,149137.0,148090.0,147840.0,146726.0
3,Albany-Lebanon,OR,116672.0,116681.0,116891.0,118164.0,118273.0,118405.0,119042.0,120236.0,122769.0,125035.0,127451.0,129749.0
4,Albany-Schenectady-Troy,NY,870716.0,870713.0,871082.0,872778.0,874698.0,877065.0,878113.0,879085.0,879792.0,882158.0,882263.0,880381.0


### Combine city state
- use explode to separate combined cities
- combine separated city and states

In [4]:
def explode_str(population, col='Metro-Area', sep='-'):
    s = population[col]
    i = np.arange(len(s)).repeat(s.str.count(sep) +1)
    return population.iloc[i].assign(**{col: sep.join(s).split(sep)})

population = explode_str(population)

In [5]:
population['Metro-Area'] = population['Metro-Area'].str.strip()

In [6]:
def explode_str(population, col='State', sep='-'):
    s = population[col]
    i = np.arange(len(s)).repeat(s.str.count(sep) +1)
    return population.iloc[i].assign(**{col: sep.join(s).split(sep)})

population = explode_str(population)

In [7]:
population['State'] = population['State'].str.strip()

In [8]:
print(population['Metro-Area'].nunique())
population['Metro-Area'].unique()

575


array(['Abilene', 'Akron', 'Albany', 'Lebanon', 'Schenectady', 'Troy',
       'Albuquerque', 'Alexandria', 'Allentown', 'Bethlehem', 'Easton',
       'Altoona', 'Amarillo', 'Ames', 'Anchorage', 'Ann Arbor',
       'Anniston', 'Oxford', 'Appleton', 'Asheville', 'Athens',
       'Clarke County', 'Atlanta', 'Sandy Springs', 'Alpharetta',
       'Atlantic City', 'Hammonton', 'Auburn', 'Opelika', 'Augusta',
       'Richmond County', 'Austin', 'Round Rock', 'Georgetown',
       'Bakersfield', 'Baltimore', 'Columbia', 'Towson', 'Bangor',
       'Barnstable Town', 'Baton Rouge', 'Battle Creek', 'Bay City',
       'Beaumont', 'Port Arthur', 'Beckley', 'Bellingham', 'Bend',
       'Billings', 'Binghamton', 'Birmingham', 'Hoover', 'Bismarck',
       'Blacksburg', 'Christiansburg', 'Bloomington', 'Bloomsburg',
       'Berwick', 'Boise City', 'Boston', 'Cambridge', 'Newton',
       'Framingham', 'Rockingham County', 'Strafford County', 'Boulder',
       'Bowling Green', 'Bremerton', 'Silverdale', '

In [9]:
population = population[population['Metro-Area'] != '']

In [10]:
print(population['State'].nunique())
population['State'].unique()

51


array(['TX', 'OH', 'GA', 'OR', 'NY', 'NM', 'LA', 'PA', 'NJ', 'IA', 'AK',
       'MI', 'AL', 'WI', 'NC', 'SC', 'CA', 'MD', 'ME', 'MA', 'WV', 'WA',
       'MT', 'ND', 'VA', 'IL', 'IN', 'ID', 'NH', 'CO', 'KY', 'CT', 'VT',
       'FL', 'MO', 'NV', 'WY', 'TN', 'DE', 'MN', 'OK', 'AR', 'AZ', 'NE',
       'MS', 'HI', 'KS', 'UT', 'RI', 'SD', 'DC'], dtype=object)

In [11]:
population['City,State'] = population['Metro-Area'] + ', ' + population['State']

In [12]:
population.shape

(750, 15)

### Drop duplicate rows and unused columns

In [13]:
population[population.duplicated(subset=['City,State'], keep=False)]

Unnamed: 0,Metro-Area,State,Census,Estimate Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,"City,State"
42,Boston,MA,4552402.0,4552595.0,4566348.0,4609790.0,4656593.0,4702877.0,4746931.0,4778340.0,4809061.0,4841772.0,4859536.0,4873019.0,"Boston, MA"
42,Cambridge,MA,4552402.0,4552595.0,4566348.0,4609790.0,4656593.0,4702877.0,4746931.0,4778340.0,4809061.0,4841772.0,4859536.0,4873019.0,"Cambridge, MA"
42,Newton,MA,4552402.0,4552595.0,4566348.0,4609790.0,4656593.0,4702877.0,4746931.0,4778340.0,4809061.0,4841772.0,4859536.0,4873019.0,"Newton, MA"
43,Boston,MA,1887792.0,1888025.0,1894717.0,1913107.0,1933463.0,1953625.0,1972382.0,1986314.0,2001862.0,2016866.0,2024799.0,2031884.0,"Boston, MA"
44,Cambridge,MA,2246244.0,2246215.0,2253142.0,2276355.0,2300205.0,2324767.0,2346646.0,2362297.0,2374511.0,2388396.0,2396653.0,2400733.0,"Cambridge, MA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,Arlington,WV,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Arlington, WV"
396,Alexandria,DC,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Alexandria, DC"
396,Alexandria,VA,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Alexandria, VA"
396,Alexandria,MD,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Alexandria, MD"


In [14]:
population = population.drop_duplicates(subset=['City,State' ], keep='last')
print(population.shape)

(706, 15)


In [15]:
population = population.drop(columns = ['Census', 'Estimate Base'])
population = population[['Metro-Area', 'State', 'City,State', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

In [16]:
population['Metro-Area'] = population['Metro-Area'].str.replace('Urban Honolulu', 'Honolulu')
population['City,State'] = population['City,State'].str.replace('Urban Honolulu, HI', 'Honolulu, HI')
population.loc[population['Metro-Area'] == 'Honolulu']

Unnamed: 0,Metro-Area,State,"City,State",2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
383,Honolulu,HI,"Honolulu, HI",956285.0,967336.0,977994.0,986059.0,987414.0,991064.0,992268.0,986353.0,979858.0,974563.0


### Add cities that are missing from main database

In [17]:
extended = pd.read_excel('csv/Extended_cities_2010_2019.xlsx')

In [18]:
extended = extended.rename(columns = {
                                                 2010: '2010',
                                                 2011: '2011',
                                                 2012: '2012',
                                                 2013: '2013',
                                                 2014: '2014',
                                                 2015: '2015',
                                                 2016: '2016',
                                                 2017: '2017',
                                                 2018: '2018',
                                                 2019: '2019'})

In [19]:
population = population.append(extended, ignore_index=True)

In [20]:
print(population.shape)
population.head()

(730, 13)


Unnamed: 0,Metro-Area,State,"City,State",2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Abilene,TX,"Abilene, TX",165585.0,166634.0,167442.0,167473.0,168342.0,169688.0,170017.0,170429.0,171150.0,172060.0
1,Akron,OH,"Akron, OH",703031.0,703200.0,702109.0,703621.0,704908.0,704382.0,703524.0,703987.0,703855.0,703479.0
2,Albany,GA,"Albany, GA",154145.0,154545.0,153976.0,152667.0,151949.0,150387.0,149137.0,148090.0,147840.0,146726.0
3,Albany,OR,"Albany, OR",116891.0,118164.0,118273.0,118405.0,119042.0,120236.0,122769.0,125035.0,127451.0,129749.0
4,Lebanon,OR,"Lebanon, OR",116891.0,118164.0,118273.0,118405.0,119042.0,120236.0,122769.0,125035.0,127451.0,129749.0


In [21]:
population.tail()

Unnamed: 0,Metro-Area,State,"City,State",2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
725,Porterville,CA,"Porterville, CA",54165.0,53531.0,53531.0,54534.0,54534.0,55218.0,58472.0,58782.0,59797.0,59797.0
726,Brookings,SD,"Brookings, SD",22056.0,21767.0,22065.0,22356.0,22645.0,22974.0,23292.0,23471.0,23863.0,24108.0
727,Fairfield,CA,"Fairfield, CA",105321.0,104404.0,105407.0,106533.0,107983.0,109468.0,110953.0,112790.0,114101.0,115282.0
728,Scottsdale,AZ,"Scottsdale, AZ",258069.0,219311.0,219867.0,221283.0,223519.0,227471.0,234495.0,239283.0,246026.0,250602.0
729,Pascagoula,MS,"Pascagoula, MS",22392.0,22765.0,22523.0,22372.0,22239.0,22230.0,234495.0,21935.0,21865.0,21809.0


In [22]:
population.to_csv('csv/population_cleaned.csv', index=False)

## 2. Stack dataframe 
- this is to try the groupby so I don't have to create separate csv's and run each city csv separately
- https://stackoverflow.com/questions/64179626/stack-unstack-melt-pivot-transpose-what-is-the-simple-method-to-convert-mul

In [23]:
population_melt = population[['City,State', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

In [24]:
population_melt['City,State'] = population_melt['City,State'].astype(str)
population_melt[['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']] = population_melt[['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_melt['City,State'] = population_melt['City,State'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [25]:
population_melt = (population.melt(id_vars=['City,State'],
                    var_name = 'ds',
                    value_name = 'y'
                    ).reset_index(drop=True))

In [26]:
population_melt = population_melt[population_melt['ds'] != 'State']
population_melt = population_melt[population_melt['ds'] != 'Metro-Area']

In [27]:
population_melt.head()

Unnamed: 0,"City,State",ds,y
1460,"Abilene, TX",2010,165585
1461,"Akron, OH",2010,703031
1462,"Albany, GA",2010,154145
1463,"Albany, OR",2010,116891
1464,"Lebanon, OR",2010,116891


In [28]:
population_melt.to_csv('csv/population_melt.csv', index=False)

## 3. Forecast population using **fb prophet**

In [29]:
from fbprophet import Prophet
from fbprophet.plot import add_changepoints_to_plot

In [30]:
cities_list = list(population['City,State'])

In [31]:
def rnd_series(city):
    subset = population_melt[population_melt['City,State']== city]
    dates = (pd.DataFrame({'ds': pd.to_datetime(population_melt['ds'])}))
    return subset

In [32]:
series = [rnd_series(city) for city in cities_list]

In [33]:
len(series)

730

In [34]:
series[0]

Unnamed: 0,"City,State",ds,y
1460,"Abilene, TX",2010,165585
2190,"Abilene, TX",2011,166634
2920,"Abilene, TX",2012,167442
3650,"Abilene, TX",2013,167473
4380,"Abilene, TX",2014,168342
5110,"Abilene, TX",2015,169688
5840,"Abilene, TX",2016,170017
6570,"Abilene, TX",2017,170429
7300,"Abilene, TX",2018,171150
8030,"Abilene, TX",2019,172060


In [35]:
def run_prophet(series):
    model = Prophet(daily_seasonality=False,
                    weekly_seasonality=False,
                    yearly_seasonality=False)
    model.fit(series)
    forecast = model.make_future_dataframe(periods=10, freq='Y')
    forecast = model.predict(forecast)
    forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
    forecast['City,State'] = series['City,State'].iloc[0]
    forecast = forecast[['City,State','ds', 'yhat', 'yhat_lower', 'yhat_upper']]
    forecast['ds'] = pd.DatetimeIndex(forecast['ds']).year
    forecast[['yhat', 'yhat_lower', 'yhat_upper']] = forecast[['yhat', 'yhat_lower', 'yhat_upper']].astype(float)
    forecast[['yhat', 'yhat_lower', 'yhat_upper']] = forecast[['yhat', 'yhat_lower', 'yhat_upper']].apply(np.ceil)
    return forecast

In [36]:
f = run_prophet(series[0])
f.head()

INFO:fbprophet:n_changepoints greater than number of observations. Using 7.


Unnamed: 0,"City,State",ds,yhat,yhat_lower,yhat_upper
0,"Abilene, TX",2010,165586.0,165555.0,165618.0
1,"Abilene, TX",2011,166634.0,166601.0,166663.0
2,"Abilene, TX",2012,167441.0,167409.0,167472.0
3,"Abilene, TX",2013,167475.0,167443.0,167504.0
4,"Abilene, TX",2014,168344.0,168311.0,168374.0


In [37]:
for i in range(len(series)):
    f = run_prophet(series[i])
    f.to_csv('csv/population_prediction.csv', mode='a', index='False')

INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbprophet:n_changepoints greater than number of observations. Using 7.
INFO:fbproph

In [38]:
predictions = pd.read_csv('csv/population_prediction.csv')

In [39]:
predictions.head()

Unnamed: 0.1,Unnamed: 0,"City,State",ds,yhat,yhat_lower,yhat_upper
0,0.0,"Abilene, TX",2010,165586.0,165552.0,165617.0
1,1.0,"Abilene, TX",2011,166634.0,166602.0,166664.0
2,2.0,"Abilene, TX",2012,167441.0,167408.0,167474.0
3,3.0,"Abilene, TX",2013,167475.0,167443.0,167508.0
4,4.0,"Abilene, TX",2014,168344.0,168313.0,168377.0


## 4. Clean Predictions
- clean predictions.df
- create pivot table to show in long format

In [40]:
predictions = predictions.loc[predictions['City,State'] != 'City,State']

In [41]:
predictions[['yhat','yhat_lower', 'yhat_upper']] = predictions[['yhat','yhat_lower', 'yhat_upper']].astype(float) 
predictions['year'] = pd.DatetimeIndex(predictions['ds']).year
predictions = predictions[['City,State', 'year', 'yhat', 'yhat_lower', 'yhat_upper']]

In [42]:
print(predictions.shape)
predictions.head()

(14600, 5)


Unnamed: 0,"City,State",year,yhat,yhat_lower,yhat_upper
0,"Abilene, TX",2010,165586.0,165552.0,165617.0
1,"Abilene, TX",2011,166634.0,166602.0,166664.0
2,"Abilene, TX",2012,167441.0,167408.0,167474.0
3,"Abilene, TX",2013,167475.0,167443.0,167508.0
4,"Abilene, TX",2014,168344.0,168313.0,168377.0


In [43]:
predictions.to_csv('csv/population_prediction.csv', index=False)

In [44]:
population_predictions = predictions[['City,State', 'year', 'yhat']]

In [45]:
population_predictions = population_predictions.pivot_table(index = 'City,State',
                                                          columns = 'year',
                                                          values = 'yhat')

In [46]:
print(population_predictions.shape)
population_predictions.head()

(730, 19)


year,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028
"City,State",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
"Abilene, TX",165586.0,166634.0,167441.0,167475.0,168344.0,169686.0,170019.0,170398.0,171214.0,172435.5,173659.0,174474.0,175289.0,176105.0,176922.0,177737.0,178552.0,179368.0,180185.0
"Akron, OH",703032.0,703200.0,702111.0,703621.0,704908.0,704382.0,703526.0,704028.0,703774.0,703393.5,703013.0,702760.0,702506.0,702252.0,701998.0,701744.0,701491.0,701237.0,700983.0
"Albany, GA",154161.0,154530.0,153945.0,152730.0,151887.0,150419.0,149137.0,148242.0,147553.0,146519.0,145483.0,144793.0,144103.0,143413.0,142722.0,142032.0,141342.0,140652.0,139960.0
"Albany, NY",871069.0,872792.0,874726.0,877038.0,878113.0,879058.0,879847.0,882460.0,881592.0,880291.0,878988.0,878119.0,877251.0,876383.0,875512.0,874644.0,873776.0,872908.0,872037.0
"Albany, OR",116892.0,118164.0,118274.0,118405.0,119043.0,120237.0,122768.0,125056.0,127412.0,130944.0,134482.0,136838.0,139195.0,141551.0,143914.0,146271.0,148627.0,150984.0,153347.0


In [47]:
population_predictions.to_csv('csv/population_prediction_long.csv', index=False)