# Population forecasting

1. Clean population dataframe
2. Stack dataframe
3. Forecast population using **fb prophet**

## 1. Clean population dataframe

In [1]:
import pandas as pd
import numpy as np



In [2]:
population = pd.read_csv('https://raw.githubusercontent.com/Lambda-School-Labs/PT17_cityspire-a-ds/main/notebooks/datasets/data/population2010-2019/metropop_2010_2019.csv')

### Check population csv

In [3]:
print(population.shape)
population.head()

(415, 14)


Unnamed: 0,Metro-Area,State,Census,Estimate Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Abilene,TX,165252.0,165252.0,165585.0,166634.0,167442.0,167473.0,168342.0,169688.0,170017.0,170429.0,171150.0,172060.0
1,Akron,OH,703200.0,703196.0,703031.0,703200.0,702109.0,703621.0,704908.0,704382.0,703524.0,703987.0,703855.0,703479.0
2,Albany,GA,153857.0,154033.0,154145.0,154545.0,153976.0,152667.0,151949.0,150387.0,149137.0,148090.0,147840.0,146726.0
3,Albany-Lebanon,OR,116672.0,116681.0,116891.0,118164.0,118273.0,118405.0,119042.0,120236.0,122769.0,125035.0,127451.0,129749.0
4,Albany-Schenectady-Troy,NY,870716.0,870713.0,871082.0,872778.0,874698.0,877065.0,878113.0,879085.0,879792.0,882158.0,882263.0,880381.0


### Combine city state
- use explode to separate combined cities
- combine separated city and states

In [4]:
def explode_str(population, col='Metro-Area', sep='-'):
    """Split a delimited string column into one row per piece.

    Every row is repeated once for each ``sep``-separated piece found in
    ``population[col]``. All other columns are copied unchanged and the
    original index labels are kept, so they repeat for exploded rows.

    Parameters
    ----------
    population : pandas.DataFrame
        Frame containing the string column ``col``.
    col : str
        Name of the column to split.
    sep : str
        Separator, treated as a literal string (not a regular expression).

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per piece; ``population`` is not modified.
    """
    import re

    s = population[col]
    if s.empty:
        # ''.split(sep) yields [''], which cannot align with zero rows.
        return population.copy()
    # Series.str.count interprets its pattern as a regex, whereas join/split
    # below are literal; escape so both agree for separators like '|' or '.'.
    pieces_per_row = (s.str.count(re.escape(sep)) + 1).to_numpy()
    i = np.arange(len(s)).repeat(pieces_per_row)
    return population.iloc[i].assign(**{col: sep.join(s).split(sep)})

population = explode_str(population)

In [5]:
population['Metro-Area'] = population['Metro-Area'].str.strip()

In [6]:
def explode_str(population, col='State', sep='-'):
    """Split a delimited string column into one row per piece.

    Same logic as the earlier ``explode_str`` cell; this redefinition only
    changes the default column to ``'State'`` and shadows the earlier one.

    Every row is repeated once for each ``sep``-separated piece found in
    ``population[col]``. All other columns are copied unchanged and the
    original index labels are kept, so they repeat for exploded rows.

    Parameters
    ----------
    population : pandas.DataFrame
        Frame containing the string column ``col``.
    col : str
        Name of the column to split.
    sep : str
        Separator, treated as a literal string (not a regular expression).

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per piece; ``population`` is not modified.
    """
    import re

    s = population[col]
    if s.empty:
        # ''.split(sep) yields [''], which cannot align with zero rows.
        return population.copy()
    # Series.str.count interprets its pattern as a regex, whereas join/split
    # below are literal; escape so both agree for separators like '|' or '.'.
    pieces_per_row = (s.str.count(re.escape(sep)) + 1).to_numpy()
    i = np.arange(len(s)).repeat(pieces_per_row)
    return population.iloc[i].assign(**{col: sep.join(s).split(sep)})

population = explode_str(population)

In [7]:
population['State'] = population['State'].str.strip()

In [8]:
print(population['Metro-Area'].nunique())
population['Metro-Area'].unique()

575


array(['Abilene', 'Akron', 'Albany', 'Lebanon', 'Schenectady', 'Troy',
       'Albuquerque', 'Alexandria', 'Allentown', 'Bethlehem', 'Easton',
       'Altoona', 'Amarillo', 'Ames', 'Anchorage', 'Ann Arbor',
       'Anniston', 'Oxford', 'Appleton', 'Asheville', 'Athens',
       'Clarke County', 'Atlanta', 'Sandy Springs', 'Alpharetta',
       'Atlantic City', 'Hammonton', 'Auburn', 'Opelika', 'Augusta',
       'Richmond County', 'Austin', 'Round Rock', 'Georgetown',
       'Bakersfield', 'Baltimore', 'Columbia', 'Towson', 'Bangor',
       'Barnstable Town', 'Baton Rouge', 'Battle Creek', 'Bay City',
       'Beaumont', 'Port Arthur', 'Beckley', 'Bellingham', 'Bend',
       'Billings', 'Binghamton', 'Birmingham', 'Hoover', 'Bismarck',
       'Blacksburg', 'Christiansburg', 'Bloomington', 'Bloomsburg',
       'Berwick', 'Boise City', 'Boston', 'Cambridge', 'Newton',
       'Framingham', 'Rockingham County', 'Strafford County', 'Boulder',
       'Bowling Green', 'Bremerton', 'Silverdale', '

In [9]:
population = population[population['Metro-Area'] != '']

In [10]:
print(population['State'].nunique())
population['State'].unique()

51


array(['TX', 'OH', 'GA', 'OR', 'NY', 'NM', 'LA', 'PA', 'NJ', 'IA', 'AK',
       'MI', 'AL', 'WI', 'NC', 'SC', 'CA', 'MD', 'ME', 'MA', 'WV', 'WA',
       'MT', 'ND', 'VA', 'IL', 'IN', 'ID', 'NH', 'CO', 'KY', 'CT', 'VT',
       'FL', 'MO', 'NV', 'WY', 'TN', 'DE', 'MN', 'OK', 'AR', 'AZ', 'NE',
       'MS', 'HI', 'KS', 'UT', 'RI', 'SD', 'DC'], dtype=object)

In [11]:
# Build the "City, State" key that identifies each series in the rest of the notebook.
# NOTE(review): city and state were exploded independently, so a multi-state
# metro yields every city x state pairing (the duplicate listing below shows
# e.g. 'Arlington, WV' and 'Alexandria, DC/VA/MD') -- confirm this is intended.
population['City,State'] = population['Metro-Area'] + ', ' + population['State']

In [12]:
population.shape

(750, 15)

### Drop duplicate rows and unused columns

In [13]:
population[population.duplicated(subset=['City,State'], keep=False)]

Unnamed: 0,Metro-Area,State,Census,Estimate Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,"City,State"
42,Boston,MA,4552402.0,4552595.0,4566348.0,4609790.0,4656593.0,4702877.0,4746931.0,4778340.0,4809061.0,4841772.0,4859536.0,4873019.0,"Boston, MA"
42,Cambridge,MA,4552402.0,4552595.0,4566348.0,4609790.0,4656593.0,4702877.0,4746931.0,4778340.0,4809061.0,4841772.0,4859536.0,4873019.0,"Cambridge, MA"
42,Newton,MA,4552402.0,4552595.0,4566348.0,4609790.0,4656593.0,4702877.0,4746931.0,4778340.0,4809061.0,4841772.0,4859536.0,4873019.0,"Newton, MA"
43,Boston,MA,1887792.0,1888025.0,1894717.0,1913107.0,1933463.0,1953625.0,1972382.0,1986314.0,2001862.0,2016866.0,2024799.0,2031884.0,"Boston, MA"
44,Cambridge,MA,2246244.0,2246215.0,2253142.0,2276355.0,2300205.0,2324767.0,2346646.0,2362297.0,2374511.0,2388396.0,2396653.0,2400733.0,"Cambridge, MA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,Arlington,WV,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Arlington, WV"
396,Alexandria,DC,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Alexandria, DC"
396,Alexandria,VA,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Alexandria, VA"
396,Alexandria,MD,4444378.0,4445001.0,4468671.0,4553850.0,4634218.0,4707011.0,4763947.0,4816135.0,4863456.0,4913897.0,4944420.0,4970252.0,"Alexandria, MD"


In [14]:
# Keep a single row per 'City,State' key; keep='last' retains the final occurrence.
# NOTE(review): duplicated keys can carry different population figures (see the
# two 'Boston, MA' rows listed above), so the surviving values depend on row
# order -- confirm that the last occurrence is the one wanted.
population = population.drop_duplicates(subset=['City,State' ], keep='last')
print(population.shape)

(706, 15)


In [15]:
# Keep only the key and the yearly estimates. Selecting the wanted columns
# already discards 'Census', 'Estimate Base', 'Metro-Area' and 'State'; a
# separate drop() would raise KeyError if this cell were run a second time.
population = population[['City,State', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

In [16]:
print(population.shape)
population.head()

(706, 11)


Unnamed: 0,"City,State",2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,"Abilene, TX",165585.0,166634.0,167442.0,167473.0,168342.0,169688.0,170017.0,170429.0,171150.0,172060.0
1,"Akron, OH",703031.0,703200.0,702109.0,703621.0,704908.0,704382.0,703524.0,703987.0,703855.0,703479.0
2,"Albany, GA",154145.0,154545.0,153976.0,152667.0,151949.0,150387.0,149137.0,148090.0,147840.0,146726.0
3,"Albany, OR",116891.0,118164.0,118273.0,118405.0,119042.0,120236.0,122769.0,125035.0,127451.0,129749.0
3,"Lebanon, OR",116891.0,118164.0,118273.0,118405.0,119042.0,120236.0,122769.0,125035.0,127451.0,129749.0


## 2. Stack dataframe 
- Stack (melt) the wide table into long format so that every city can be forecast from one dataframe, instead of creating a separate CSV per city and running each one individually
- https://stackoverflow.com/questions/64179626/stack-unstack-melt-pivot-transpose-what-is-the-simple-method-to-convert-mul

In [17]:
# Reshape from wide (one column per year) to long (one row per city-year).
# The year label goes into 'ds' and the population value into 'y'.
melted = population.melt(id_vars=['City,State'], var_name='ds', value_name='y')
population_melt = melted.reset_index(drop=True)

In [18]:
population_melt

Unnamed: 0,"City,State",ds,y
0,"Abilene, TX",2010,165585.0
1,"Akron, OH",2010,703031.0
2,"Albany, GA",2010,154145.0
3,"Albany, OR",2010,116891.0
4,"Lebanon, OR",2010,116891.0
...,...,...,...
7055,"Warren, PA",2019,536081.0
7056,"Boardman, OH",2019,536081.0
7057,"Boardman, PA",2019,536081.0
7058,"Yuba City, CA",2019,175639.0


## 3. Forecast population using **fb prophet**

In [19]:
from fbprophet import Prophet
from fbprophet.plot import add_changepoints_to_plot

In [20]:
cities_list = list(population['City,State'])

In [21]:
def rnd_series(city, data=None):
    """Return the long-format rows belonging to a single city.

    Parameters
    ----------
    city : str
        Value to match against the 'City,State' column.
    data : pandas.DataFrame, optional
        Long-format frame with a 'City,State' column. Defaults to the
        notebook-level ``population_melt``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels; empty if the
        city is not present.
    """
    if data is None:
        data = population_melt
    # The previous version also built a datetime frame from the entire melted
    # table on every call and then discarded it; that dead work is removed.
    return data[data['City,State'] == city]

In [22]:
series = [rnd_series(city) for city in cities_list]

In [23]:
len(series)

706

In [24]:
series[0]

Unnamed: 0,"City,State",ds,y
0,"Abilene, TX",2010,165585.0
706,"Abilene, TX",2011,166634.0
1412,"Abilene, TX",2012,167442.0
2118,"Abilene, TX",2013,167473.0
2824,"Abilene, TX",2014,168342.0
3530,"Abilene, TX",2015,169688.0
4236,"Abilene, TX",2016,170017.0
4942,"Abilene, TX",2017,170429.0
5648,"Abilene, TX",2018,171150.0
6354,"Abilene, TX",2019,172060.0


In [25]:
def run_prophet(series, periods=10, freq='YS'):
    """Fit Prophet to one city's history and return a yearly forecast table.

    Parameters
    ----------
    series : pandas.DataFrame
        Rows for a single city with columns 'City,State', 'ds' and 'y'.
    periods : int
        Number of future periods to forecast beyond the last observation.
    freq : str
        pandas offset alias for the future dates. The default 'YS'
        (year start) lines up with the 1 January dates that bare year labels
        such as '2010' parse to. The year-end alias 'Y' would start at 31 Dec
        of the last observed year, so after reducing 'ds' to a year that year
        appears twice and every forecast is labelled one year early.

    Returns
    -------
    pandas.DataFrame
        Columns 'City,State', 'ds' (calendar year), 'yhat', 'yhat_lower' and
        'yhat_upper' (each rounded up), covering the history and the forecast.
    """
    model = Prophet(daily_seasonality=False,
                    weekly_seasonality=False,
                    yearly_seasonality=False)
    model.fit(series)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    prediction_cols = ['yhat', 'yhat_lower', 'yhat_upper']
    # Copy the column subset so the assignments below act on an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    forecast = model.predict(future)[['ds'] + prediction_cols].copy()
    forecast.insert(0, 'City,State', series['City,State'].iloc[0])
    forecast['ds'] = pd.DatetimeIndex(forecast['ds']).year
    forecast[prediction_cols] = forecast[prediction_cols].apply(np.ceil)
    return forecast

In [26]:
f = run_prophet(series[0])
f.head()

INFO:fbprophet:n_changepoints greater than number of observations. Using 7.


Unnamed: 0,"City,State",ds,yhat,yhat_lower,yhat_upper
0,"Abilene, TX",2010,165586.0,165556.0,165620.0
1,"Abilene, TX",2011,166634.0,166602.0,166664.0
2,"Abilene, TX",2012,167441.0,167409.0,167472.0
3,"Abilene, TX",2013,167475.0,167447.0,167508.0
4,"Abilene, TX",2014,168344.0,168310.0,168375.0


In [27]:
from time import time, ctime

In [None]:
# Fit one Prophet model per city and save every forecast to a single CSV.
# time() gives numeric timestamps that can be subtracted; ctime() returns
# strings and is used only to format them for display.
start = time()

# Collect the forecasts and write the file once. Appending per city with
# mode='a' repeated the header row for every city and kept growing a stale
# file on re-run; index='False' was a truthy string, so the index was written.
forecasts = [run_prophet(city_series) for city_series in series]
pd.concat(forecasts, ignore_index=True).to_csv('population_prediction.csv', index=False)

end = time()
total_time = end - start
print(f'{total_time:.1f} s = {ctime(end)} - {ctime(start)}')

In [None]:
predictions = pd.read_csv('population_prediction.csv')

In [None]:
# Show any header rows that ended up inside the CSV body. The frame loaded
# above is named `predictions`; `prediction` was never defined.
predictions.loc[predictions['City,State'] == 'City,State']

In [None]:
# Drop header rows embedded in the CSV body (the frame is `predictions`;
# `prediction` was never defined), then restore numeric dtypes: embedded
# header strings force every column to object, which would break the pivot.
predictions = predictions[predictions['City,State'] != 'City,State'].copy()
numeric_cols = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
predictions[numeric_cols] = predictions[numeric_cols].apply(pd.to_numeric)

In [None]:
# Persist the loaded forecast table. It is held in `predictions`;
# `prediction` was never defined.
predictions.to_csv('population_prediction.csv', index=False)

In [None]:
population_prediction = predictions[['City,State', 'ds', 'yhat']]

In [None]:
population_prediction = population_prediction.pivot_table(index = 'City,State',
                                                          columns = 'ds',
                                                          values = 'yhat')

In [None]:
population_prediction.head()

In [None]:
# After pivot_table the 'City,State' labels live in the index, so the index
# must be written; index=False would save the yearly values with no city names.
population_prediction.to_csv('population_prediction_long.csv')