In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
migration = pd.read_csv(Path('Resources/state_to_state.csv'))
migration

Unnamed: 0,IBRC_Geo_ID,State,District Name,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
0,6500001,AK,Kenai Peninsula Economic Development District,1990,195,55,3.0,155,0
1,6500001,AK,Kenai Peninsula Economic Development District,1991,766,146,18.0,800,-9
2,6500001,AK,Kenai Peninsula Economic Development District,1992,736,151,43.0,129,-15
3,6500001,AK,Kenai Peninsula Economic Development District,1993,703,194,42.0,70,-12
4,6500001,AK,Kenai Peninsula Economic Development District,1994,693,192,26.0,778,-24
...,...,...,...,...,...,...,...,...,...
12842,6500418,MT,Mission West Community Development Partners,2016,493,520,9.0,367,0
12843,6500418,MT,Mission West Community Development Partners,2017,528,535,29.0,872,2
12844,6500418,MT,Mission West Community Development Partners,2018,462,512,11.0,227,-3
12845,6500418,MT,Mission West Community Development Partners,2019,428,489,27.0,589,-5


In [3]:
# Grouping by the 'State' column
grouped_by_state = migration.groupby('State')

# Now, you can perform operations on each group, for example, calculating the mean
by_state = grouped_by_state.sum()

slice_year = migration[(migration['Year'] >= 2000) & (migration['Year'] <= 2019)]
slice_year_prediction = migration[(migration['Year'] >= 2020) & (migration['Year'] <= 2024)]
slice_year = slice_year.drop(['District Name','IBRC_Geo_ID'], axis=1)
slice_year

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
11,AK,2000,135,80,3.0,-99,-2
12,AK,2001,622,278,20.0,81,-57
13,AK,2002,615,293,17.0,487,-40
14,AK,2003,643,317,1.0,-150,-76
15,AK,2004,620,312,32.0,97,-39
...,...,...,...,...,...,...,...
12841,MT,2015,551,501,-6.0,186,-4
12842,MT,2016,493,520,9.0,367,0
12843,MT,2017,528,535,29.0,872,2
12844,MT,2018,462,512,11.0,227,-3


In [4]:
grouped_df = slice_year.groupby(['State', 'Year']).sum()
grouped_df = grouped_df.reset_index()
#ca_df = grouped_df[grouped_df['State'] == 'CA']
#grouped_df['Percentage']=grouped_df['Net Domestic Migration'].pct_change()
#grouped_df = grouped_df.dropna()

In [5]:
import hvplot.pandas
grouped_df.hvplot(groupby='State',
                x='Year', 
                y='Net Domestic Migration',
                xlabel='Year', 
                ylabel='Domestic Migration',
                title='Domestic Migration Change by State').opts(yformatter='%.0f',
                                      active_tools=[])

In [6]:
grouped_df

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
0,AK,2000,455,188,61.0,-497,-9
1,AK,2001,1880,783,251.0,-2380,-108
2,AK,2002,1873,736,157.0,-521,-107
3,AK,2003,1871,770,-53.0,-1176,-119
4,AK,2004,1791,774,335.0,-1102,-138
...,...,...,...,...,...,...,...
915,WV,2015,20089,22757,1476.0,-5989,-56
916,WV,2016,19342,22458,1290.0,-9056,-15
917,WV,2017,18952,23139,592.0,-10139,-18
918,WV,2018,18589,23625,54.0,-7734,-14


In [7]:
# Create the signal to buy
grouped_df['Target'] = np.where(grouped_df['Net Domestic Migration'] >= 0, 'Yes', 'No')
grouped_df.loc[172:400]

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual,Target
172,GA,2012,131860,70456,22454.0,14974,1118,Yes
173,GA,2013,128689,74852,23207.0,-5842,810,No
174,GA,2014,129543,74843,17777.0,22919,216,Yes
175,GA,2015,131678,78566,26234.0,32650,153,Yes
176,GA,2016,131395,80067,35276.0,38504,-19,Yes
...,...,...,...,...,...,...,...,...
396,MI,2016,46137,40168,7580.0,-3922,-140,No
397,MI,2017,45680,41436,5727.0,4481,-154,Yes
398,MI,2018,44829,41943,4044.0,469,-69,Yes
399,MI,2019,43822,40889,2916.0,-1216,12,No


In [8]:
# Segment the features from the target
y = grouped_df['Target']
X = grouped_df.drop(['Target'], axis =1 )

X = pd.get_dummies(X, columns=['State','Year'])
X

Unnamed: 0,Births,Deaths,Net International Migration,Net Domestic Migration,Residual,State_AK,State_AL,State_AR,State_AZ,State_CA,...,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019
0,455,188,61.0,-497,-9,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1880,783,251.0,-2380,-108,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1873,736,157.0,-521,-107,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1871,770,-53.0,-1176,-119,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1791,774,335.0,-1102,-138,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,20089,22757,1476.0,-5989,-56,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
916,19342,22458,1290.0,-9056,-15,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
917,18952,23139,592.0,-10139,-18,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
918,18589,23625,54.0,-7734,-14,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y)
X_train

Unnamed: 0,Births,Deaths,Net International Migration,Net Domestic Migration,Residual,State_AK,State_AL,State_AR,State_AZ,State_CA,...,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019
8,1901,752,227.0,-1011,26,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
112,10549,7549,639.0,-3062,50,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
846,3298,2773,133.0,334,-207,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
240,9018,5581,992.0,-1169,76,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
116,10343,8256,579.0,9057,39,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,41782,28065,3719.0,8199,1462,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
611,1985,1993,-40.0,-1540,-2,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
761,79221,55475,10607.0,11673,6174,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
653,36155,35607,3597.0,-6664,-316,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [10]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled =  X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)
X_test_scaled

array([[ 0.16441495,  0.3808112 , -0.19610936, ..., -0.23116037,
        -0.18569534,  4.39250699],
       [-0.55717862, -0.74150166, -0.4325939 , ..., -0.23116037,
        -0.18569534, -0.22766042],
       [-0.59132994, -0.70757187, -0.4075839 , ..., -0.23116037,
        -0.18569534, -0.22766042],
       ...,
       [-0.75088539, -0.89526019, -0.4762407 , ..., -0.23116037,
         5.38516481, -0.22766042],
       [ 0.20074202,  0.89979572, -0.30025333, ..., -0.23116037,
        -0.18569534, -0.22766042],
       [-0.63876142, -0.77712795, -0.44669221, ..., -0.23116037,
        -0.18569534, -0.22766042]])

In [11]:
#X_train_scaled[:, 4] *= 0.25
#X_test_scaled[:, 4] *= 0.25
#X_train_scaled

In [12]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
naive_bayes_pred = gnb.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [13]:
# Print classification reports
print(classification_report(y_test, naive_bayes_pred))

# Create and save the confustion matrix for the training data
training_matrix = confusion_matrix(y_test, naive_bayes_pred)
# Print the confusion matrix for the training data
training_matrix

              precision    recall  f1-score   support

          No       0.83      0.73      0.78        97
         Yes       0.82      0.89      0.85       133

    accuracy                           0.82       230
   macro avg       0.82      0.81      0.81       230
weighted avg       0.82      0.82      0.82       230



array([[ 71,  26],
       [ 15, 118]])

In [14]:
naive_bayes_pred

array(['No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
       'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes',
       'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
       'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
       'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No',
       'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes',
       'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes',
       'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No

In [15]:
#predictions = gnb.predi

In [16]:
slice_year_prediction = slice_year_prediction.drop(['District Name','IBRC_Geo_ID'], axis=1)
slice_year_prediction

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
31,AK,2020,680,550,35.0,335,0
84,AK,2020,555,363,82.0,-600,-3
116,AK,2020,297,154,201.0,-775,-3
148,AL,2020,1819,2520,38.0,-2424,-15
180,AL,2020,4850,6716,74.0,283,9
...,...,...,...,...,...,...,...
12718,MO,2020,7551,6927,315.0,3461,-1
12750,AZ,2020,4976,5866,594.0,7537,-16
12782,CT,2020,9461,7423,3386.0,-7417,31
12814,NC,2020,2390,3136,51.0,2126,-10


In [17]:
grouped_df_pre = slice_year_prediction.groupby(['State', 'Year']).sum()
grouped_df_pre = grouped_df_pre.reset_index()

In [18]:
grouped_df_pre

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
0,AK,2020,1532,1067,318.0,-1040,-6
1,AL,2020,57302,58800,2077.0,9624,-26
2,AR,2020,36332,33832,1517.0,5550,-30
3,AZ,2020,18211,19006,1503.0,30253,-137
4,CA,2020,96677,75240,17172.0,-89207,-132
5,CO,2020,9506,9086,242.0,3061,-19
6,CT,2020,51404,49388,14570.0,-29677,161
7,FL,2020,219996,230396,78072.0,174645,-1061
8,GA,2020,125818,94638,13275.0,37563,-21
9,IA,2020,23857,21842,3478.0,-9015,-4


In [None]:
# Segment the features from the target
y_2020 = grouped_df_pre['Target']
X_2020 = grouped_df_pre.drop(['Target'], axis =1 )

X = pd.get_dummies(X, columns=['State','Year'])
X