In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Create a Path and Read csv file 
migration_df = pd.read_csv(Path('Resources/state_to_state.csv'))
migration_df

Unnamed: 0,IBRC_Geo_ID,State,District Name,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
0,6500001,AK,Kenai Peninsula Economic Development District,1990,195,55,3.0,155,0
1,6500001,AK,Kenai Peninsula Economic Development District,1991,766,146,18.0,800,-9
2,6500001,AK,Kenai Peninsula Economic Development District,1992,736,151,43.0,129,-15
3,6500001,AK,Kenai Peninsula Economic Development District,1993,703,194,42.0,70,-12
4,6500001,AK,Kenai Peninsula Economic Development District,1994,693,192,26.0,778,-24
...,...,...,...,...,...,...,...,...,...
12842,6500418,MT,Mission West Community Development Partners,2016,493,520,9.0,367,0
12843,6500418,MT,Mission West Community Development Partners,2017,528,535,29.0,872,2
12844,6500418,MT,Mission West Community Development Partners,2018,462,512,11.0,227,-3
12845,6500418,MT,Mission West Community Development Partners,2019,428,489,27.0,589,-5


In [3]:
# Group by the 'State' column
#grouped_by_state = migration_df.groupby('State')

# Calculate the mean
#by_state = grouped_by_state.sum()

# Slice and Drop columns
slice_year = migration_df[(migration_df['Year'] >= 1990) & (migration_df['Year'] <= 2024)]
slice_year = slice_year.drop(['District Name','IBRC_Geo_ID'], axis=1)
slice_year

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual
0,AK,1990,195,55,3.0,155,0
1,AK,1991,766,146,18.0,800,-9
2,AK,1992,736,151,43.0,129,-15
3,AK,1993,703,194,42.0,70,-12
4,AK,1994,693,192,26.0,778,-24
...,...,...,...,...,...,...,...
12842,MT,2016,493,520,9.0,367,0
12843,MT,2017,528,535,29.0,872,2
12844,MT,2018,462,512,11.0,227,-3
12845,MT,2019,428,489,27.0,589,-5


In [4]:
# Group by 'State' and 'Year' columns and calculate the mean
grouped_df = slice_year.groupby(['State', 'Year']).sum()
grouped_df = grouped_df.reset_index()

# Plot 
import hvplot.pandas
grouped_df.hvplot(groupby='State',
                  x='Year', 
                  y='Net Domestic Migration',
                  xlabel='Year', 
                  ylabel='Domestic Migration',
                  line_width=2.0,    
                  width=750,        
                  height=400,
                  title='Domestic Migration Change by State').opts(yformatter='%.0f',
                                      active_tools=[])

In [5]:
# Calculate mean of all States for the last 30 years
sum_by_state = grouped_df.groupby('State')['Net Domestic Migration'].mean().reset_index()

#Plot
sum_by_state.hvplot.bar(
    x='State',
    y='Net Domestic Migration',
    xlabel='State',
    ylabel='30_Year Average Domestic Migration',
    title='30_Year Average Migration by State',
    line_width=2.0,    
    width=850,        
    height=500        
).opts(yformatter='%.0f', active_tools=[])


In [6]:
# Create the target, of State populations either growing or declining
grouped_df['Target'] = np.where(grouped_df['Net Domestic Migration'] >= 0, 'Yes', 'No')
grouped_df.loc[172:400]

Unnamed: 0,State,Year,Births,Deaths,Net International Migration,Net Domestic Migration,Residual,Target
172,CO,2007,12128,7123,1565.0,9051,106,Yes
173,CO,2008,12369,7107,1503.0,8691,101,Yes
174,CO,2009,12498,7276,1541.0,7107,93,Yes
175,CO,2010,2798,1769,225.0,-1909,125,No
176,CO,2011,10862,7519,1526.0,-3862,36,No
...,...,...,...,...,...,...,...,...
396,IN,2014,32962,25828,1611.0,-4760,-275,No
397,IN,2015,33142,27072,1113.0,-9530,-116,No
398,IN,2016,32921,26560,2244.0,-9124,-50,No
399,IN,2017,32258,27790,1317.0,-4419,-74,No


In [7]:
# Segment the features from the target
y = grouped_df['Target']
X = grouped_df.drop(['Target'], axis =1 )

# Run get dummies to the specified column
X = pd.get_dummies(X, columns=['State','Year'])
X

Unnamed: 0,Births,Deaths,Net International Migration,Net Domestic Migration,Residual,State_AK,State_AL,State_AR,State_AZ,State_CA,...,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Year_2020
0,769,153,66.0,208,163,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2829,624,230.0,832,-199,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2726,606,245.0,-1259,-62,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2688,699,311.0,-156,-72,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2491,682,232.0,-3481,-119,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421,19342,22458,1290.0,-9056,-15,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1422,18952,23139,592.0,-10139,-18,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1423,18589,23625,54.0,-7734,-14,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1424,17839,22767,1185.0,-6929,-18,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [8]:
# Split data into default 75% Train and 25% Test 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y)
X_train

Unnamed: 0,Births,Deaths,Net International Migration,Net Domestic Migration,Residual,State_AK,State_AL,State_AR,State_AZ,State_CA,...,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Year_2020
969,80042,56226,10604.0,-38074,-848,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
616,45680,41436,5727.0,4481,-154,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
50,63211,47721,5202.0,7065,-258,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
759,121616,74541,23173.0,73070,-5149,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1053,50970,45232,2883.0,13450,34,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,3010,3390,419.0,-54,-20,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
420,36159,21053,4490.0,-2990,884,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
950,496,480,52.0,-342,21,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
646,18267,15817,1911.0,-889,-74,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [9]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled =  X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)
X_test_scaled

array([[-0.17862338, -0.17476913, -0.3009854 , ..., -0.18124636,
        -0.16698687, -0.18124636],
       [-0.43358624, -0.41651442, -0.30643294, ..., -0.18124636,
        -0.16698687,  5.51735231],
       [-0.09926167, -0.08387469, -0.22279228, ..., -0.18124636,
        -0.16698687, -0.18124636],
       ...,
       [-0.48859716, -0.49904891, -0.31869688, ..., -0.18124636,
        -0.16698687, -0.18124636],
       [ 2.36568538,  2.39700247,  0.02139709, ..., -0.18124636,
        -0.16698687, -0.18124636],
       [-0.212078  , -0.12571164, -0.30472883, ..., -0.18124636,
         5.98849472, -0.18124636]])

In [10]:
# Create the model, fit and predict 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
naive_bayes_pred = gnb.fit(X_train_scaled, y_train).predict(X_test_scaled)
# Print classification reports
print(classification_report(y_test, naive_bayes_pred))

# Create and save the confustion matrix for the training data
naive_bayes_matrix = confusion_matrix(y_test, naive_bayes_pred)

# Print the confusion matrix for the training data
naive_bayes_matrix

              precision    recall  f1-score   support

          No       0.61      0.95      0.74       164
         Yes       0.91      0.49      0.64       193

    accuracy                           0.70       357
   macro avg       0.76      0.72      0.69       357
weighted avg       0.78      0.70      0.69       357



array([[155,   9],
       [ 98,  95]])

In [11]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 25
knn = KNeighborsClassifier(n_neighbors=50)

# Train the model using the training data
knn.fit(X_train_scaled, y_train)

# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

# Create and save the confustion matrix for the training data
KNeighbors_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix for the training data
KNeighbors_matrix

              precision    recall  f1-score   support

          No       0.85      0.73      0.78       164
         Yes       0.79      0.89      0.84       193

    accuracy                           0.82       357
   macro avg       0.82      0.81      0.81       357
weighted avg       0.82      0.82      0.81       357



array([[119,  45],
       [ 21, 172]])

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate a DecisionTreeClassifier instance
dt_classifier = DecisionTreeClassifier()

# Train the Decision Tree classifier on the scaled training data
dt_classifier.fit(X_train_scaled, y_train)

# Make predictions on the testing data
dt_pred = dt_classifier.predict(X_test_scaled)

# Print classification reports
print("Classification Report for Decision Tree Classifier:")
print(classification_report(y_test, dt_pred))

# Create and save the confusion matrix for the testing data
conf_matrix_dt = confusion_matrix(y_test, dt_pred)
print("Confusion Matrix for Decision Tree Classifier:")
print(conf_matrix_dt)

# Evaluate accuracy
accuracy_dt = accuracy_score(y_test, dt_pred)
print("Accuracy Score for Decision Tree Classifier:", accuracy_dt)

Classification Report for Decision Tree Classifier:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00       164
         Yes       1.00      1.00      1.00       193

    accuracy                           1.00       357
   macro avg       1.00      1.00      1.00       357
weighted avg       1.00      1.00      1.00       357

Confusion Matrix for Decision Tree Classifier:
[[164   0]
 [  0 193]]
Accuracy Score for Decision Tree Classifier: 1.0


In [13]:
# Creating individual DataFrames for each state based
state_dfs = {}
for state in grouped_df['State'].unique():
    state_df = grouped_df[grouped_df['State'] == state][['Year', 'Net Domestic Migration']]
    state_df['Percentage Change'] = state_df['Net Domestic Migration'].pct_change() * 100
    state_dfs[state] = state_df

for state, state_df in state_dfs.items():
    locals()[f"{state}_df"] = state_df

In [16]:
CA_df.hvplot(x='Year', 
                y='Percentage Change',
                xlabel='Year', 
                ylabel='Percentage Change',
                title='Migration Percentage Change by State').opts(yformatter='%.0f',
                                      active_tools=[])