In [1]:
import os

import numpy as np
import pandas as pd
import pymssql
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Connection settings for the capstone SQL Server database.
# Each value can be overridden with an environment variable so credentials do
# not have to live in the notebook; the literals are kept only as fallbacks so
# the notebook behaves exactly as before when the variables are unset.
database = os.environ.get("ARCTIC_DB_NAME", "arctic_analysts_capstone")
user = os.environ.get("ARCTIC_DB_USER", "arctic_analysts")
# FIXME(security): this fallback is a real password committed to source.
# Rotate it, set ARCTIC_DB_PASSWORD instead, and delete the literal.
password = os.environ.get("ARCTIC_DB_PASSWORD", "ThisPassw0rd!")
server = os.environ.get(
    "ARCTIC_DB_SERVER",
    "gen10-data-fundamentals-22-02-sql-server.database.windows.net",
)

def sql_query(query):
    """Run ``query`` against the configured database and return the result.

    A new connection is opened for every call using the module-level
    ``server``, ``user``, ``password`` and ``database`` settings.

    Parameters
    ----------
    query : str
        SQL text handed to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The rows produced by the query.
    """
    conn = pymssql.connect(server, user, password, database)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, including when the query raises;
        # previously every call left its connection open.
        conn.close()

In [2]:
def run_queries():
    """Load every table used by the notebook with a ``SELECT *`` each.

    Returns
    -------
    list
        The ``sql_query`` results in the fixed order
        ``[year, month, county, median_income, main_table]``.
    """
    table_names = ("year", "month", "county", "median_income", "main_table")
    # One query per table, issued in the order listed above.
    return [sql_query(f"SELECT * FROM {name}") for name in table_names]

In [3]:
def join_tables(all_df):
    """Outer-join the queried tables into a single master table.

    Parameters
    ----------
    all_df : sequence of pandas.DataFrame
        Frames in the order ``[year, month, county, median_income,
        main_table]``.

    Returns
    -------
    pandas.DataFrame
        ``main_table`` joined with the year, month and county lookups and
        then with the median-income table on ``(FIPS, YearID)``. Negative
        ``MedianIncome`` values are replaced with missing values.

    Notes
    -----
    The row count is printed after each of the four joins. In the recorded
    run it stays at 5607 through the three lookup joins and grows to 20727
    after the income join (presumably one income row per age group - verify).
    """
    year_df, month_df, county_df, median_income_df, main_table = (
        all_df[position] for position in range(5)
    )

    # Single-key lookup joins, applied in this order.
    lookups = (
        (year_df, "YearID"),
        (month_df, "MonthID"),
        (county_df, "FIPS"),
    )
    master_table = main_table
    for lookup_df, key in lookups:
        master_table = master_table.merge(lookup_df, on=key, how="outer")
        print(len(master_table))

    # Income is keyed by county and year together.
    master_table = master_table.merge(
        median_income_df, on=["FIPS", "YearID"], how="outer"
    )
    print(len(master_table))

    # Negative incomes are treated as missing.
    negative_income = master_table["MedianIncome"] < 0
    master_table.loc[negative_income, "MedianIncome"] = None
    return master_table

In [4]:
# Pull every table from the database, then join them into one wide frame.
# join_tables prints the row count after each of its four merges.
all_df = run_queries()
master_table = join_tables(all_df)

5607
5607
5607
20727


In [5]:
# Inspect the rows of the master table that belong to the 25-44 age group.
master_table.loc[master_table['AgeGroup'] == '25-44']


Unnamed: 0,FIPS,YearID,MonthID,NewUnits,NewBuildings,MedianHousePrice,AverageRate,AveragePoints,Year,Month,County,AgeGroup,MedianIncome
60,34001,6,1,107,103,249344.0,5.71,0.7,2005,Jan,Atlantic County,25-44,52040.0
65,34001,6,2,129,129,252209.0,5.63,0.7,2005,Feb,Atlantic County,25-44,52040.0
70,34001,6,3,165,162,254847.0,5.93,0.7,2005,Mar,Atlantic County,25-44,52040.0
75,34001,6,4,184,163,258515.0,5.86,0.6,2005,Apr,Atlantic County,25-44,52040.0
80,34001,6,5,135,134,262680.0,5.72,0.6,2005,May,Atlantic County,25-44,52040.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20675,34041,20,8,12,8,247344.0,3.62,0.5,2019,Aug,Warren County,25-44,86705.0
20680,34041,20,9,10,6,246153.0,3.61,0.5,2019,Sep,Warren County,25-44,86705.0
20685,34041,20,10,19,17,246000.0,3.69,0.6,2019,Oct,Warren County,25-44,86705.0
20690,34041,20,11,15,14,246880.0,3.70,0.6,2019,Nov,Warren County,25-44,86705.0


In [6]:
# Keep only the 2020-2022 rows of the master table (filtering only; the
# per-county yearly aggregation happens in the next cell).
forecast_years = [2020, 2021, 2022]
predicted_years = master_table[master_table['Year'].isin(forecast_years)]
predicted_years


Unnamed: 0,FIPS,YearID,MonthID,NewUnits,NewBuildings,MedianHousePrice,AverageRate,AveragePoints,Year,Month,County,AgeGroup,MedianIncome
960,34001,21,1,82,49,213953.0,3.62,0.7,2020,Jan,Atlantic County,,
961,34001,21,2,54,42,215292.0,3.47,0.7,2020,Feb,Atlantic County,,
962,34001,21,3,46,31,216287.0,3.45,0.7,2020,Mar,Atlantic County,,
963,34001,21,4,56,48,217516.0,3.31,0.7,2020,Apr,Atlantic County,,
964,34001,21,5,26,19,219130.0,3.23,0.7,2020,May,Atlantic County,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20722,34041,22,11,26,14,334266.0,3.07,0.7,2021,Nov,Warren County,,
20723,34041,22,12,19,18,336388.0,3.10,0.6,2021,Dec,Warren County,,
20724,34041,23,1,13,12,338845.0,3.45,0.7,2022,Jan,Warren County,,
20725,34041,23,2,29,18,340046.0,3.76,0.8,2022,Feb,Warren County,,


In [7]:
# Average the monthly housing figures per county (FIPS) and year.
feature_columns = ['FIPS','Year','MedianHousePrice','AverageRate','AveragePoints']
predicted_df = predicted_years[feature_columns]
new_df = predicted_df.groupby(['FIPS','Year']).mean()
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,MedianHousePrice,AverageRate,AveragePoints
FIPS,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34001,2020,225222.000000,3.114167,0.733333
34001,2021,270244.500000,2.956667,0.691667
34001,2022,303865.666667,3.793333,0.766667
34003,2020,528979.500000,3.114167,0.733333
34003,2021,587393.333333,2.956667,0.691667
...,...,...,...,...
34039,2021,468329.166667,2.956667,0.691667
34039,2022,503446.333333,3.793333,0.766667
34041,2020,261353.666667,3.114167,0.733333
34041,2021,311226.583333,2.956667,0.691667


In [10]:
# Flatten the (FIPS, Year) index of new_df back into ordinary columns.
# This replaces a row-by-row rebuild that rebound the builtin name `list`
# at notebook scope, which breaks any later `list(...)` call in the kernel.
columns = ['FIPS', 'Year', 'MedianHousePrice', 'AverageRate','AveragePoints']

# FIPS is cast to a 64-bit integer so it can be merged with the integer FIPS
# column of the income predictions; 'int64' is spelled out because plain
# `int` maps to 32 bits on some platforms.
df_next_years = new_df.reset_index().astype({'FIPS': 'int64'})[columns]
df_next_years


Unnamed: 0,FIPS,Year,MedianHousePrice,AverageRate,AveragePoints
0,34001,2020,225222.000000,3.114167,0.733333
1,34001,2021,270244.500000,2.956667,0.691667
2,34001,2022,303865.666667,3.793333,0.766667
3,34003,2020,528979.500000,3.114167,0.733333
4,34003,2021,587393.333333,2.956667,0.691667
...,...,...,...,...,...
58,34039,2021,468329.166667,2.956667,0.691667
59,34039,2022,503446.333333,3.793333,0.766667
60,34041,2020,261353.666667,3.114167,0.733333
61,34041,2021,311226.583333,2.956667,0.691667


In [46]:
# reading in predictions
df = pd.read_csv('PredictedMedianIncome_Round_2.csv')
df = df[df['AgeGroup'] == '25-44']

# Forecast rows have no FIPS of their own. For each forecast year the FIPS is
# read from the row this many index labels earlier (presumably the last
# training row of the same county - verify against the CSV layout).
fips_lookback = {2020: 1, 2021: 2, 2022: 3}

# A descriptive name is used here instead of rebinding the builtin `list`.
prediction_rows = []
for row in df.index:
    year = df.loc[row, 'Year']
    # Only rows with a missing FIPS and a forecast year are predictions.
    if not pd.isna(df.loc[row, 'FIPS']) or year not in fips_lookback:
        continue
    prediction_rows.append([
        year,
        int(df.loc[row - fips_lookback[year], 'FIPS']),
        df.loc[row, 'AgeGroup'],
        int(df.loc[row, 'train_and_predicted']),
    ])

columns = ['Year', 'FIPS', 'AgeGroup', 'PredictedIncome']

# df is already restricted to the 25-44 age group, so AgeGroup is constant
# here and can be dropped without filtering again.
income_predictions = pd.DataFrame(data=prediction_rows, columns=columns).drop('AgeGroup', axis=1)
income_predictions


Unnamed: 0,Year,FIPS,PredictedIncome
0,2020,34001,66938
1,2021,34001,68852
2,2022,34001,70736
3,2020,34003,122791
4,2021,34003,119151
5,2022,34003,118553
6,2020,34005,91828
7,2021,34005,92656
8,2022,34005,93484
9,2020,34007,78244


In [34]:
# Distinct county codes present in the aggregated housing data.
counties_list = df_next_years['FIPS'].unique()
len(counties_list)

21

In [35]:
# Distinct county codes present in the income predictions.
income_counties_list = income_predictions['FIPS'].unique()
len(income_counties_list)

20

In [37]:
# County codes that have housing data but no income prediction.
np.setdiff1d(counties_list, income_counties_list)

array([34035], dtype=int64)

In [45]:
# Look up the county name for FIPS 34035, the code missing from the income
# predictions. FIPS is compared as a string here, unlike the integer FIPS
# columns built for df_next_years and income_predictions.
main_counties_list = master_table[['FIPS','County']].drop_duplicates()
main_counties_list.loc[main_counties_list['FIPS'] == '34035']

Unnamed: 0,FIPS,County
16779,34035,Somerset County


In [51]:
# Attach the predicted income to the yearly housing figures. The inner join
# keeps only (Year, FIPS) pairs present in both frames.
main_predictions = df_next_years.merge(income_predictions, on=['Year','FIPS'], how='inner')
main_predictions

Unnamed: 0,FIPS,Year,MedianHousePrice,AverageRate,AveragePoints,PredictedIncome
0,34001,2020,225222.0,3.114167,0.733333,66938
1,34001,2021,270244.5,2.956667,0.691667,68852
2,34001,2022,303865.666667,3.793333,0.766667,70736
3,34003,2020,528979.5,3.114167,0.733333,122791
4,34003,2021,587393.333333,2.956667,0.691667,119151
5,34003,2022,622556.0,3.793333,0.766667,118553
6,34005,2020,258670.0,3.114167,0.733333,91828
7,34005,2021,299478.166667,2.956667,0.691667,92656
8,34005,2022,327865.0,3.793333,0.766667,93484
9,34007,2020,201420.083333,3.114167,0.733333,78244
