In [1]:
import pymssql
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:

database = "arctic_analysts_capstone"
user = "arctic_analysts"
# NOTE(review): this password is stored in plain text in the notebook. Move it to an
# environment variable or secrets store and rotate it before sharing this file.
password = "ThisPassw0rd!"
server = "gen10-data-fundamentals-22-02-sql-server.database.windows.net"

def sql_query(query):
    """Run ``query`` against the configured SQL Server database.

    A new connection is opened for every call and is always closed before the
    function returns, including when the query raises.

    Parameters
    ----------
    query : str
        SQL text, passed unchanged to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.

    Any exception raised while connecting or querying propagates to the caller.
    """
    conn = pymssql.connect(server, user, password, database)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Release the server-side connection even if read_sql fails.
        conn.close()

In [3]:
def run_queries():
    """Fetch every source table in full and return them as a list.

    Returns
    -------
    list of pandas.DataFrame
        One frame per table, in this fixed order:
        ``year``, ``month``, ``county``, ``median_income``, ``main_table``.
    """
    table_names = ("year", "month", "county", "median_income", "main_table")
    # One `SELECT *` per table, issued in the order listed above.
    return [sql_query(f"SELECT * FROM {name}") for name in table_names]

In [44]:
def join_tables(all_df):
    """Outer-join the five source tables into a single master table.

    Parameters
    ----------
    all_df : sequence of pandas.DataFrame
        Frames in the order ``year, month, county, median_income, main_table``.

    Returns
    -------
    pandas.DataFrame
        ``main_table`` joined with year (on ``YearID``), month (on ``MonthID``),
        county (on ``FIPS``) and median income (on ``FIPS`` + ``YearID``).
        Negative ``MedianIncome`` values are replaced with missing values.

    The row count is printed after each merge as a sanity check.
    """
    year_df, month_df, county_df, median_income_df, main_table = (
        all_df[position] for position in range(5)
    )

    # (right-hand table, join key) pairs, applied in order.
    join_plan = (
        (year_df, "YearID"),
        (month_df, "MonthID"),
        (county_df, "FIPS"),
        # Median income exists per county and year, so this join multiplies the rows.
        (median_income_df, ["FIPS", "YearID"]),
    )

    master_table = main_table
    for right_df, keys in join_plan:
        master_table = pd.merge(master_table, right_df, on=keys, how="outer")
        print(master_table.shape[0])

    # Negative incomes are treated as missing rather than as real values.
    negative_income = master_table["MedianIncome"] < 0
    master_table.loc[negative_income, "MedianIncome"] = None
    return master_table

In [77]:
# Pull the five source tables from SQL Server, then outer-join them into one frame.
# join_tables prints the row count after each merge (the four numbers shown below).
all_df = run_queries()
master_table = join_tables(all_df)

5607
5607
5607
20727


In [78]:
# Summary statistics of the joined table, used as a quick sanity check of the merge.
master_table.describe()

Unnamed: 0,YearID,MonthID,NewUnits,NewBuildings,MedianHousePrice,AverageRate,AveragePoints,Year,MedianIncome
count,20727.0,20727.0,20727.0,20727.0,20706.0,20727.0,20727.0,20727.0,18888.0
mean,12.629179,6.486322,96.352921,50.717615,320840.295711,4.782736,0.612563,2011.629179,63866.022872
std,5.015223,3.456069,105.821928,48.076455,108387.308961,1.111331,0.12131,5.015223,26138.573168
min,1.0,1.0,0.0,0.0,96963.0,2.68,0.4,2000.0,2499.0
25%,9.0,3.0,30.0,19.0,231949.0,3.91,0.5,2008.0,43918.0
50%,13.0,6.0,63.0,37.0,323147.0,4.46,0.6,2012.0,61402.0
75%,17.0,9.0,127.0,67.0,410766.25,5.82,0.7,2016.0,81849.0
max,23.0,12.0,1174.0,696.0,638470.0,8.52,1.0,2022.0,152424.0


In [99]:
# Number of rows per AgeGroup value in the joined table.
master_table["AgeGroup"].value_counts()

25-44       3780
45-64       3780
65-plus     3780
overall     3780
under-25    3780
Name: AgeGroup, dtype: int64

In [105]:
# Distinct years for which the joined table has rows with no AgeGroup value.
master_table.loc[master_table["AgeGroup"].isna(), "Year"].unique()

array([2000, 2001, 2002, 2003, 2004, 2020, 2021, 2022], dtype=int64)

### Explanation
**What is happening?**
1. We need the main table, which holds all of the observed data.
2. We also need our predicted income data.
3. We are only interested in the years 2020 through 2022.
**PLAN**
> 1. Filter the main table
> 2. Filter the predicted table
> 3. Combine the two tables.

**Note**
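> The predicted data is yearly, whereas the main data is monthly, so each yearly prediction is repeated on every monthly row of that year once the tables are combined.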

In [129]:
# Read the predicted-income file.
path = 'PredictedIncomeFinal.csv'
df = pd.read_csv(path)

# Wherever `train_and_predicted` has a value it replaces `MedianIncome`; rows without
# one keep their existing `MedianIncome`. This is assigned back explicitly because a
# chained in-place call (`df['MedianIncome'].update(...)`) acts on a temporary Series
# and does not modify `df` when pandas Copy-on-Write is active.
df['MedianIncome'] = df['train_and_predicted'].combine_first(df['MedianIncome'])

# Keep 2020-2022 only, store FIPS as text, and convert the yearly income into a
# monthly figure (yearly / 12); the yearly column is not kept.
cleaned_predictions = (
    df[(df.Year > 2019) & (df.Year < 2023)]
    .drop(columns=['train_and_predicted'])
    .assign(
        FIPS=lambda frame: frame['FIPS'].astype('str'),
        MonthlyIncome=lambda frame: frame['MedianIncome'] / 12,
    )
    .drop(columns=['MedianIncome'])
)
cleaned_predictions.head(3)

Unnamed: 0,Year,FIPS,AgeGroup,MonthlyIncome
15,2020,34001,25-44,5400.5294
16,2021,34001,25-44,5431.095149
17,2022,34001,25-44,5429.162613


In [130]:
# Restrict the master table to the 2020-2022 window and to the columns needed for
# the affordability calculation.
target_df = master_table.loc[
    (master_table['Year'] > 2019) & (master_table['Year'] < 2023),
    ['FIPS', 'Year', 'AverageRate', 'AveragePoints', 'County', 'MedianHousePrice'],
]
target_df

Unnamed: 0,FIPS,Year,AverageRate,AveragePoints,County,MedianHousePrice
960,34001,2020,3.62,0.7,Atlantic County,213953.0
961,34001,2020,3.47,0.7,Atlantic County,215292.0
962,34001,2020,3.45,0.7,Atlantic County,216287.0
963,34001,2020,3.31,0.7,Atlantic County,217516.0
964,34001,2020,3.23,0.7,Atlantic County,219130.0
...,...,...,...,...,...,...
20722,34041,2021,3.07,0.7,Warren County,334266.0
20723,34041,2021,3.10,0.6,Warren County,336388.0
20724,34041,2022,3.45,0.7,Warren County,338845.0
20725,34041,2022,3.76,0.8,Warren County,340046.0


In [131]:
# Combine predicted income with the observed rows for the same year and county.
# The outer join keeps rows that appear on only one side.
merged_tables = cleaned_predictions.merge(target_df, on=['Year', 'FIPS'], how='outer')
merged_tables

Unnamed: 0,Year,FIPS,AgeGroup,MonthlyIncome,AverageRate,AveragePoints,County,MedianHousePrice
0,2020,34001,25-44,5400.529400,3.62,0.7,Atlantic County,213953.0
1,2020,34001,25-44,5400.529400,3.47,0.7,Atlantic County,215292.0
2,2020,34001,25-44,5400.529400,3.45,0.7,Atlantic County,216287.0
3,2020,34001,25-44,5400.529400,3.31,0.7,Atlantic County,217516.0
4,2020,34001,25-44,5400.529400,3.23,0.7,Atlantic County,219130.0
...,...,...,...,...,...,...,...,...
2182,2022,34017,overall,7678.823381,3.76,0.8,Hudson County,566691.0
2183,2022,34017,overall,7678.823381,4.17,0.8,Hudson County,573799.0
2184,2022,34017,under-25,5238.154591,3.45,0.7,Hudson County,563176.0
2185,2022,34017,under-25,5238.154591,3.76,0.8,Hudson County,566691.0


#### Bringing in Hans's Calculation Code

In [134]:
# CALCULATIONS BASED ON 12% DOWNPAYMENT

# Monthly income is not recomputed here: merged_tables already carries a
# MonthlyIncome column.
#final_table['MonthlyIncome'] = final_table['MedianIncome']/12

# Calculate the monthly mortgage payment:
#   payment = P * (r/n) * (1 + r/n)**(n*t) / ((1 + r/n)**(n*t) - 1)
# https://www.educba.com/mortgage-formula/

final_table = merged_tables.copy()

# No loop is needed: every expression below operates on whole columns, so one pass
# computes the value for every row. (Iterating a DataFrame yields its column labels,
# so a `for ... in final_table` loop would only repeat the same work per column.)
P = final_table['MedianHousePrice']-(final_table['MedianHousePrice']*.12)  # price minus the 12% down payment
r = (final_table['AverageRate']/100)  # AverageRate is treated as a percentage
t = 30  # used with n as the number of payments: n*t
n = 12  # rate is divided by n to get the per-payment rate
# 1.89% of the house price per year, spread over 12 months
# (presumably property tax, going by the variable name - confirm the rate's source).
monthly_tax = (final_table['MedianHousePrice']*.0189)/12

periodic_rate = r/n
growth = pow((1+periodic_rate), (n*t))
final_table['MonthlyMortgage'] = (P * ((periodic_rate * growth) / (growth-1))) + monthly_tax


# mortgage to income ratio
final_table['mortgage_income_ratio'] = final_table['MonthlyMortgage']/final_table['MonthlyIncome']

#affordability determination
def affordable_condition(x):
    """Label a mortgage-to-income ratio.

    Returns 'Yes' when ``x`` is at most 0.25, 'Missing' when ``x`` is NaN,
    and 'No' for every other value.
    """
    if x <= .25:
        return 'Yes'
    # NaN fails the comparison above, so it is told apart from a real "No" here.
    return 'Missing' if np.isnan(x) else 'No'

# Label every row from its mortgage-to-income ratio.
final_table['affordable'] = final_table['mortgage_income_ratio'].map(affordable_condition)
final_table

Unnamed: 0,Year,FIPS,AgeGroup,MonthlyIncome,AverageRate,AveragePoints,County,MedianHousePrice,MonthlyMortgage,mortgage_income_ratio,affordable
0,2020,34001,25-44,5400.529400,3.62,0.7,Atlantic County,213953.0,1195.093395,0.221292,Yes
1,2020,34001,25-44,5400.529400,3.47,0.7,Atlantic County,215292.0,1186.661787,0.219731,Yes
2,2020,34001,25-44,5400.529400,3.45,0.7,Atlantic County,216287.0,1190.026862,0.220354,Yes
3,2020,34001,25-44,5400.529400,3.31,0.7,Atlantic County,217516.0,1181.950169,0.218858,Yes
4,2020,34001,25-44,5400.529400,3.23,0.7,Atlantic County,219130.0,1182.242012,0.218912,Yes
...,...,...,...,...,...,...,...,...,...,...,...
2182,2022,34017,overall,7678.823381,3.76,0.8,Hudson County,566691.0,3204.871233,0.417365,No
2183,2022,34017,overall,7678.823381,4.17,0.8,Hudson County,573799.0,3364.158642,0.438109,No
2184,2022,34017,under-25,5238.154591,3.45,0.7,Hudson County,563176.0,3098.635461,0.591551,No
2185,2022,34017,under-25,5238.154591,3.76,0.8,Hudson County,566691.0,3204.871233,0.611832,No


In [135]:
# Number of rows per affordability label.
final_table['affordable'].value_counts()

No         1299
Yes         807
Missing      81
Name: affordable, dtype: int64

In [144]:
# Number of rows per AgeGroup value after the merge.
final_table['AgeGroup'].value_counts()

25-44       513
overall     486
under-25    459
45-64       378
65-plus     351
Name: AgeGroup, dtype: int64

In [145]:
#final_table.to_csv('affordability_results.csv', index = False)

In [151]:
# Collapse the monthly rows to one row per year, county and age group by averaging
# house price, monthly income and monthly mortgage.
final_annual_df = (
    final_table
    .groupby(['Year', 'FIPS', 'AgeGroup', 'County'])[['MedianHousePrice', 'MonthlyIncome', 'MonthlyMortgage']]
    .mean()
    .reset_index()
)

# The ratio and label are recomputed from the averaged columns.
final_annual_df['mortgage_income_ratio'] = final_annual_df['MonthlyMortgage'].div(final_annual_df['MonthlyIncome'])
final_annual_df['affordable'] = final_annual_df['mortgage_income_ratio'].map(affordable_condition)
final_annual_df

In [155]:
# Number of rows per affordability label in the annual table.
# This cell previously read `monthly_final_table`, a name that no visible cell
# defines, so it failed on a fresh kernel. Presumably that was an earlier name for
# `final_annual_df` (the recorded counts are consistent with it) - confirm.
final_annual_df['affordable'].value_counts()

No         152
Yes         82
Missing      9
Name: affordable, dtype: int64

# Jed is working above this cell
### The cells below are the original version. I have slightly adjusted them, so they will not work as intended.

In [133]:
# Years 2020-2022, averaged to one row per county and year.
# NOTE: this cell reuses the names `target_df`, `df` and `cleaned_predictions` that
# the monthly analysis cells also assign, so it replaces those objects.
predicted_years = master_table[master_table['Year'].isin([2020, 2021, 2022])]
target_df = (
    predicted_years[['FIPS','Year','YearID','County','MedianHousePrice','AverageRate','AveragePoints']]
    .groupby(by=['FIPS','Year','County'])
    .agg('mean')
    .reset_index()
)
target_df['FIPS'] = target_df['FIPS'].astype('str')


# Read the predicted-income file.
path = 'PredictedIncomeFinal.csv'
df = pd.read_csv(path)

# Wherever `train_and_predicted` has a value it replaces `MedianIncome`; rows without
# one keep their existing `MedianIncome`. Assigned back explicitly because a chained
# in-place call (`df['MedianIncome'].update(...)`) acts on a temporary Series and
# does not modify `df` when pandas Copy-on-Write is active.
df['MedianIncome'] = df['train_and_predicted'].combine_first(df['MedianIncome'])

# Unlike the monthly analysis, this variant keeps 2020-2023 and keeps MedianIncome.
cleaned_predictions = df[(df.Year > 2019) & (df.Year < 2024)].drop(columns = ['train_and_predicted'])
cleaned_predictions['FIPS'] = cleaned_predictions['FIPS'].astype('str')
cleaned_predictions.head(3)


# list = []
# for row in df.index:
#     values_list = []
#     if pd.isna(df['FIPS'][row]) & (df['Year'][row] == 2020):
#         values_list.append(df['Year'][row])
#         values_list.append(int(df['FIPS'][row-1]))
#         values_list.append(df['AgeGroup'][row])
#         values_list.append(int(df['train_and_predicted'][row]))
#     elif pd.isna(df['FIPS'][row]) & (df['Year'][row] == 2021):
#         values_list.append(df['Year'][row])
#         values_list.append(int(df['FIPS'][row-2]))
#         values_list.append(df['AgeGroup'][row])
#         values_list.append(int(df['train_and_predicted'][row]))
#     elif pd.isna(df['FIPS'][row]) & (df['Year'][row] == 2022):
#         values_list.append(df['Year'][row])
#         values_list.append(int(df['FIPS'][row-3]))        
#         values_list.append(df['AgeGroup'][row])
#         values_list.append(int(df['train_and_predicted'][row]))
#     if len(values_list) > 0:
#         list.append(values_list)

In [75]:
# Prediction rows whose MedianIncome is missing.
# Requires the `cleaned_predictions` variant that still has a MedianIncome column
# (the one filtered with Year < 2024), not the variant that replaces it with
# MonthlyIncome.
cleaned_predictions[cleaned_predictions['MedianIncome'].isna()]

Unnamed: 0,Year,FIPS,MedianIncome,AgeGroup
240,2020,34021,,25-44
241,2021,34021,,25-44
242,2022,34021,,25-44
243,2023,34021,,25-44
865,2020,34003,,65-plus
866,2021,34003,,65-plus
867,2022,34003,,65-plus
868,2023,34003,,65-plus
1515,2020,34033,,overall
1516,2021,34033,,overall


In [61]:
# columns = ['Year', 'FIPS', 'AgeGroup', 'MedianIncome']

# income_predictions = pd.DataFrame(data=list,columns=columns)

# Attach the income predictions to the yearly county aggregates. The inner join keeps
# only (Year, FIPS) pairs that are present in both tables.
main_predictions = target_df.merge(cleaned_predictions, how='inner', on=['Year', 'FIPS'])
main_predictions

Unnamed: 0,FIPS,Year,County,YearID,MedianHousePrice,AverageRate,AveragePoints,MedianIncome,AgeGroup
0,34001,2020,Atlantic County,21.0,225222.000000,3.114167,0.733333,64806.352802,25-44
1,34001,2020,Atlantic County,21.0,225222.000000,3.114167,0.733333,87133.999209,45-64
2,34001,2020,Atlantic County,21.0,225222.000000,3.114167,0.733333,46802.105007,65-plus
3,34001,2021,Atlantic County,22.0,270244.500000,2.956667,0.691667,65173.141791,25-44
4,34001,2021,Atlantic County,22.0,270244.500000,2.956667,0.691667,93862.889829,45-64
...,...,...,...,...,...,...,...,...,...
238,34041,2021,Warren County,22.0,311226.583333,2.956667,0.691667,98532.990771,overall
239,34041,2021,Warren County,22.0,311226.583333,2.956667,0.691667,39347.604590,under-25
240,34041,2022,Warren County,23.0,340601.000000,3.793333,0.766667,69144.780119,25-44
241,34041,2022,Warren County,23.0,340601.000000,3.793333,0.766667,105559.985892,overall


In [62]:
# I don't think this is accurately bringing the data in
#########################################################
# # adding income predictions to main table
# final_table = master_table.dropna()
# #final_table['FIPS'] = final_table['FIPS'].astype(int)
# final_table = final_table[['FIPS','Year','YearID','MonthID','MedianHousePrice', 'AverageRate', 'AveragePoints',
#        'MedianIncome', 'County', 'AgeGroup']]
# final_table = pd.concat([final_table,main_predictions])

In [None]:
# NOTE(review): this cell has no execution count (`In [None]`) and passes `left_on`
# without a matching `right_on`/`right_index`, which pandas' merge does not accept,
# so it is expected to raise as written. Presumably `on=['Year', 'FIPS']` was meant -
# confirm the intended join, and how the columns shared by both frames should be
# handled, before relying on this cell.
final_table = pd.merge(main_predictions, master_table, left_on = ['Year', 'FIPS'])

In [59]:
# CALCULATIONS BASED ON 12% DOWNPAYMENT

# Calculate monthly income from the yearly median income.
final_table['MonthlyIncome'] = final_table['MedianIncome']/12

# Calculate the monthly mortgage payment:
#   payment = P * (r/n) * (1 + r/n)**(n*t) / ((1 + r/n)**(n*t) - 1)
# https://www.educba.com/mortgage-formula/

# No loop is needed: every expression below operates on whole columns, so one pass
# computes the value for every row. (Iterating a DataFrame yields its column labels,
# so a `for ... in final_table` loop would only repeat the same work per column.)
P = final_table['MedianHousePrice']-(final_table['MedianHousePrice']*.12)  # price minus the 12% down payment
r = (final_table['AverageRate']/100)  # AverageRate is treated as a percentage
t = 30  # used with n as the number of payments: n*t
n = 12  # rate is divided by n to get the per-payment rate
# 1.89% of the house price per year, spread over 12 months
# (presumably property tax, going by the variable name - confirm the rate's source).
monthly_tax = (final_table['MedianHousePrice']*.0189)/12

periodic_rate = r/n
growth = pow((1+periodic_rate), (n*t))
final_table['MonthlyMortgage'] = (P * ((periodic_rate * growth) / (growth-1))) + monthly_tax


# mortgage to income ratio
final_table['mortgage_income_ratio'] = final_table['MonthlyMortgage']/final_table['MonthlyIncome']

#affordability determination
def affordable_condition(x):
    """Label a mortgage-to-income ratio.

    Returns 'Yes' when ``x`` is at most 0.25, 'Missing' when ``x`` is NaN,
    and 'No' for every other value.

    NOTE: the notebook defines this function in two cells; whichever cell ran last
    is the definition in effect.
    """
    if x <= .25:
        return 'Yes'
    # NaN fails the comparison above, so it is told apart from a real "No" here.
    return 'Missing' if np.isnan(x) else 'No'

# Label every row from its mortgage-to-income ratio.
final_table['affordable'] = final_table['mortgage_income_ratio'].map(affordable_condition)
final_table

Unnamed: 0,FIPS,Year,YearID,MonthID,MedianHousePrice,AverageRate,AveragePoints,MedianIncome,County,AgeGroup,MonthlyIncome,MonthlyMortgage,mortgage_income_ratio,affordable
60,34001,2005,6.0,1.0,249344.000000,5.710000,0.700000,52040.000000,Atlantic County,25-44,4336.666667,1667.638046,0.384544,No
61,34001,2005,6.0,1.0,249344.000000,5.710000,0.700000,62816.000000,Atlantic County,45-64,5234.666667,1667.638046,0.318576,No
62,34001,2005,6.0,1.0,249344.000000,5.710000,0.700000,30697.000000,Atlantic County,65-plus,2558.083333,1667.638046,0.651909,No
63,34001,2005,6.0,1.0,249344.000000,5.710000,0.700000,50377.000000,Atlantic County,overall,4198.083333,1667.638046,0.397238,No
64,34001,2005,6.0,1.0,249344.000000,5.710000,0.700000,28074.000000,Atlantic County,under-25,2339.500000,1667.638046,0.712818,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,34041,2021,22.0,,311226.583333,2.956667,0.691667,98532.990771,Warren County,overall,8211.082564,1638.477538,0.199545,Yes
239,34041,2021,22.0,,311226.583333,2.956667,0.691667,39347.604590,Warren County,under-25,3278.967049,1638.477538,0.499693,No
240,34041,2022,23.0,,340601.000000,3.793333,0.766667,69144.780119,Warren County,25-44,5762.065010,1931.918066,0.335282,No
241,34041,2022,23.0,,340601.000000,3.793333,0.766667,105559.985892,Warren County,overall,8796.665491,1931.918066,0.219619,Yes


In [24]:
# IF NEEDED
# FILTERING OUT SOMERSET BECAUSE WE DON'T HAVE INCOME PREDICTIONS FOR 2020-2022

# excluded_list = ['Somerset County']
# counties = master_table['County'].unique()
# included_counties = np.setdiff1d(counties, excluded_list)
# final_table = final_table[final_table['County'].isin(included_counties)]
# final_table

In [38]:
# Rows that have no MonthID value.
final_table[final_table['MonthID'].isna()]

Unnamed: 0,FIPS,Year,YearID,MonthID,MedianHousePrice,AverageRate,AveragePoints,MedianIncome,County,AgeGroup,MonthlyIncome,MonthlyMortgage,mortgage_income_ratio,affordable
0,34001,2020,21.0,,225222.000000,3.114167,0.733333,64806.352802,Atlantic County,25-44,5400.529400,1202.577122,0.222678,Yes
1,34001,2020,21.0,,225222.000000,3.114167,0.733333,87133.999209,Atlantic County,45-64,7261.166601,1202.577122,0.165618,Yes
2,34001,2020,21.0,,225222.000000,3.114167,0.733333,46802.105007,Atlantic County,65-plus,3900.175417,1202.577122,0.308339,No
3,34001,2021,22.0,,270244.500000,2.956667,0.691667,65173.141791,Atlantic County,25-44,5431.095149,1422.724043,0.261959,No
4,34001,2021,22.0,,270244.500000,2.956667,0.691667,93862.889829,Atlantic County,45-64,7821.907486,1422.724043,0.181890,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,34041,2021,22.0,,311226.583333,2.956667,0.691667,98532.990771,Warren County,overall,8211.082564,1638.477538,0.199545,Yes
239,34041,2021,22.0,,311226.583333,2.956667,0.691667,39347.604590,Warren County,under-25,3278.967049,1638.477538,0.499693,No
240,34041,2022,23.0,,340601.000000,3.793333,0.766667,69144.780119,Warren County,25-44,5762.065010,1931.918066,0.335282,No
241,34041,2022,23.0,,340601.000000,3.793333,0.766667,105559.985892,Warren County,overall,8796.665491,1931.918066,0.219619,Yes


In [26]:
# Number of rows per affordability label.
final_table['affordable'].value_counts()

No         14813
Yes         4294
Missing        9
Name: affordable, dtype: int64

In [27]:
# Missing-value count per column, showing only columns that have at least one,
# largest count first.
null_counts = final_table.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

MedianIncome             9
MonthlyIncome            9
mortgage_income_ratio    9
dtype: int64

In [28]:
# Remove the YearID column (Year is kept). `errors='ignore'` makes the cell safe to
# re-run: without it, a second run raises because YearID has already been dropped.
final_table = final_table.drop(columns = ['YearID'], errors = 'ignore')
final_table

Unnamed: 0,FIPS,Year,MedianHousePrice,AverageRate,AveragePoints,MedianIncome,County,AgeGroup,MonthlyIncome,MonthlyMortgage,mortgage_income_ratio,affordable
60,34001,2005,249344.000000,5.710000,0.700000,52040.000000,Atlantic County,25-44,4336.666667,1667.638046,0.384544,No
61,34001,2005,249344.000000,5.710000,0.700000,62816.000000,Atlantic County,45-64,5234.666667,1667.638046,0.318576,No
62,34001,2005,249344.000000,5.710000,0.700000,30697.000000,Atlantic County,65-plus,2558.083333,1667.638046,0.651909,No
63,34001,2005,249344.000000,5.710000,0.700000,50377.000000,Atlantic County,overall,4198.083333,1667.638046,0.397238,No
64,34001,2005,249344.000000,5.710000,0.700000,28074.000000,Atlantic County,under-25,2339.500000,1667.638046,0.712818,No
...,...,...,...,...,...,...,...,...,...,...,...,...
238,34041,2021,311226.583333,2.956667,0.691667,98532.990771,Warren County,overall,8211.082564,1638.477538,0.199545,Yes
239,34041,2021,311226.583333,2.956667,0.691667,39347.604590,Warren County,under-25,3278.967049,1638.477538,0.499693,No
240,34041,2022,340601.000000,3.793333,0.766667,69144.780119,Warren County,25-44,5762.065010,1931.918066,0.335282,No
241,34041,2022,340601.000000,3.793333,0.766667,105559.985892,Warren County,overall,8796.665491,1931.918066,0.219619,Yes


In [29]:
# Write the final table to 'affordability_results.csv' without the index column.
# NOTE(review): the monthly analysis has a commented-out export to this same file
# name; confirm which table is meant to be the published result.
final_table.to_csv('affordability_results.csv', index = False)

In [31]:
# Spot check: rows for year 2005 in the '65-plus' age group.
final_table[final_table['Year'].eq(2005) & final_table['AgeGroup'].eq('65-plus')]

Unnamed: 0,FIPS,Year,MedianHousePrice,AverageRate,AveragePoints,MedianIncome,County,AgeGroup,MonthlyIncome,MonthlyMortgage,mortgage_income_ratio,affordable
62,34001,2005,249344.0,5.71,0.7,30697.0,Atlantic County,65-plus,2558.083333,1667.638046,0.651909,No
67,34001,2005,252209.0,5.63,0.7,30697.0,Atlantic County,65-plus,2558.083333,1675.564286,0.655008,No
72,34001,2005,254847.0,5.93,0.7,30697.0,Atlantic County,65-plus,2558.083333,1735.891911,0.678591,No
77,34001,2005,258515.0,5.86,0.6,30697.0,Atlantic County,65-plus,2558.083333,1750.689182,0.684375,No
82,34001,2005,262680.0,5.72,0.6,30697.0,Atlantic County,65-plus,2558.083333,1758.296520,0.687349,No
...,...,...,...,...,...,...,...,...,...,...,...,...
19837,34041,2005,314481.0,5.82,0.5,29612.0,Warren County,65-plus,2467.666667,2122.633537,0.860178,No
19842,34041,2005,317453.0,5.77,0.6,29612.0,Warren County,65-plus,2467.666667,2133.800722,0.864704,No
19847,34041,2005,320453.0,6.07,0.5,29612.0,Warren County,65-plus,2467.666667,2208.149808,0.894833,No
19852,34041,2005,323522.0,6.33,0.6,29612.0,Warren County,65-plus,2467.666667,2277.329925,0.922868,No


In [156]:
# Scratch arithmetic; nothing else in the visible notebook uses this value.
# NOTE(review): 2002 may be a typo for 2020 - confirm, or delete this cell.
print(2002/2022)

0.990108803165183


In [157]:
# Scratch arithmetic: distance between the first and last predicted year.
2022-2020

2

In [189]:
# Scratch: for each year number 2000..2021, print how far it is above 2000, in percent.
for i in range(2000,2022):
    val = ((i/2000)-1)*100
    print(val)

0.0
0.04999999999999449
0.09999999999998899
0.15000000000000568
0.20000000000000018
0.24999999999999467
0.29999999999998916
0.35000000000000586
0.40000000000000036
0.44999999999999485
0.49999999999998934
0.550000000000006
0.6000000000000005
0.649999999999995
0.6999999999999895
0.7500000000000062
0.8000000000000007
0.8499999999999952
0.8999999999999897
0.9500000000000064
1.0000000000000009
1.0499999999999954
