In [1]:
import datetime as dt
import os

import matplotlib.pyplot as plt
import pandas as pd
import pymssql
import seaborn as sns

# Connection settings for the capstone SQL Server database.
# Every value can be supplied through an environment variable so that real
# credentials do not have to be edited into (or committed with) the notebook.
# NOTE(review): the literal fallbacks keep the notebook runnable exactly as
# before, but a password that has lived in source should be rotated and its
# fallback removed once the environment variables are in place.
database = os.environ.get("ARCTIC_DB_NAME", "arctic_analysts_capstone")
user = os.environ.get("ARCTIC_DB_USER", "arctic_analysts")
password = os.environ.get("ARCTIC_DB_PASSWORD", "ThisPassw0rd!")
server = os.environ.get("ARCTIC_DB_SERVER", "gen10-data-fundamentals-22-02-sql-server.database.windows.net")

def sql_query(query):
    """Run a SQL query against the project database and return the rows.

    A new connection is opened on every call from the module-level
    ``server``, ``user``, ``password`` and ``database`` settings. The
    connection is always closed before the function exits, including when
    the query itself raises.

    Parameters
    ----------
    query : str
        SQL text handed to ``pandas.read_sql`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The result set produced by ``pandas.read_sql``.
    """
    conn = pymssql.connect(server, user, password, database)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Without this every call would leave an open server connection behind.
        conn.close()


# Database tables pulled into the notebook, in the order they are queried.
tables = ['year', 'month', 'county', 'median_income', 'main_table']

# Fetch each table in full with its own query, keeping the results in the
# same order as `tables`.
frames = []
for table_name in tables:
    query = f"SELECT * FROM {table_name}"
    frames.append(sql_query(query))

year_df, month_df, county_df, median_income_df, main_table = frames


# Attach the lookup tables to main_table with inner joins, then drop the
# surrogate key columns that are no longer needed.
# Row counts recorded when this was first run: 5607 after each of the
# year / month / county joins, 18900 after the median-income join.
master_table = (
    main_table
    .merge(year_df, on='YearID', how='inner')
    .merge(month_df, on='MonthID', how='inner')
    .merge(county_df, on='FIPS', how='inner')
    .merge(median_income_df, on=['FIPS', 'YearID'], how='inner')
    .drop(columns=['YearID', 'MonthID'])
)
master_table

Unnamed: 0,FIPS,NewUnits,NewBuildings,MedianHousePrice,AverageRate,AveragePoints,Year,Month,County,AgeGroup,MedianIncome
0,34001,107,103,249344.0,5.71,0.7,2005,Jan,Atlantic County,25-44,52040
1,34001,107,103,249344.0,5.71,0.7,2005,Jan,Atlantic County,45-64,62816
2,34001,107,103,249344.0,5.71,0.7,2005,Jan,Atlantic County,65-plus,30697
3,34001,107,103,249344.0,5.71,0.7,2005,Jan,Atlantic County,overall,50377
4,34001,107,103,249344.0,5.71,0.7,2005,Jan,Atlantic County,under-25,28074
...,...,...,...,...,...,...,...,...,...,...,...
18895,34041,10,6,248332.0,3.72,0.7,2019,Dec,Warren County,25-44,86705
18896,34041,10,6,248332.0,3.72,0.7,2019,Dec,Warren County,45-64,101995
18897,34041,10,6,248332.0,3.72,0.7,2019,Dec,Warren County,65-plus,55687
18898,34041,10,6,248332.0,3.72,0.7,2019,Dec,Warren County,overall,84479


In [None]:


# Keep only the rows of the '25-44' age group; the copy makes `filtered`
# independent of master_table.
age_25_44_rows = master_table['AgeGroup'] == '25-44'
filtered = master_table.loc[age_25_44_rows].copy()

def convert_to_date(year, month):
    """Return the 15th of the given month as a ``'YYYY-MM-DD'`` string.

    Parameters
    ----------
    year : int or str
        Year including the century, e.g. ``2005``.
    month : str
        Abbreviated month name as accepted by ``strptime``'s ``%b``
        directive, e.g. ``'Jan'``.

    Returns
    -------
    str
        The date formatted with ``'%Y-%m-%d'``; the day is always 15.

    Raises
    ------
    ValueError
        If ``strptime`` cannot parse the year/month combination.
    """
    # Parse once and format the resulting date directly, instead of binding
    # one name first to a date object and then to a string.
    parsed = dt.datetime.strptime(f'{year}-{month}-15', '%Y-%b-%d').date()
    return parsed.strftime('%Y-%m-%d')

# Build the 'date' column from each row's Year and Month.
# Walking the two columns directly avoids a row-wise DataFrame.apply, which
# constructs a Series for every row and, when `filtered` has no rows, hands
# back a frame rather than a single column to assign.
filtered['date'] = [
    convert_to_date(year, month)
    for year, month in zip(filtered['Year'], filtered['Month'])
]

In [3]:
def income_data():
    """ This is a static call that will only return data once."""

    print("Querying Income Data")
    query = "SELECT * FROM median_income"
    income_df = sql_query(query)
    year_df = sql_query('SELECT * FROM year')
    
    income_df = pd.merge(income_df, year_df, left_on = 'YearID', right_on = 'YearID')
    income_df = income_df.sort_values(by = 'MedianIncome', ascending = False)
    return income_df

In [4]:
# Fetch the income data; income_data() prints a progress message and runs
# its database queries each time this cell is executed.
df = income_data()

Querying Income Data


In [9]:
# Blank out negative MedianIncome values so they are stored as missing.
negative_income = df['MedianIncome'] < 0
df.loc[negative_income, 'MedianIncome'] = None

In [None]:
# These rcParams are global, so they also apply to figures drawn afterwards.
plt.rcParams.update({
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    'legend.title_fontsize': 'xx-large',
})

fig, ax = plt.subplots(figsize=(15, 4))
ax.set_title("Average Mortgage Rates", fontsize=32)

# One marker per row of monthly_mr, coloured by its AveragePoints value.
marker_style = dict(s=100, alpha=.8, edgecolor='black', linewidth=1)
sns.scatterplot(
    data=monthly_mr,
    x='Date',
    y='AverageRate',
    hue='AveragePoints',
    palette='magma',
    legend=True,
    ax=ax,
    **marker_style,
)

ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Rate', fontsize=14)
ax.tick_params(size=10)
ax.legend(title='Average Points', fontsize=18, markerscale=4)
sns.despine()
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# Expose the row index as an 'index' column and drop rows with missing values.
# The reset is skipped when the column already exists and the frame is rebound
# rather than mutated in place, so re-running this cell is harmless; a repeated
# in-place reset_index would first add a stray 'level_0' column and then raise
# once that name is taken as well.
if 'index' not in monthly_mr.columns:
    monthly_mr = monthly_mr.reset_index()
monthly_mr = monthly_mr.dropna()

In [None]:
# Predictor (the 'index' column) and target, selected once and reused for
# both fitting and prediction.
x = monthly_mr[['index']]
y = monthly_mr[['AverageRate']]

# Degree-5 polynomial expansion of the predictor followed by a linear fit.
# Step names are left exactly as they were so existing named_steps lookups
# keep working.
Input = [('polynomial', PolynomialFeatures(degree=5)), ('modal', LinearRegression())]
pipe = Pipeline(Input)
pipe.fit(x, y)

# y is a one-column frame, so the predictions come back with one column too;
# flatten them so the curve is plotted from plain 1-D values.
poly_pred = pipe.predict(x).ravel()

# Order the fitted values by predictor so the line is drawn left to right.
sorted_zip = sorted(zip(monthly_mr['index'], poly_pred), key=lambda pair: pair[0])
x_poly, poly_pred = zip(*sorted_zip)

# These rcParams are global and persist for later figures.
plt.rcParams['xtick.labelsize'] = 13
plt.rcParams['ytick.labelsize'] = 13

fig, ax = plt.subplots(figsize=(20, 8))
ax.set_title("Average Mortgage Rates", fontsize=32)

# Faded observations in the background; legend entries for the hue are
# suppressed so the legend only lists the fitted curve.
sns.scatterplot(data=monthly_mr,
                x='index',
                y='AverageRate',
                hue='AveragePoints',
                palette='magma',
                s=100,
                alpha=.32,
                edgecolor='black',
                linewidth=1,
                legend=False,
                ax=ax)

ax.plot(x_poly, poly_pred, color='green', linewidth=4, linestyle='dashed', label='Polynomial Regression')

ax.set_xlabel('Predictor', fontsize=14)
ax.set_ylabel('Target', fontsize=14)
ax.tick_params(size=10)
ax.legend(fontsize=18, markerscale=1, loc=5)
sns.despine()
plt.show()