In [77]:
# Imports

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pymssql
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [78]:
# Credentials
# Functions to read from database and join tables

# Connection settings for the SQL Server database queried by sql_query().
# Each value can be overridden with an environment variable; when the variable
# is not set the notebook's original value is used, so existing runs behave
# the same.
database = os.environ.get("ARCTIC_DB_NAME", "arctic_analysts_capstone")
user = os.environ.get("ARCTIC_DB_USER", "arctic_analysts")
# SECURITY(review): the fallback below is a real credential committed to the
# notebook. Rotate it and remove the default once ARCTIC_DB_PASSWORD is
# provisioned wherever this notebook runs.
password = os.environ.get("ARCTIC_DB_PASSWORD", "ThisPassw0rd!")
server = os.environ.get(
    "ARCTIC_DB_SERVER",
    "gen10-data-fundamentals-22-02-sql-server.database.windows.net",
)

def sql_query(query):
    """Run ``query`` against the configured database and return the result.

    A new pymssql connection is opened from the module-level ``server``,
    ``user``, ``password`` and ``database`` settings for every call.

    Parameters
    ----------
    query : str
        SQL text passed unchanged to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    conn = pymssql.connect(server, user, password, database)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even when the query raises;
        # previously every call left its connection open.
        conn.close()

def run_queries():
    """Fetch every source table in full and return them as a list.

    Returns
    -------
    list of pandas.DataFrame
        The result of ``SELECT *`` on, in this order: ``year``, ``month``,
        ``county``, ``median_income`` and ``main_table``. ``join_tables``
        relies on this ordering.
    """
    table_names = ("year", "month", "county", "median_income", "main_table")
    # One full-table query per name, issued in the listed order.
    return [sql_query(f"SELECT * FROM {name}") for name in table_names]

def join_tables(all_df):
    """Outer-join the source tables into one wide master table.

    Parameters
    ----------
    all_df : sequence of pandas.DataFrame
        Positions 0-4 hold the year, month, county, median-income and main
        tables, in that order (the layout returned by ``run_queries``).

    Returns
    -------
    pandas.DataFrame
        The main table outer-joined to year (on ``YearID``), month (on
        ``MonthID``), county (on ``FIPS``) and median income (on ``FIPS`` and
        ``YearID``), with negative ``MedianIncome`` values replaced by missing
        values.

    Notes
    -----
    The row count is printed after each merge. In the recorded run it stayed
    at 5607 for the three lookup joins and grew to 20727 after the
    median-income join.
    """
    year_df, month_df, county_df, median_income_df, main_table = (
        all_df[position] for position in range(5)
    )

    # Single-key lookup tables, joined one after another onto the main table.
    lookup_joins = (
        (year_df, "YearID"),
        (month_df, "MonthID"),
        (county_df, "FIPS"),
    )
    master_table = main_table
    for lookup_df, key in lookup_joins:
        master_table = pd.merge(master_table, lookup_df, on=key, how="outer")
        print(master_table.shape[0])

    # Median income is keyed by county and year together.
    master_table = pd.merge(
        master_table, median_income_df, on=["FIPS", "YearID"], how="outer"
    )
    print(master_table.shape[0])

    # Negative incomes are blanked out so they are treated as missing.
    negative_income = master_table["MedianIncome"] < 0
    master_table.loc[negative_income, "MedianIncome"] = None
    return master_table

# Pull every source table from the database, then merge them into the single
# wide frame used by the rest of the notebook. join_tables() prints the row
# count after each merge; those counts are the cell output below.
all_df = run_queries()
master_table = join_tables(all_df)

5607
5607
5607
20727


In [91]:
# dropping nulls for ml model
master_table = master_table.dropna()
final_table = master_table.copy()

# Calculating affordability

# calculate monthly income
final_table['MonthlyIncome'] = final_table['MedianIncome']/12

# calculate monthly mortgage payment
# https://www.educba.com/mortgage-formula/
#
# Every term below is a whole-column (vectorised) expression, so it is
# evaluated exactly once. It used to sit inside `for row in final_table:`,
# which iterates over the column labels and simply recomputed the same result
# once per column.
P = final_table['MedianHousePrice']-(final_table['MedianHousePrice']*.2)  # price less a 20% share
r = (final_table['AverageRate']/100)  # AverageRate / 100 (presumably percent -> fraction; confirm)
t = 30  # loan term used in the exponent n*t (years)
n = 12  # payments per year
monthly_tax = (final_table['MedianHousePrice']*.0189)/12  # 1.89% of the price per year, spread over 12 months
growth = pow((1+(r/n)),(n*t))
final_table['MonthlyMortgage'] = (P * (((r/n) * growth) / (growth-1))) + monthly_tax


# mortgage to income ratio
final_table['mortgage_income_ratio'] = final_table['MonthlyMortgage']/final_table['MonthlyIncome']

# affordability determination
def affordable_condition(x):
    """Label a mortgage-to-income ratio.

    Returns 'Missing' when ``x`` is NaN, 'Yes' when ``x`` is at most 0.25,
    and 'No' otherwise.
    """
    if np.isnan(x):
        return 'Missing'
    return 'Yes' if x <= .25 else 'No'

final_table['affordable'] = final_table['mortgage_income_ratio'].apply(affordable_condition)

# MACHINE LEARNING PREP
#
# NOTE(review): 'affordable' is computed from MedianHousePrice, AverageRate and
# MedianIncome, and those same columns are kept as model features below, so
# very high scores are to be expected - confirm this is intended.
# NOTE(review): the scaler is fitted on the full table before any train/test
# split, so test rows influence the scaling statistics.

# dropping calculated columns
ml_table = final_table.drop(['MonthlyMortgage','MonthlyIncome','mortgage_income_ratio'],axis=1)

# list of numerical columns
numerical = ['NewUnits','NewBuildings', 'MedianHousePrice', 'AverageRate',
       'AveragePoints', 'MedianIncome']

# standardizing (reuses the list above instead of repeating the column names)
sc_X = StandardScaler()
numerical_standard = sc_X.fit_transform(ml_table[numerical])

# dropping numerical columns from main ml_table
ml_table = ml_table.drop(numerical,axis=1)

# changing categorical columns to string for categorical
ml_table['Year'] = ml_table['Year'].astype('string')

# categorical columns get dummies
Year = pd.get_dummies(ml_table['Year'])
County = pd.get_dummies(ml_table['County'])
AgeGroup = pd.get_dummies(ml_table['AgeGroup'])
Month = pd.get_dummies(ml_table['Month'])

# creating labels as y
# Work on an explicit copy and map in one step. The previous chained
# assignment (labels.affordable[mask] = 1) writes through a temporary object:
# it triggers SettingWithCopyWarning and does nothing under copy-on-write.
# Any value other than 'Yes'/'No' (i.e. 'Missing') becomes NaN.
labels = ml_table[['affordable']].copy()
labels['affordable'] = labels['affordable'].map({'Yes': 1, 'No': 0})

# joining tables for ml

numerical_ml = pd.DataFrame(data=numerical_standard,columns=numerical)
ml_table = ml_table.drop(['Year','County','AgeGroup','Month','FIPS'],axis=1)
# reset_index returns a new frame; its result was previously discarded
ml_table = ml_table.reset_index(drop=True)
categorical_ml = pd.concat([Year,County,AgeGroup,Month],axis=1)
all_ml = pd.concat([numerical_ml.reset_index(drop=True),categorical_ml.reset_index(drop=True)],axis=1)
labels = labels.apply(pd.to_numeric)


In [92]:
# numerical columns ml feature importance

# LOGISTIC MODEL
# `labels` was already converted to numeric at the end of the prep cell, so it
# is only flattened here to the 1-D target array passed to the split.
X_train, X_test, y_train, y_test = train_test_split(
    numerical_ml, labels.values.ravel(), test_size=0.25, random_state=0
)
numerical_ml_log = LogisticRegression(random_state=0, class_weight='balanced')
numerical_ml_log.fit(X_train, y_train)
y_pred = numerical_ml_log.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))

# One fitted coefficient per feature, in a tidy frame that the plot reads
# from. Previously `df` was built but never used, and the raw coefficient
# array was passed as `data_frame`.
importance = numerical_ml_log.coef_.flatten()
df = pd.DataFrame({'Feature': numerical_ml.columns, 'Score': importance})

# plotting
fig = px.bar(data_frame=df, x='Score', y='Feature', orientation='h', width=800, height=400,
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      3487
           1       0.96      1.00      0.98      1232

    accuracy                           0.99      4719
   macro avg       0.98      0.99      0.99      4719
weighted avg       0.99      0.99      0.99      4719



In [93]:
# categorical columns ml feature importance

# `labels` was already converted to numeric at the end of the prep cell, so it
# is only flattened here to the 1-D target array passed to the split.
X_train, X_test, y_train, y_test = train_test_split(
    categorical_ml, labels.values.ravel(), test_size=0.25, random_state=0
)
categorical_ml_log = LogisticRegression(random_state=0, class_weight='balanced')
categorical_ml_log.fit(X_train, y_train)
y_pred = categorical_ml_log.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))

# One fitted coefficient per dummy column, in a tidy frame that the plot reads
# from. Previously `df` was built but never used, and the raw coefficient
# array was passed as `data_frame`.
importance = categorical_ml_log.coef_.flatten()
df = pd.DataFrame({'Feature': categorical_ml.columns, 'Score': importance})

# plotting
fig = px.bar(data_frame=df, x='Score', y='Feature', orientation='h', width=800, height=1000,
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.97      3487
           1       0.85      0.98      0.91      1232

    accuracy                           0.95      4719
   macro avg       0.92      0.96      0.94      4719
weighted avg       0.96      0.95      0.95      4719



In [94]:
# all columns ml feature importance

# `labels` was already converted to numeric at the end of the prep cell, so it
# is only flattened here to the 1-D target array passed to the split.
X_train, X_test, y_train, y_test = train_test_split(
    all_ml, labels.values.ravel(), test_size=0.25, random_state=0
)
all_ml_log = LogisticRegression(random_state=0, class_weight='balanced')
all_ml_log.fit(X_train, y_train)
y_pred = all_ml_log.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))

# One fitted coefficient per column (scaled numerics plus dummies), in a tidy
# frame that the plot reads from. Previously `df` was built but never used,
# and the raw coefficient array was passed as `data_frame`.
importance = all_ml_log.coef_.flatten()
df = pd.DataFrame({'Feature': all_ml.columns, 'Score': importance})

# plotting
fig = px.bar(data_frame=df, x='Score', y='Feature', orientation='h', width=800, height=1000,
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      3487
           1       0.97      1.00      0.98      1232

    accuracy                           0.99      4719
   macro avg       0.99      0.99      0.99      4719
weighted avg       0.99      0.99      0.99      4719



In [95]:
# BUILDING MODELS FOR LAGGED BUILDING PERMITS BY 2 YEARS

# dropping nulls for ml model
master_table = master_table.dropna()
final_table = master_table.copy()

# lagging building permits by 2 years
# .copy() makes buildings_df an independent frame, so shifting Year no longer
# writes into a slice of final_table (the source of SettingWithCopyWarning).
buildings_df = final_table[['FIPS','Year','Month','NewUnits','NewBuildings','AgeGroup']].copy()
buildings_df['Year'] = buildings_df['Year']+2

# merging back building permits
# After the shift, a row keyed by year Y carries the permit counts recorded
# for year Y-2. The inner join keeps only key combinations present on both
# sides.
final_table = final_table.drop(['NewUnits','NewBuildings'],axis=1)
final_table = pd.merge(final_table, buildings_df, on=['FIPS','Year','Month','AgeGroup'], how='inner')

# Calculating affordability

# calculate monthly income
final_table['MonthlyIncome'] = final_table['MedianIncome']/12

# calculate monthly mortgage payment
# https://www.educba.com/mortgage-formula/
#
# Every term below is a whole-column (vectorised) expression, so it is
# evaluated exactly once. It used to sit inside `for row in final_table:`,
# which iterates over the column labels and simply recomputed the same result
# once per column.
P = final_table['MedianHousePrice']-(final_table['MedianHousePrice']*.2)  # price less a 20% share
r = (final_table['AverageRate']/100)  # AverageRate / 100 (presumably percent -> fraction; confirm)
t = 30  # loan term used in the exponent n*t (years)
n = 12  # payments per year
monthly_tax = (final_table['MedianHousePrice']*.0189)/12  # 1.89% of the price per year, spread over 12 months
growth = pow((1+(r/n)),(n*t))
final_table['MonthlyMortgage'] = (P * (((r/n) * growth) / (growth-1))) + monthly_tax


# mortgage to income ratio
final_table['mortgage_income_ratio'] = final_table['MonthlyMortgage']/final_table['MonthlyIncome']

# affordability determination
def affordable_condition(x):
    """Turn a mortgage-to-income ratio into an affordability label.

    NaN maps to 'Missing'; a ratio of 0.25 or less maps to 'Yes'; any other
    value maps to 'No'.
    """
    if np.isnan(x):
        return 'Missing'
    if x <= .25:
        return 'Yes'
    return 'No'

final_table['affordable'] = final_table['mortgage_income_ratio'].apply(affordable_condition)

# MACHINE LEARNING PREP
#
# NOTE(review): 'affordable' is computed from MedianHousePrice, AverageRate and
# MedianIncome, and those same columns are kept as model features below, so
# very high scores are to be expected - confirm this is intended.
# NOTE(review): the scaler is fitted on the full table before any train/test
# split, so test rows influence the scaling statistics.

# dropping calculated columns
ml_table = final_table.drop(['MonthlyMortgage','MonthlyIncome','mortgage_income_ratio'],axis=1)

# list of numerical columns
numerical = ['NewUnits','NewBuildings', 'MedianHousePrice', 'AverageRate',
       'AveragePoints', 'MedianIncome']

# standardizing (reuses the list above instead of repeating the column names)
sc_X = StandardScaler()
numerical_standard = sc_X.fit_transform(ml_table[numerical])

# dropping numerical columns from main ml_table
ml_table = ml_table.drop(numerical,axis=1)

# changing categorical columns to string for categorical
ml_table['Year'] = ml_table['Year'].astype('string')

# categorical columns get dummies
Year = pd.get_dummies(ml_table['Year'])
County = pd.get_dummies(ml_table['County'])
AgeGroup = pd.get_dummies(ml_table['AgeGroup'])
Month = pd.get_dummies(ml_table['Month'])

# creating labels as y
# Work on an explicit copy and map in one step. The previous chained
# assignment (labels.affordable[mask] = 1) writes through a temporary object:
# it triggers SettingWithCopyWarning and does nothing under copy-on-write.
# Any value other than 'Yes'/'No' (i.e. 'Missing') becomes NaN.
labels = ml_table[['affordable']].copy()
labels['affordable'] = labels['affordable'].map({'Yes': 1, 'No': 0})

# joining tables for ml

numerical_ml = pd.DataFrame(data=numerical_standard,columns=numerical)
ml_table = ml_table.drop(['Year','County','AgeGroup','Month','FIPS'],axis=1)
# reset_index returns a new frame; its result was previously discarded
ml_table = ml_table.reset_index(drop=True)
categorical_ml = pd.concat([Year,County,AgeGroup,Month],axis=1)
all_ml = pd.concat([numerical_ml.reset_index(drop=True),categorical_ml.reset_index(drop=True)],axis=1)
labels = labels.apply(pd.to_numeric)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [96]:
# numerical columns ml feature importance

# `labels` was already converted to numeric at the end of the prep cell, so it
# is only flattened here to the 1-D target array passed to the split.
X_train, X_test, y_train, y_test = train_test_split(
    numerical_ml, labels.values.ravel(), test_size=0.25, random_state=0
)
numerical_ml_log = LogisticRegression(random_state=0, class_weight='balanced')
numerical_ml_log.fit(X_train, y_train)
y_pred = numerical_ml_log.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))

# One fitted coefficient per feature, in a tidy frame that the plot reads
# from. Previously `df` was built but never used, and the raw coefficient
# array was passed as `data_frame`.
importance = numerical_ml_log.coef_.flatten()
df = pd.DataFrame({'Feature': numerical_ml.columns, 'Score': importance})

# plotting
fig = px.bar(data_frame=df, x='Score', y='Feature', orientation='h', width=800, height=400,
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      2880
           1       0.96      1.00      0.98      1202

    accuracy                           0.99      4082
   macro avg       0.98      0.99      0.98      4082
weighted avg       0.99      0.99      0.99      4082



In [97]:
# categorical columns ml feature importance

# `labels` was already converted to numeric at the end of the prep cell, so it
# is only flattened here to the 1-D target array passed to the split.
X_train, X_test, y_train, y_test = train_test_split(
    categorical_ml, labels.values.ravel(), test_size=0.25, random_state=0
)
categorical_ml_log = LogisticRegression(random_state=0, class_weight='balanced')
categorical_ml_log.fit(X_train, y_train)
y_pred = categorical_ml_log.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))

# One fitted coefficient per dummy column, in a tidy frame that the plot reads
# from. Previously `df` was built but never used, and the raw coefficient
# array was passed as `data_frame`.
importance = categorical_ml_log.coef_.flatten()
df = pd.DataFrame({'Feature': categorical_ml.columns, 'Score': importance})

# plotting
fig = px.bar(data_frame=df, x='Score', y='Feature', orientation='h', width=800, height=1000,
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.96      2880
           1       0.87      0.97      0.91      1202

    accuracy                           0.95      4082
   macro avg       0.93      0.95      0.94      4082
weighted avg       0.95      0.95      0.95      4082



In [98]:
# all columns ml feature importance

# `labels` was already converted to numeric at the end of the prep cell, so it
# is only flattened here to the 1-D target array passed to the split.
X_train, X_test, y_train, y_test = train_test_split(
    all_ml, labels.values.ravel(), test_size=0.25, random_state=0
)
all_ml_log = LogisticRegression(random_state=0, class_weight='balanced')
all_ml_log.fit(X_train, y_train)
y_pred = all_ml_log.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))

# One fitted coefficient per column (scaled numerics plus dummies), in a tidy
# frame that the plot reads from. Previously `df` was built but never used,
# and the raw coefficient array was passed as `data_frame`.
importance = all_ml_log.coef_.flatten()
df = pd.DataFrame({'Feature': all_ml.columns, 'Score': importance})

# plotting
fig = px.bar(data_frame=df, x='Score', y='Feature', orientation='h', width=800, height=1000,
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      2880
           1       0.96      1.00      0.98      1202

    accuracy                           0.99      4082
   macro avg       0.98      0.99      0.99      4082
weighted avg       0.99      0.99      0.99      4082

