# Experimentation Notebook

This notebook's purpose is to run a logistic model on Year, Counties, Age Groups, Median Income, Average Mortgage Rates, and Median House Price. After it is trained and tested with a split train test dataset, save it with job lib. Read in saved model and predict yes or no with new predicted income from the ARIMA model just to test the basic logistic regression model. The relatively newer data that the model has not seen are year 2020-2022. As we can see, it predicts no better than yes regardless of the input. 

In [None]:
# Imports

import joblib
import pymssql
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [117]:
# Credentials
# Functions to read from database and join tables

database = "arctic_analysts_capstone"
user = "arctic_analysts"
password  = "ThisPassw0rd!"
server = "gen10-data-fundamentals-22-02-sql-server.database.windows.net"

def sql_query(query):
    conn = pymssql.connect(server, user, password, database)
    cursor = conn.cursor()
    queried_data = pd.read_sql(query, conn)
    return queried_data

def run_queries():
    tables = ["year", "month", "county", "median_income", "main_table"]

    query = f"SELECT * FROM {tables[0]}"
    year_df = sql_query(query)

    query = f"SELECT * FROM {tables[1]}"
    month_df = sql_query(query)

    query = f"SELECT * FROM {tables[2]}"
    county_df = sql_query(query)

    query = f"SELECT * FROM {tables[3]}"
    median_income_df = sql_query(query)

    query = f"SELECT * FROM {tables[4]}"
    main_table = sql_query(query)

    all_df = [year_df, month_df, county_df, median_income_df, main_table]
    return all_df

def join_tables(all_df):
    year_df = all_df[0]
    month_df = all_df[1]
    county_df = all_df[2]
    median_income_df = all_df[3]
    main_table = all_df[4]

    master_table = pd.merge(
        main_table, year_df, left_on="YearID", right_on="YearID", how="outer"
    )
    print(master_table.shape[0])
    # Now has 5607 rows

    master_table = pd.merge(
        master_table, month_df, left_on="MonthID", right_on="MonthID", how="outer"
    )
    # Now has 5607 rows
    print(master_table.shape[0])

    master_table = pd.merge(
        master_table, county_df, left_on="FIPS", right_on="FIPS", how="outer"
    )
    # Now has 5607 rows
    print(master_table.shape[0])

    master_table = pd.merge(
        master_table,
        median_income_df,
        left_on=["FIPS", "YearID"],
        right_on=["FIPS", "YearID"],
        how="outer",
    )
    # Now has 20727 rows
    print(master_table.shape[0])

    master_table.loc[(master_table.MedianIncome < 0), "MedianIncome"] = None
    return master_table

all_df = run_queries()
master_table = join_tables(all_df)

5607
5607
5607
20727


In [118]:
# year 2020-2022 aggregated
predicted_years = master_table[(master_table['Year'] == 2020) | (master_table['Year'] == 2021) | (master_table['Year'] == 2022)]
predicted_df = predicted_years[['FIPS','Year','YearID','County','MedianHousePrice','AverageRate','AveragePoints']].groupby(by=['FIPS','Year','County']).agg('mean').reset_index()
predicted_df['FIPS'] = predicted_df['FIPS'].astype('int64')


# reading in income predictions to predict 2020-2022
df = pd.read_csv('PredictedIncomeFinal.csv')
df = df[['Year','FIPS','AgeGroup','train_and_predicted']]
income_predictions = df[(df.Year == 2020)| (df.Year == 2021) | (df.Year == 2022)]
income_predictions.rename(columns={'train_and_predicted':'MedianIncome'},inplace=True)

# merging income predictions to get the rest of the data
main_predictions = pd.merge(predicted_df, income_predictions, on=['Year','FIPS'],how='inner')
main_predictions

# adding income predictions to main table
final_table = master_table.dropna()
final_table = final_table[['FIPS','Year','County', 'YearID','MedianHousePrice', 'AverageRate', 'AveragePoints',
       'AgeGroup','MedianIncome']]
final_table['YearID'] = final_table['YearID'].astype(int)
final_table = pd.concat([final_table,main_predictions])

# CALCULATIONS BASED ON 12% DOWNPAYMENT

#calculate monthly income
final_table['MonthlyIncome'] = final_table['MedianIncome']/12

#calculate montly mortgage payment
#https://www.educba.com/mortgage-formula/

for row in final_table:
    P = final_table['MedianHousePrice']-(final_table['MedianHousePrice']*.12)
    r = (final_table['AverageRate']/100)
    t = 30
    n = 12
    monthly_tax = (final_table['MedianHousePrice']*.0189)/12
    final_table['MonthlyMortgage'] = (P * (((r/n) * pow((1+(r/n)),(n*t))) / (pow((1+r/n),(n*t))-1))) + monthly_tax


# mortgage to income ratio
final_table['mortgage_income_ratio'] = final_table['MonthlyMortgage']/final_table['MonthlyIncome']

#affordability determination
def affordable_condition(x):
    if x <= .25:
        return 'Yes'
    elif np.isnan(x):
        return 'Missing'
    else:
        return 'No'

final_table['affordable'] = final_table['mortgage_income_ratio'].apply(affordable_condition)

# MACHINE LEARNING PREP

# reset years for ml
final_table.YearID = final_table.YearID - 5
final_table = final_table.reset_index(drop=True)

# copying into a new dataframe for ml
ml_table = final_table.dropna()
ml_table = ml_table.drop(['FIPS','MonthlyIncome', 'MonthlyMortgage', 'mortgage_income_ratio'],axis=1)
ml_table.columns

numerical = ['MedianHousePrice', 'AverageRate', 'AveragePoints', 'MedianIncome']
# standardizing 
sc_X = StandardScaler()
numerical_standard = sc_X.fit_transform(ml_table[numerical])
numerical_ml = pd.DataFrame(data=numerical_standard,columns=numerical)

# dropping numerical data in main table
ml_table = ml_table.drop(ml_table[numerical],axis=1)

YearID = ml_table[['YearID']]
# get dummies for categorical columns
County = pd.get_dummies(ml_table['County'])
AgeGroup = pd.get_dummies(ml_table['AgeGroup'])

# setting labels for y values
labels = pd.DataFrame(ml_table['affordable'])
next_years_labels = labels.iloc[18873:,]
labels = labels.iloc[:18873,]

# setting new numerical df
numerical_ml = pd.concat([numerical_ml,YearID.reset_index(drop=True)],axis=1)

# setting new categorical df
categorical_ml = pd.concat([County,AgeGroup],axis=1)

# setting all df
all_ml = pd.concat([numerical_ml,categorical_ml.reset_index(drop=True)],axis=1)

next_years = all_ml.iloc[18873:,]
all_ml = all_ml.iloc[:18873,]

# LOGISTIC MODEL
X_train, X_test, y_train, y_test = train_test_split(all_ml, labels.values.ravel(),test_size=0.25,random_state=0)
model = LogisticRegression(random_state=0,class_weight='balanced',max_iter=1000)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print('Classification Report:\n',classification_report(y_test, y_pred))
importance = model.coef_.flatten()
df = pd.DataFrame(importance,all_ml.columns)

fig = px.bar(data_frame=importance, y=all_ml.columns, x=importance, orientation='h', width=800, height=900,\
    title='Feature Importance Summary')
fig.update_layout(xaxis_title="Score",yaxis_title="Features",)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Classification Report:
               precision    recall  f1-score   support

          No       1.00      0.98      0.99      3673
         Yes       0.95      1.00      0.97      1046

    accuracy                           0.99      4719
   macro avg       0.97      0.99      0.98      4719
weighted avg       0.99      0.99      0.99      4719



In [119]:
# save model, read in model to predict for year 2020-2022, get predictions
model_result = model.predict(next_years)
joblib.dump(model, 'basic_logistic_model.sav')
my_basic_logistic_model = joblib.load('basic_logistic_model.sav')
read_in_model_result = my_basic_logistic_model.predict(next_years)

In [120]:
# compare predictions
model_result_df = pd.DataFrame(data=model_result,columns=['Prediction'])
all_together = pd.concat([next_years_labels.reset_index(drop=True),model_result_df],axis=1)
print('Confusion Matrix:\n',confusion_matrix(next_years_labels, model_result),'\n')
print('Classification Report:\n',classification_report(next_years_labels, model_result))

Confusion Matrix:
 [[146   6]
 [  0  82]] 

Classification Report:
               precision    recall  f1-score   support

          No       1.00      0.96      0.98       152
         Yes       0.93      1.00      0.96        82

    accuracy                           0.97       234
   macro avg       0.97      0.98      0.97       234
weighted avg       0.98      0.97      0.97       234

