In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import pymssql
import scipy.stats
import seaborn as sns

In [None]:
# Connection settings for the capstone SQL Server database. Each value can be
# overridden through an environment variable, so the notebook can be pointed at
# another database (or given rotated credentials) without editing this cell.
# NOTE(review): the literal fallbacks keep the notebook runnable exactly as
# before, but a password committed in source should be rotated and its
# fallback removed.
database = os.environ.get("ARCTIC_DB_NAME", "arctic_analysts_capstone")
user = os.environ.get("ARCTIC_DB_USER", "arctic_analysts")
password = os.environ.get("ARCTIC_DB_PASSWORD", "ThisPassw0rd!")
server = os.environ.get("ARCTIC_DB_SERVER", "gen10-data-fundamentals-22-02-sql-server.database.windows.net")

def sql_query(query):
    """Run ``query`` against the configured SQL Server database.

    Opens a new pymssql connection using the notebook-level ``server``,
    ``user``, ``password`` and ``database`` settings, reads the result set
    with ``pd.read_sql`` and always closes the connection afterwards.

    Parameters
    ----------
    query : str
        SQL text to execute.

    Returns
    -------
    Whatever ``pd.read_sql`` returns for the query (a DataFrame).
    """
    conn = pymssql.connect(server, user, password, database)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Release the connection even when the query fails; previously every
        # call left an open connection behind.
        conn.close()

# Make a new cell here .....

In [None]:
# Table to pull into `df`. Alternatives that can be swapped in here:
# 'building_permits', 'mortgage_rates', 'median_income'.
target_table = 'house_prices'

# Query example: fetch every row of the selected table.
query = "SELECT * FROM " + target_table
df = sql_query(query)

In [None]:
def _to_int_price(value):
    """Convert one raw MedianHousePrice value to an int, or None when missing."""
    # Blank strings, None and NaN all mean "no price". Handling them up front
    # also makes this cell safe to re-run: after the first pass the blanks are
    # stored as missing values, which int(float(x)) cannot convert.
    if isinstance(value, str):
        value = value.strip()
        if value == '':
            return None
    elif pd.isna(value):
        return None
    return int(float(value))


df['MedianHousePrice'] = df['MedianHousePrice'].apply(_to_int_price)

In [None]:
# Order the full table by its 'Date' column.
df = df.sort_values(by='Date')

In [None]:
# Walk the FIPS codes in order of first appearance and stop at the first one
# whose rows contain no missing values; `subset` keeps that locale's rows.
# If every locale has a gap, `subset` is left holding the last locale tried.
locales = df['FIPS'].unique().tolist()
for locale in locales:
    subset = df[df['FIPS'] == locale].copy()
    if not subset.isnull().to_numpy().any():
        break
    


In [None]:
# Keep the selected locale's rows in 'Date' order.
subset = subset.sort_values(by='Date')

In [None]:
# Yearly series: mean of MedianHousePrice per (County, Year) pair.
alternate = (
    subset
    .groupby(['County', 'Year'])[['MedianHousePrice']]
    .mean()
    .reset_index()
)

In [None]:
# Give both frames a fresh 0..n-1 row index and expose it as an 'index'
# column, which the regression cells use as the x variable.
# Any 'index' column left by an earlier run is dropped first so this cell can
# be re-executed; reset_index() raises if the column already exists.
subset = (
    subset
    .drop(columns='index', errors='ignore')
    .reset_index(drop=True)
    .reset_index()
)

alternate = (
    alternate
    .drop(columns='index', errors='ignore')
    .reset_index(drop=True)
    .reset_index()
)

In [None]:
# Inspect the rows selected for the regression below.
subset

In [None]:
# Least-squares fit of MedianHousePrice against row position in `subset`.
x, y = (subset[column].to_numpy() for column in ('index', 'MedianHousePrice'))

slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)

In [None]:
plt.rcParams['xtick.labelsize'] = 13
plt.rcParams['ytick.labelsize'] = 13

# Legend text summarising the fit computed in the regression cell.
line = f'Slope: {slope:.2f}\nIntercept: {intercept:.2f}\nCorrelation coefficient: r={r:.2f}'

fig = plt.figure(figsize = (17,5))
plt.title(f"Date vs. Median Home Price | {subset.County.tolist()[0]}", fontsize = 20)
ax = sns.scatterplot(data = subset,
                     x = 'Date',
                     y = 'MedianHousePrice',
                     hue = 'MedianHousePrice',
                     palette = 'magma',
                     s = 100,
                     alpha = .8,
                     edgecolor = 'black',
                     linewidth = 1,
                     legend = False)

# Draw the fitted values at the same 'Date' positions as the scatter points.
# The fit uses row position (x = subset['index']); plotting the line against
# those raw positions only lines up with the points when the axis happens to
# place the dates at 0, 1, 2, ...
# estimator=None / sort=False keep one fitted value per row, in row order.
ax = sns.lineplot(x = subset['Date'].to_numpy(),
                  y = intercept + slope * x,
                  estimator = None,
                  sort = False,
                  label = line,
                  color = 'blue',
                  linewidth = 1.2)

ax.set_xlabel('Date', fontsize = 14)
ax.set_ylabel('Median Home Price', fontsize = 14)
ax.tick_params(size = 10)
sns.despine()
plt.show()

In [None]:
# Least-squares fit of the yearly mean price against row position in `alternate`.
x, y = (alternate[column].to_numpy() for column in ('index', 'MedianHousePrice'))

slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)

In [None]:
plt.rcParams['xtick.labelsize'] = 13
plt.rcParams['ytick.labelsize'] = 13

# Legend text summarising the fit computed in the regression cell.
line = f'Slope: {slope:.2f}\nIntercept: {intercept:.2f}\nCorrelation coefficient: r={r:.2f}'

fig = plt.figure(figsize = (17,5))
# The x axis of this chart is 'Year' (yearly means), so title and label say so.
plt.title(f"Year vs. Median Home Price | {subset.County.tolist()[0]}", fontsize = 20)
ax = sns.scatterplot(data = alternate,
                     x = 'Year',
                     y = 'MedianHousePrice',
                     hue = 'MedianHousePrice',
                     palette = 'magma',
                     s = 100,
                     alpha = .8,
                     edgecolor = 'black',
                     linewidth = 1,
                     legend = False)

# Draw the fitted values at the same 'Year' positions as the scatter points.
# The fit uses row position (x = alternate['index']); plotting the line at
# 0, 1, 2, ... would put it far away from points located at the year values.
# estimator=None / sort=False keep one fitted value per row, in row order.
ax = sns.lineplot(x = alternate['Year'].to_numpy(),
                  y = intercept + slope * x,
                  estimator = None,
                  sort = False,
                  label = line,
                  color = 'blue',
                  linewidth = 1.2)

ax.set_xlabel('Year', fontsize = 14)
ax.set_ylabel('Median Home Price', fontsize = 14)
ax.tick_params(size = 10)
sns.despine()
plt.show()

In [None]:
# NOTE(review): presumably the coefficient of determination for a correlation
# coefficient of about 0.83 read off one of the fits above — confirm, or
# compute it from `r` directly so the value tracks the data.
.83**2

In [None]:
# LogisticRegression and LinearRegression are provided by sklearn.linear_model;
# scikit-learn has no `sklearn.models` module, so the old import always failed.
from sklearn.linear_model import LogisticRegression, LinearRegression

In [None]:
def plot_charts(fips, data=None, age_group='25-44', min_records=4):
    """Plot median income against year for one FIPS code with a fitted trend line.

    Rows are selected where ``FIPS == fips`` and ``AgeGroup == age_group``.
    The scatter shows ``MedianIncome`` per ``Year``; the line is the
    ``scipy.stats.linregress`` fit of income on year, with slope, intercept
    and correlation coefficient shown in the legend.

    Parameters
    ----------
    fips
        Value compared against the ``FIPS`` column.
    data : DataFrame, optional
        Frame providing the FIPS, AgeGroup, Year, County and MedianIncome
        columns. Defaults to the notebook-level ``df``.
    age_group : str, optional
        Value compared against the ``AgeGroup`` column.
    min_records : int, optional
        Minimum number of matching rows required to draw the chart.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``. When fewer than
        ``min_records`` rows match, only the row count is printed.
    """
    frame = df if data is None else data
    selected = (frame.FIPS == fips) & (frame.AgeGroup == age_group)
    subset = frame[selected][['Year', 'County', 'MedianIncome']]

    # Too few points for a meaningful fit: report the count and skip the chart.
    # (The branches used to be swapped, so the chart was only attempted when
    # there was NOT enough data.)
    if subset.shape[0] < min_records:
        print(subset.shape[0])
        return

    plt.rcParams['xtick.labelsize'] = 13
    plt.rcParams['ytick.labelsize'] = 13

    x = subset['Year'].to_numpy()
    y = subset['MedianIncome'].to_numpy()

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Slope: {slope:.2f}\nIntercept: {intercept:.2f}\nCorrelation coefficient: r={r:.2f}'

    fig = plt.figure(figsize = (10,8))
    plt.title(f"Year vs. Median Income | {subset.County.tolist()[0]}", fontsize = 20)
    ax = sns.scatterplot(data = subset,
                         x = 'Year',
                         y = 'MedianIncome',
                         hue = 'MedianIncome',
                         palette = 'magma',
                         s = 100,
                         alpha = .8,
                         edgecolor = 'black',
                         linewidth = 1,
                         legend = False)

    # Fitted line evaluated at the observed years.
    ax = sns.lineplot(x = x,
                      y = intercept + slope * x,
                      label = line,
                      color = 'blue',
                      linewidth = 1.2)

    ax.set_xlabel('Year', fontsize = 14)
    ax.set_ylabel('Median Income', fontsize = 14)
    ax.tick_params(size = 10)
    sns.despine()
    plt.show()


In [None]:
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing
from sklearn.metrics import mean_squared_error

In [None]:
records = len(subset)

# Hold out the last 25% of rows (rounded down) as the test set.
# The split point is computed explicitly: with a hold-out size of 0 the old
# negative slices `[:-0]` / `[-0:]` produced an empty train set and a test set
# containing every row.
test_size = int(records * .25)
split_at = records - test_size

df_train = subset[:split_at]
df_test = subset[split_at:]

In [None]:
# Number of rows held out for evaluation.
print(len(df_test))

In [None]:
# Grid-search the seasonal period for two Holt-Winters variants (multiplicative
# trend with additive or multiplicative seasonality). Each model is fitted on
# df_train, forecast over the length of df_test and scored by RMSE.
# (seasonal argument, label stored in the results, colour used in the chart)
seasonal_variants = (
    ('add', 'additive', 'orange'),
    ('mul', 'multiplicative', 'red'),
)

# Forecast exactly as many steps as there are held-out rows instead of a
# hard-coded 67, so the score stays valid if the split changes.
horizon = len(df_test)

model_results = []
for period in range(2, 200):
    forecasts = {}
    for seasonal, label, color in seasonal_variants:
        try:
            model = ExponentialSmoothing(df_train['MedianHousePrice'],
                                         trend='mul',
                                         seasonal=seasonal,
                                         seasonal_periods=period)
            predictions = model.fit().forecast(steps=horizon)
            # Square root of the MSE; avoids the `squared=False` argument,
            # which newer scikit-learn releases no longer accept.
            rmse = mean_squared_error(df_test['MedianHousePrice'], predictions) ** 0.5
        except Exception as exc:
            # Some periods cannot be fitted on this series; report and move on
            # without hiding what went wrong.
            print('fail', period, label, exc)
            continue

        # The unfitted model object is stored so it can be refitted later.
        model_results.append([period, rmse, model, label])
        forecasts[label] = (predictions, color)

    if not forecasts:
        continue

    fig1 = plt.figure(figsize = (17,5))
    plt.plot(subset['MedianHousePrice'], color = 'black')
    if 'additive' in forecasts:
        plt.plot(forecasts['additive'][0], color = forecasts['additive'][1])
    plt.plot(df_test['MedianHousePrice'], color = 'green', linewidth = 3)
    if 'multiplicative' in forecasts:
        plt.plot(forecasts['multiplicative'][0], color = forecasts['multiplicative'][1])
    plt.show()

    print(period)

# Built once, after the loop, so it exists even if the last period failed.
# NOTE: the 'mse' column holds RMSE values; the name is kept because later
# cells sort on it.
result_df = pd.DataFrame(model_results, columns = ['num_periods', 'mse', 'model', 'model_type'])
  

In [None]:
# Inspect the results table built by the grid-search cell.
result_df

In [None]:
# Rank the candidate models by their score, lowest first.
result_df = result_df.sort_values(by='mse')

In [None]:
# Replace the row labels with a fresh 0..n-1 range.
result_df = result_df.reset_index(drop=True)

In [None]:
# Expose the current row labels as an 'index' column. A column left by an
# earlier run is dropped first, because reset_index() raises when 'index'
# already exists and the cell could otherwise not be re-executed.
result_df = result_df.drop(columns='index', errors='ignore').reset_index()

In [None]:
# Inspect the results after sorting by the 'mse' column and re-indexing.
result_df

In [None]:
# Refit every stored model in the current order of `result_df` and plot its
# forecast against the full series (black) and the held-out rows (green).
# The forecast length follows the test set instead of a hard-coded 67.
horizon = len(df_test)

# Iterate over the rows directly: no per-row filtering of the frame and no
# in-place reset_index on a filtered slice.
for model, model_type in zip(result_df['model'], result_df['model_type']):
    print(model_type)

    fitted_model = model.fit()
    predictions = fitted_model.forecast(steps=horizon)

    fig1 = plt.figure(figsize = (17,5))

    plt.plot(subset['MedianHousePrice'], color = 'black')
    plt.plot(predictions, color = 'orange')
    plt.plot(df_test['MedianHousePrice'], color = 'green', linewidth = 3)
    plt.show()

In [None]:
from sklearn.linear_model import ElasticNetCV