In [92]:
import numpy as np
import pandas as pd
import hvplot.pandas
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import os

In [93]:
#Citation: https://stackoverflow.com/questions/53731432/return-pandas-multiindex-as-list-of-tuples
def multi_index_data(path):
    """Load a CSV and index it by (Region_ID, Bedrooms, Date).

    The ``Date`` column is truncated to its first seven characters, so a
    value such as ``2013-11-30`` becomes ``2013-11``. The three key columns
    are moved into the index and the rows are sorted by that index.

    Parameters
    ----------
    path : str or path-like
        CSV file that contains at least the columns ``Region_ID``,
        ``Bedrooms`` and ``Date`` (``Date`` must hold strings).

    Returns
    -------
    pandas.DataFrame
        The remaining columns, indexed by a sorted three-level MultiIndex
        named ``Region_ID``, ``Bedrooms``, ``Date``.
    """
    # Dates are intentionally left as strings because they are sliced below.
    # The former ``parse_dates=True`` / ``infer_datetime_format=True`` arguments
    # had no effect here (no index column is read) and the latter is deprecated.
    df = pd.read_csv(path)
    df["Date"] = df["Date"].str[:7]

    # set_index moves the key columns into the index, replacing the manual
    # tuple construction followed by a column drop.
    return df.set_index(["Region_ID", "Bedrooms", "Date"]).sort_index()

In [94]:
# CSV inputs, given relative to this notebook's working directory.
paths = [
    '../../../../Data/median_rents_multi_indexed.csv',
    '../../../../Data/primary_LSTM_df.csv',
]


In [95]:
# Load every input CSV into its own multi-indexed frame, in the order given by `paths`.
df_list = [multi_index_data(csv_path) for csv_path in paths]

In [96]:
# Join the per-file frames column-wise on their shared index, keep only rows
# that are complete in every column, and store the result sorted by index.
# Previously the sorted frame was only displayed (the sort_index() result was
# never assigned), leaving TX_nbhoods_df itself in concat order.
TX_nbhoods_df = (
    pd.concat(df_list, axis=1, join='outer')
    .dropna()
    .sort_index()
)
TX_nbhoods_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Median_Rent,Price,Average_Market_Days,RegionName_Austin,RegionName_Beaumont,RegionName_Brownwood,RegionName_Converse,RegionName_Coppell,RegionName_Crosby,RegionName_Dallas,...,CountyName_Jefferson County,CountyName_McLennan County,CountyName_Montgomery County,CountyName_Rockwall County,CountyName_Smith County,CountyName_Tarrant County,CountyName_Tom Green County,CountyName_Travis County,CountyName_Val Verde County,CountyName_Williamson County
Region_ID,Bedrooms,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2,1,2013-11,1260.0,73054.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2013-12,1119.0,73666.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2014-01,1224.0,74422.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2014-02,1125.0,75160.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2014-03,1072.0,75576.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,3,2019-08,1850.0,243203.0,59.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
427,3,2019-09,1800.0,244080.0,87.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
427,3,2019-10,1750.0,244699.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
427,3,2019-11,1761.0,245233.0,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
def window_data(df, window, feature_col_number, target_col_number):
    """Turn a frame into rolling-window samples for sequence models.

    Sample ``k`` pairs the ``window`` consecutive feature values at row
    positions ``k .. k + window - 1`` with the target value at row position
    ``k + window``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, already in the desired row order.
    window : int
        Number of consecutive rows in each feature window.
    feature_col_number : int
        Positional index of the feature column.
    target_col_number : int
        Positional index of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` holds one row per sample and ``y`` has shape
        ``(n_samples, 1)``. When ``df`` has no more than ``window`` rows,
        there are no samples and ``X`` is an empty array.
    """
    # The last valid start position is len(df) - window - 1, so there are
    # len(df) - window samples. The earlier range(len(df) - window - 1)
    # silently dropped the final window/target pair.
    n_samples = max(len(df) - window, 0)

    X = [
        df.iloc[start:start + window, feature_col_number]
        for start in range(n_samples)
    ]
    y = [
        df.iloc[start + window, target_col_number]
        for start in range(n_samples)
    ]
    return np.array(X), np.array(y).reshape(-1, 1)

In [98]:
def _make_lstm_model(window_size, number_units=5, dropout_fraction=0.2):
    """Build and compile the stacked three-layer LSTM regressor (MSE loss, Adam)."""
    model = Sequential()

    model.add(LSTM(
        units=number_units,
        return_sequences=True,
        input_shape=(window_size, 1))
        )
    model.add(Dropout(dropout_fraction))

    model.add(LSTM(units=number_units, return_sequences=True))
    model.add(Dropout(dropout_fraction))

    model.add(LSTM(units=number_units))
    model.add(Dropout(dropout_fraction))

    model.add(Dense(1))

    model.compile(optimizer="adam", loss="mean_squared_error")
    return model


def build_by_region_beds(tuple_,window_size):
    """Train and evaluate an LSTM for one (region, bedrooms) series.

    Reads the rows for ``tuple_[:2]`` from the module-level ``TX_nbhoods_df``,
    builds rolling windows of the ``Price`` column to predict the next
    ``Median_Rent`` value, trains on the first 70% of the samples and
    evaluates on the remainder.

    Parameters
    ----------
    tuple_ : tuple
        Index key whose first two entries are (Region_ID, Bedrooms); any
        further entries are ignored.
    window_size : int
        Number of consecutive rows in each input window.

    Returns
    -------
    list
        ``[final_df, loss, X_old]`` where ``final_df`` holds the ``Real`` and
        ``Predicted`` test-set values indexed by the date of each target row,
        ``loss`` is the value returned by ``model.evaluate`` on the scaled
        test set, and ``X_old`` is the full series with ``Median_Rent``
        renamed to ``Real`` (alongside ``Price``).
    """
    region_bed = tuple_[:2]
    region_bed_df = TX_nbhoods_df.loc[region_bed]
    # Work on a sorted, independent frame so nothing below touches TX_nbhoods_df.
    prices_df = region_bed_df.loc[:,["Median_Rent","Price"]].sort_index()

    feature_column = 1  # positional index of "Price"
    target_column = 0   # positional index of "Median_Rent"
    X, y = window_data(prices_df, window_size, feature_column, target_column)

    # y[k] is the target at row position window_size + k, so these are the
    # dates the targets really belong to. Taking the *last* len(y_test) dates
    # instead mislabels every row whenever window_data stops short of the
    # final row of the series.
    target_dates = list(prices_df.index[window_size:window_size + len(y)])

    split = int(0.7 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # Separate scalers for features and targets, so the inverse transform of
    # the predictions cannot accidentally use the feature scaling.
    # NOTE(review): both scalers are still fitted on the full series (train and
    # test rows), as before; fitting on the training slice only would avoid
    # leaking the test range into training. Left unchanged to keep results
    # comparable.
    x_scaler = MinMaxScaler()
    x_scaler.fit(X)
    X_train = x_scaler.transform(X_train)
    X_test = x_scaler.transform(X_test)

    y_scaler = MinMaxScaler()
    y_scaler.fit(y)
    y_train = y_scaler.transform(y_train)
    y_test = y_scaler.transform(y_test)

    # Keras LSTM layers expect (samples, time steps, features).
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model = _make_lstm_model(X_train.shape[1])

    model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=1, verbose=1)

    loss = model.evaluate(X_test, y_test)
    predicted = model.predict(X_test)

    # Map scaled values back to the original rent units.
    predicted_prices = y_scaler.inverse_transform(predicted)
    real_prices = y_scaler.inverse_transform(y_test.reshape(-1, 1))

    final_df = pd.DataFrame({
        "Real": real_prices.ravel(),
        "Predicted": predicted_prices.ravel()},
        index=target_dates[split:])

    # Full series for plotting alongside the predictions; rename without
    # mutating prices_df in place.
    X_old = prices_df.rename(columns={"Median_Rent":"Real"})

    return [final_df, loss, X_old]


In [99]:
# Train one model per (region, bedrooms) series and window size, and write
# Results.csv, Loss.csv, Plot.png and Plot2.png into a per-combination folder.
# Combinations whose four output files already exist are skipped, so an
# interrupted run can be resumed.
RESULT_FILES = ("Results.csv", "Loss.csv", "Plot.png", "Plot2.png")

# The frame's index also carries a date level; iterate over each distinct
# (region, bedrooms) pair once instead of once per date. The old loop also
# skipped training right after creating a folder and only trained when the
# same pair was visited again through another date.
region_bed_pairs = sorted({index_tuple[:2] for index_tuple in TX_nbhoods_df.index})

for region_bed in region_bed_pairs:
    for window_size in [1,3,6,12]:
        path = f'../Results/Region/{region_bed[0]}/Bedrooms/{region_bed[1]}/Window-Size/{str(window_size)}/'
        os.makedirs(path, exist_ok=True)

        file_list = os.listdir(path)

        if all(name in file_list for name in RESULT_FILES):
            print(f'Already have for Region {region_bed[0]}, {region_bed[1]} Bedroom with window size {window_size}: {file_list}')
            continue

        final_df, loss_val, old_prices = build_by_region_beds(region_bed, window_size)
        # final_df has the columns "Real" and "Predicted"; only the latter is renamed.
        output = final_df.rename(columns={"Predicted":"Predicted Rent"})

        index = pd.MultiIndex.from_tuples([region_bed],names=["Region","Bedrooms"])
        # TODO: some region/bedroom series yield NaN predictions (possibly the
        # window is too large for short series - unconfirmed). If that is not
        # resolved, drop those series from the data before the AWS run.
        loss = pd.DataFrame({"Loss": [loss_val]}, index=index)

        # NOTE(review): the figures created by .plot() are not closed here, so
        # they presumably accumulate in memory over a long run - confirm and
        # close them if so.
        Predicted_vs_Real = output.plot()
        Train_vs_Test = pd.concat([old_prices["Real"], output["Predicted Rent"]],axis=1,join='outer')
        Train_vs_Test_plt = Train_vs_Test.plot()

        output.to_csv(f'{path}Results.csv')
        loss.to_csv(f'{path}Loss.csv')

        # Citation: https://stackoverflow.com/questions/18992086/save-a-pandas-series-histogram-plot-to-file
        Predicted_vs_Real.figure.savefig(f'{path}Plot.png')
        Train_vs_Test_plt.figure.savefig(f'{path}Plot2.png')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

KeyboardInterrupt: 

In [16]:
# ## OLD FOR-LOOP IN CASE THE NEW ONE BREAKS:
# for tuple_ in set(TX_nbhoods_df.index):
#     for i in [1,3,6,12]:
#         model = build_by_region_beds(tuple_,i)
#         output = model[0]
#         loss_val = model[1]
#         old_prices = model[2]
#         index = pd.MultiIndex.from_tuples([tuple_[:2]],names=["Region","Bedrooms"])
#         #Note to self: If you don't end up figuring out why its predicting Nan's for some of them, remember to drop those region_beds from the data before AWS run (window too large for some datasets?)
#         loss = pd.DataFrame(columns={"Loss":loss_val},index=index)
#         loss["Loss"] = loss_val
#         Predicted_vs_Real = output.plot()
#         Train_vs_Test = pd.concat([old_prices, output["Predicted"]],axis=1,join='outer')
#         print(Train_vs_Test)
#         Train_vs_Test_plt = Train_vs_Test.plot()

#         path = f'../Results/Region/{tuple_[0]}/Bedrooms/{tuple_[1]}/Window-Size/{str(i)}/'
        
        
#         if os.path.isdir(path)==True:
#             pass
            
#         else:
#             os.makedirs(path)
#         output.to_csv(f'../Results/Region/{tuple_[0]}/Bedrooms/{tuple_[1]}/Window-Size/{str(i)}/Results.csv')
#         loss.to_csv(f'../Results/Region/{tuple_[0]}/Bedrooms/{tuple_[1]}/Window-Size/{str(i)}/Loss.csv') 

#         # Citation: https://stackoverflow.com/questions/18992086/save-a-pandas-series-histogram-plot-to-file
#         Predicted_vs_Real.figure.savefig(f'../Results/Region/{tuple_[0]}/Bedrooms/{tuple_[1]}/Window-Size/{str(i)}/Plot.png')
#         Train_vs_Test_plt.figure.savefig(f'../Results/Region/{tuple_[0]}/Bedrooms/{tuple_[1]}/Window-Size/{str(i)}/Plot2.png')