# MVP Notebook Daniel

In [1]:
import preprocessing
import wrangle
import model

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.simplefilter(action='ignore')

In [2]:
df = preprocessing.get_model_df()

In [3]:
df = preprocessing.add_new_features(df)

In [4]:
def filter_top_cities(df):

    df["city_state"] = df["city"] + "_" + df["state"]

    city_mask = df.groupby("city_state").year.count()

    city_mask = city_mask[city_mask == 15]

    # apply city mask to shrink the df
    def in_city_mask(x):
        return x in city_mask
    df = df[df.city_state.apply(in_city_mask)]

    df = df.sort_values(["city", "state", "year"])
    
    return df

In [5]:
df = filter_top_cities(df)

## <span style='background :yellow' > Adding the labeling </span>

In [6]:
# # Using the Evolution Index as a label:
# # For values that are higher than 100% in evolution index.

# df["ei_label"] = np.where(df.ei > 1, 1, 0)

In [7]:
# using future data to create the labels

def labeling_future_data(df):
    """this function takes in a data frame and returns a boolean column that identifies
    if a city_state_year is a market that should be entered"""
    df["label_quantity_of_mortgages_pop_2y"] = (df.sort_values(["year"])
                                  .groupby(["city", "state"])[["quantity_of_mortgages_pop"]]
                                  .pct_change(2)
                                  .shift(-2))
    df["label_total_mortgage_volume_pop_2y"] = (df.sort_values(["year"])
                                  .groupby(["city", "state"])[["total_mortgage_volume_pop"]]
                                  .pct_change(2)
                                  .shift(-2))
    Q3 = df.label_quantity_of_mortgages_pop_2y.quantile(.75)
    Q1 = df.label_quantity_of_mortgages_pop_2y.quantile(.25)
    upper_fence_quantity = Q3 + ((Q3-Q1)*1.5)
    upper_fence_quantity
    Q3 = df.label_total_mortgage_volume_pop_2y.quantile(.75)
    Q1 = df.label_total_mortgage_volume_pop_2y.quantile(.25)
    upper_fence_volume = Q3 + ((Q3-Q1)*1.5)
    upper_fence_volume
    df['should_enter'] = (df.label_total_mortgage_volume_pop_2y > upper_fence_volume) | (df.label_quantity_of_mortgages_pop_2y > upper_fence_quantity)
    return df

In [8]:
df = labeling_future_data(df)

In [9]:
df.should_enter.value_counts()

False    338
True      52
Name: should_enter, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390 entries, 1513 to 156
Data columns (total 23 columns):
city                                  390 non-null object
state                                 390 non-null object
year                                  390 non-null int64
quantity_of_mortgages_pop             390 non-null int64
total_mortgage_volume_pop             390 non-null int64
average_mortgage_volume_pop           390 non-null float64
median_mortgage_amount_pop            390 non-null float64
quantity_of_mortgages_nc              180 non-null float64
total_mortgage_volume_nc              180 non-null float64
average_mortgage_volume_nc            180 non-null float64
median_mortgage_amount_nc             180 non-null float64
label                                 390 non-null bool
city_state_vol_delta_pop              364 non-null float64
city_state_vol_delta_nc               310 non-null float64
city_state_qty_delta_pop              364 non-null float64
city_state_qty_del

In [11]:
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

In [12]:
def train_test_data(df):
    train, test = train_test_split(df, train_size=.75, random_state=123)
    return train, test

#__Main Pre-modeling function__#
def prep_data_for_modeling(df, features_for_modeling, label_feature):

    # To avoid Nan's, I have removed all data from 2006 (because all the var's would be nan)
    df_model = df[df.year > 2007]

    # Create an observation id to reduce the chance of mistake's
    df_model["observation_id"] = df_model.city + "_" + df_model.state + "_"  + df_model.year.astype(str)

    # select that features that we want to model, and use our observation id as the row id
    features_for_modeling += ["observation_id"]
    features_for_modeling += [label_feature]
    data = df_model[features_for_modeling].set_index("observation_id")

    train, test = train_test_data(data)
    train = train.sort_values("observation_id")
    test = test.sort_values("observation_id")

    X_train = train.drop(columns=label_feature)
    y_train = train[label_feature]
    X_test = test.drop(columns=label_feature)
    y_test = test[label_feature]

    return X_train, y_train, X_test, y_test

In [20]:
features_for_modeling = ["quantity_of_mortgages_pop", "city_state_qty_delta_pop", "ei", "median_mortgage_amount_pop"]
label_feature = "should_enter"
X_train, y_train, X_test, y_test = prep_data_for_modeling(df, features_for_modeling, label_feature)

In [21]:
# Helper function used to updated the scaled arrays and transform them into usable dataframes
def return_values(scaler, train, test):
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled

# Linear scaler
def min_max_scaler(train, test):
    scaler = MinMaxScaler().fit(train)
    scaler, train_scaled, test_scaled = return_values(scaler, train , test)
    return scaler, train_scaled, test_scaled

In [22]:
# Scaler is ready - in case we need it

scaler, train_scaled, test_scaled = min_max_scaler(X_train, X_test)

In [23]:
assert(train_scaled.shape[1] == test_scaled.shape[1])

In [24]:
train_scaled.head()

Unnamed: 0,quantity_of_mortgages_pop,city_state_qty_delta_pop,ei,median_mortgage_amount_pop
Atlanta_GA_2008,0.021739,0.010172,0.006617,0.033455
Atlanta_GA_2009,0.173913,0.341549,0.115067,0.046596
Atlanta_GA_2010,0.217391,0.087637,0.018195,0.056728
Atlanta_GA_2011,0.195652,0.06338,0.020376,0.045856
Atlanta_GA_2012,0.23913,0.085915,0.02143,0.03196


In [25]:
train_scaled.isnull().sum()

quantity_of_mortgages_pop     0
city_state_qty_delta_pop      0
ei                            0
median_mortgage_amount_pop    0
dtype: int64

In [26]:
grid, df_result, best_model = model.run_decision_tree(train_scaled, y_train)

{'criterion': 'gini', 'max_depth': 4, 'score': 0.3333333333333333}


In [27]:
grid, df_result, best_model = model.run_random_forest(train_scaled, y_train)

{'max_depth': 9, 'min_samples_leaf': 1, 'score': 0.075}


In [28]:
grid, df_result, best_model = model.run_knn(train_scaled, y_train)

{'n_neighbors': 1, 'weights': 'uniform', 'score': 0.2}


----

# Evaluation

In [29]:
grid, df_result, best_model = model.run_decision_tree(train_scaled, y_train)

{'criterion': 'gini', 'max_depth': 7, 'score': 0.3333333333333333}


In [30]:
y_pred = best_model.predict(train_scaled)

In [31]:
labels = sorted(y_train.unique())
matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index = labels, columns = labels)

In [32]:
from sklearn.metrics import recall_score

In [33]:
recall_score(y_train, y_pred)

0.717948717948718

In [34]:
print(matrix)

       False  True 
False    214      0
True      11     28


In [35]:
best_model.score(test_scaled, y_test)

0.8235294117647058

In [36]:
y_pred = best_model.predict(test_scaled)

In [37]:
labels = sorted(y_train.unique())
matrix = pd.DataFrame(confusion_matrix(y_test, y_pred), index = labels, columns = labels)

In [38]:
recall_score(y_test, y_pred)

0.3076923076923077

In [39]:
print(matrix)

       False  True 
False     66      6
True       9      4


----

# Prediction

In [None]:
model_df = preprocessing.get_model_df()

df["city_state"] = df["city"] + "_" + df["state"]

city_mask = df.groupby("city_state").year.count()

city_mask = city_mask[city_mask == 15]

# apply city mask to shrink the df
def in_city_mask(x):
    return x in city_mask
df = df[df.city_state.apply(in_city_mask)]

df = preprocessing.add_new_features(df)

df = df.sort_values(["city", "state", "year"])

In [None]:
df.head()

In [None]:
features_for_predicting = ["city_state_qty_delta_pop","total_mortgage_volume_pop", "ei", "median_mortgage_amount_pop"]

In [None]:
predictions = df[(df.year == 2020) | (df.year == 2019)].groupby("city_state")[features_for_predicting].mean()
predictions

In [None]:
# Helper function used to updated the scaled arrays and transform them into usable dataframes
def return_values_prediction(scaler, df):
    train_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns.values).set_index([df.index.values])
    return scaler, train_scaled

# Linear scaler
def min_max_scaler_prediction(df):
    scaler = MinMaxScaler().fit(df)
    scaler, df_scaled = return_values_prediction(scaler, df)
    return scaler, df_scaled

In [None]:
scaler, predictions_scaled = min_max_scaler_prediction(predictions)

In [None]:
predictions["label"] = best_model.predict(predictions_scaled)

In [None]:
predictions

In [None]:
city = predictions.reset_index().city_state.str.split("_", n=1, expand=True)[0]

state = predictions.reset_index().city_state.str.split("_", n=1, expand=True)[1]

In [None]:
predictions = predictions.reset_index()

In [None]:
predictions["city"] = city

predictions["state"] = state

In [None]:
predictions

In [None]:
plt.figure(figsize=(15,5))
ax = sns.barplot(data=predictions, x="city", y="ei", hue="label")
plt.title("What markets will look like in 2021, based on evolution index")
plt.xticks(rotation=45, ha="right")
plt.xlabel("City")
plt.ylabel("Evolution Index (%)")
new_labels = ['Markets to not enter', 'Markets to enter']
h, l = ax.get_legend_handles_labels()
ax.legend(h, new_labels)
plt.show()