In [1]:
from wrangle import split_data, wrangle_zillow, scale_data
import pandas as pd
import sklearn.preprocessing
import matplotlib.pyplot as plt

### Wrangle Zillow Data

In [2]:
# Build the working DataFrame with the project's wrangle_zillow helper
# (implementation lives in wrangle.py and is not shown in this notebook).
df = wrangle_zillow()

Using cached data


In [3]:
# Partition df into train / validate / test with the project's split_data helper.
train, validate, test = split_data(df)

Data split as follows: Train 56.00%, Validate 24.00%, Test 20.00%


### Apply scalers from lesson and visualize: Min-Max, Standard, and Robust

#### Min Max

In [None]:
# Features that each scaler below is fit on (one column at a time) and plotted.
columns = ['bedroom_cnt', 'bathroom_cnt', 'square_feet', 'tax_value','tax_amount',
       'year_built']

In [None]:
# Plot every feature before and after min-max scaling. The scaler is fit on
# the training split only.
for col in columns:
    scaler = sklearn.preprocessing.MinMaxScaler()
    train_scaled = scaler.fit_transform(train[[col]])

    # Min-max scaling is a linear rescale, so the histogram shape must not
    # change. Both panels share one bin count so that is visible; with
    # different bin counts the two panels look like different distributions.
    n_bins = 15
    fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(13, 6))
    ax_original.hist(train[col], bins=n_bins, ec='black')
    ax_original.set(title=f'Original {col}', xlabel=col, ylabel='count')
    ax_scaled.hist(train_scaled, bins=n_bins, ec='black')
    ax_scaled.set(title=f'Scaled {col}: MinMax', xlabel=f'{col} (scaled)', ylabel='count')
    plt.show()

#### Standard Scaler

In [None]:
# Plot every feature before and after standardization. The scaler is fit on
# the training split only.
for col in columns:
    scaler = sklearn.preprocessing.StandardScaler()
    train_scaled = scaler.fit_transform(train[[col]])

    # Standardization only shifts and rescales the values, so the histogram
    # shape is unchanged. One shared bin count keeps the two panels
    # directly comparable.
    n_bins = 25
    fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(13, 6))
    ax_original.hist(train[col], bins=n_bins, ec='black')
    ax_original.set(title=f'Original {col}', xlabel=col, ylabel='count')
    ax_scaled.hist(train_scaled, bins=n_bins, ec='black')
    ax_scaled.set(title=f'Scaled {col}: Standard Scaler', xlabel=f'{col} (scaled)', ylabel='count')
    plt.show()

#### Robust

In [None]:
# Plot every feature before and after robust scaling. The scaler is fit on
# the training split only.
for col in columns:
    scaler = sklearn.preprocessing.RobustScaler()
    train_scaled = scaler.fit_transform(train[[col]])

    # Robust scaling is also a linear shift-and-rescale, so the histogram
    # shape is unchanged. One shared bin count keeps the two panels
    # directly comparable.
    n_bins = 25
    fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(13, 6))
    ax_original.hist(train[col], bins=n_bins, ec='black')
    ax_original.set(title=f'Original {col}', xlabel=col, ylabel='count')
    ax_scaled.hist(train_scaled, bins=n_bins, ec='black')
    ax_scaled.set(title=f'Scaled {col}: Robust Scaler', xlabel=f'{col} (scaled)', ylabel='count')
    plt.show()

### 2. Inverse Transform

In [None]:
# Round-trip a single feature through MinMaxScaler and back, then plot the
# original, scaled, and inverse-transformed values side by side.
col = 'tax_value'
scaler = sklearn.preprocessing.MinMaxScaler()
train_scaled = scaler.fit_transform(train[[col]])
inverse_train_scaled = scaler.inverse_transform(train_scaled)

# One (values, bin count, title) entry per panel, drawn left to right.
panels = [
    (train[col], 15, f'Original {col}'),
    (train_scaled, 100, f'Scaled {col}: MinMax'),
    (inverse_train_scaled, 15, f'Inverse Transform of scaled data for \n{col}: MinMax'),
]
fig, axes = plt.subplots(1, 3, figsize=(13, 6))
for ax, (values, n_bins, title) in zip(axes, panels):
    ax.hist(values, bins=n_bins, ec='black')
    ax.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Put the original column next to its inverse-transformed round trip.
# .assign() returns a new frame, so no columns are written onto a slice of
# `train` (which triggers SettingWithCopyWarning). ravel() flattens the
# (n, 1) array returned by the scaler into a single column.
both = train[[col]].assign(inverse_train_scaled=inverse_train_scaled.ravel())

# Signed round-trip error per row. Indexing with `col` rather than a
# hard-coded column name keeps this cell correct for any feature.
both["difference"] = both["inverse_train_scaled"] - both[col]

In [None]:
# Mean of the `difference` column (inverse-transformed value minus original value).
both.difference.mean()

### Using tax value as an example: after applying the inverse transform to the scaled data, the result is extremely close to the original, with only a very slight difference (average difference of -1.348985242062614e-13), which is consistent with floating-point rounding.

### For bedroom count

In [None]:
# Round-trip bedroom count through MinMaxScaler and back, then plot the
# original, scaled, and inverse-transformed values side by side.
col = 'bedroom_cnt'
scaler = sklearn.preprocessing.MinMaxScaler()
train_scaled = scaler.fit_transform(train[[col]])
inverse_train_scaled = scaler.inverse_transform(train_scaled)

# The original and inverse-transformed panels use the same bin count so they
# can be compared directly, as in the tax_value cell.
round_trip_bins = 15
fig, (ax_original, ax_scaled, ax_inverse) = plt.subplots(1, 3, figsize=(13, 6))
ax_original.hist(train[col], bins=round_trip_bins, ec='black')
ax_original.set_title(f'Original {col}')
ax_scaled.hist(train_scaled, bins=100, ec='black')
ax_scaled.set_title(f'Scaled {col}: MinMax')
ax_inverse.hist(inverse_train_scaled, bins=round_trip_bins, ec='black')
ax_inverse.set_title(f'Inverse Transform {col}: MinMax')
# Keeps the three panels and their titles from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Put the original column next to its inverse-transformed round trip.
# .assign() returns a new frame, so no columns are written onto a slice of
# `train` (which triggers SettingWithCopyWarning). ravel() flattens the
# (n, 1) array returned by the scaler into a single column.
both = train[[col]].assign(inverse_train_scaled=inverse_train_scaled.ravel())

# Signed round-trip error per row. Indexing with `col` rather than a
# hard-coded column name keeps this cell correct for any feature.
both["difference"] = both["inverse_train_scaled"] - both[col]

In [None]:
# Largest absolute round-trip error. A result of 0 means every row matches
# exactly; the minimum alone cannot show that, because positive differences
# would go unnoticed.
both.difference.abs().max()

### For bedroom count the values are exactly the same, possibly because the column holds small integers, which are less prone to floating-point rounding error.

### 3. Quantile Transformer using normal for output_distribution. Apply scaler to data, visualize

In [None]:
# Plot every feature before and after a quantile transform to a normal
# output distribution. The transformer is fit on the training split only.
for col in columns:
    # Per the scikit-learn docs, QuantileTransformer may subsample large
    # inputs when estimating quantiles. A fixed random_state makes that
    # subsampling, and so the plots, reproducible across re-runs.
    scaler = sklearn.preprocessing.QuantileTransformer(
        output_distribution='normal', random_state=42
    )
    train_scaled = scaler.fit_transform(train[[col]])

    fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(13, 6))
    ax_original.hist(train[col], bins=25, ec='black')
    ax_original.set_title(f'Original {col}')
    ax_scaled.hist(train_scaled, ec='black')
    ax_scaled.set_title(f'Scaled {col}: Quantile Transformer Scaler (Normal)')
    plt.show()

### -> The quantile transform certainly makes the data look normally distributed

In [None]:
# Plot every feature before and after a quantile transform with the default
# output distribution. The transformer is fit on the training split only.
for col in columns:
    # Per the scikit-learn docs, QuantileTransformer may subsample large
    # inputs when estimating quantiles. A fixed random_state makes that
    # subsampling, and so the plots, reproducible across re-runs.
    scaler = sklearn.preprocessing.QuantileTransformer(random_state=42)
    train_scaled = scaler.fit_transform(train[[col]])

    fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(13, 6))
    ax_original.hist(train[col], bins=25, ec='black')
    ax_original.set_title(f'Original {col}')
    ax_scaled.hist(train_scaled, ec='black')
    ax_scaled.set_title(f'Scaled {col}: Quantile Transformer (uniform)')
    plt.show()

## With uniform quantile scaling, the outliers are no longer distinct; they are basically invisible

## Write function to scale data for zillow data, put in prepare.py

In [None]:
def scale_data(train, validate, test, features_to_scale):
    """Min-max scale selected features of the three Zillow splits.

    The scaler is fit on ``train`` only and then applied to all three splits.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits; each must contain every column in
        ``features_to_scale``.
    features_to_scale : list of str
        Names of the columns to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scaled, validate_scaled, test_scaled)``: each input frame
        with one extra ``<feature>_scaled`` column per scaled feature
        appended on the right. The original columns are left untouched.
    """
    # Learn the per-feature min and max from the training split only.
    minmax = sklearn.preprocessing.MinMaxScaler().fit(train[features_to_scale])

    # Names for the appended columns, in the same order as the features.
    scaled_names = [feature + "_scaled" for feature in features_to_scale]

    def _append_scaled(split):
        # Scale with the train-fitted scaler and reuse the split's index so
        # the new columns line up row-for-row when concatenated.
        scaled_values = pd.DataFrame(
            minmax.transform(split[features_to_scale]),
            index=split.index,
            columns=scaled_names,
        )
        return pd.concat([split, scaled_values], axis=1)

    return _append_scaled(train), _append_scaled(validate), _append_scaled(test)

In [5]:
# Columns handed to scale_data; the displayed result has a matching
# "<name>_scaled" column for each of them.
features_to_scale = ['bedroom_cnt', 'bathroom_cnt', 'square_feet', 'tax_value',
       'year_built', 'tax_amount']

In [6]:
# NOTE(review): scale_data is both imported from wrangle and defined in this
# notebook; which one runs here depends on whether the defining cell was
# executed in this kernel session. Confirm before relying on the output.
train_scaled, validate_scaled, test_scaled = scale_data(train, validate, test, features_to_scale)

In [7]:
# Display the training split with the *_scaled columns appended.
train_scaled

Unnamed: 0,parcel_id,bedroom_cnt,bathroom_cnt,square_feet,tax_value,year_built,tax_amount,fips,bedroom_cnt_scaled,bathroom_cnt_scaled,square_feet_scaled,tax_value_scaled,year_built_scaled,tax_amount_scaled
1567873,12832154,3,1.0,1248.0,191578.0,1950,2712.21,6037.0,0.200000,0.03125,0.001309,0.002124,0.693023,0.002510
1218138,10914505,2,2.0,1326.0,1345206.0,1950,16539.04,6037.0,0.133333,0.06250,0.001391,0.014915,0.693023,0.015335
339661,12639487,2,1.0,1053.0,356648.0,1953,4575.16,6037.0,0.133333,0.03125,0.001104,0.003954,0.706977,0.004238
1017133,12268872,3,2.0,1256.0,175069.0,1946,2635.51,6037.0,0.200000,0.06250,0.001317,0.001941,0.674419,0.002439
40250,13924017,3,2.0,1640.0,543000.0,1957,6344.96,6059.0,0.200000,0.06250,0.001721,0.006020,0.725581,0.005879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941799,12290590,2,1.0,696.0,27699.0,1954,918.71,6037.0,0.133333,0.03125,0.000730,0.000307,0.711628,0.000846
631585,12934374,3,3.0,2081.0,619590.0,1987,7319.49,6037.0,0.200000,0.09375,0.002184,0.006870,0.865116,0.006783
883917,10769349,3,2.0,1742.0,196237.0,1957,2512.91,6037.0,0.200000,0.06250,0.001828,0.002176,0.725581,0.002325
1253096,12035061,4,3.0,2066.0,617344.0,1924,7650.55,6037.0,0.266667,0.09375,0.002168,0.006845,0.572093,0.007090
