In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from math import sqrt
from scipy import stats
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

import env
import zillow_wrangle
import split_scale
import evaluate
import explore
import feature

# Data Science Pipeline

### Acquire
*Goal: Having a clean dataframe ready to prepare*

The ad hoc part includes summarizing your data as you read it in and begin to explore, look at the first few rows, data types, summary stats, column names, shape of the data frame, etc.

Brainstorming ideas, hypotheses, related to how variables might impact or relate to each other, both within independent variables and between the independent variables and dependent variable, and also related to any ideas for new features you may have while first looking at the existing variables and challenge ahead of you.

Have a detailed README.md file for anyone who wants to check out your project. In this file should be a description of what the project is, and any instructions necessary for someone else to clone your project and run the code on their own laptop.

During project planning, think about what things in your project are nice to have, versus which things are need to have. For example, you might document that you will only worry about trying to scale your features after creating and evaluating a baseline model.

#### - I'm acquiring my data using wrangle_zillow

In [None]:
df = zillow_wrangle.wrangle_zillow()

In [None]:
df.info()

### Hypothesis:
- $H_0$: Number of bathrooms and bedrooms in home, as well as square footage will not be leading factors in predicting property value
- $H_a$: Number of bathrooms, bedrooms and square footage will have a strong relationship with property value and be leading factors in predicting it.
- $H_a$: Recursive feature elimination from scikit-learn will identify a different combination of predictive features, possibly including those mentioned above but not excluding other features in the dataset.

### Prep
*Goal: leave this section with a dataset that is split into train and test ready to be analyzed. Data types are appropriate, missing values have been addressed, as have any data integrity issues.*

##### After a bit of exploring, I noticed that every FIPS code was for a county in California, so I added that as the state

In [None]:
# Per the exploration note above, every fips code in the data belongs to a
# California county, so the state is stored as a constant column.
state = 'California'
df['State'] = state
# Preview the first rows to confirm the new column is present.
df.head()

#### There are only 3 unique FIPS numbers, so I added these to a separate column called 'county'.

In [None]:
df.fips.unique()

#### small for loop to convert fips to counties

In [None]:
# Translate each FIPS code into its county name with a single lookup.
# The original row-by-row loop appended nothing for an unrecognised code,
# which left the list shorter than the frame and made the column assignment
# raise. Here an unknown (or missing) code simply yields a missing value.
# NOTE: the 'Los Angelas' spelling is kept because later cells filter on it.
fips_to_county = {
    6037: 'Los Angelas',
    6059: 'Orange',
    6111: 'Ventura',
}

# dict.get matches 6037 and 6037.0 alike, so this works whether the fips
# column is stored as integers or floats.
df['county'] = df['fips'].map(fips_to_county.get)

In [None]:
df.groupby('county').count()

#### Creating a new column called tax_rate which is the amount taxed divided by the home value to give us the percentage taxed.

In [None]:
# Effective tax rate: tax billed as a fraction of the assessed value,
# rounded to three decimal places.
raw_rate = df['taxamount'] / df['taxvaluedollarcnt']
df['tax_rate'] = raw_rate.round(3)

### plotting the tax distribution

In [None]:
df.head()

In [None]:
county_df = df[['county', 'tax_rate']]
county_df.head()

In [None]:
# One tax-rate series per county.
la = county_df.loc[county_df['county'] == 'Los Angelas', 'tax_rate']
orange = county_df.loc[county_df['county'] == 'Orange', 'tax_rate']
ventura = county_df.loc[county_df['county'] == 'Ventura', 'tax_rate']

plt.figure(figsize=(16, 8))
plt.suptitle('Distribution of Tax Rates by County')

# Draw the three panels side by side with identical axis limits so the
# counties can be compared directly.
panels = [('Los Angelas', la), ('Orange', orange), ('Ventura', ventura)]
for position, (label, rates) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.distplot(rates)
    plt.title(label)
    plt.xlim(0, 0.5)
    plt.ylim(0, 350)

plt.show()

## There are some extreme outliers in the Los Angeles and Ventura counties. Because they heavily influence the shape of the distribution, I will re-plot the distributions with the x-axis limited to 0–0.1 so that those outliers fall outside the view.

In [None]:
# One tax-rate series per county.
la = county_df.loc[county_df['county'] == 'Los Angelas', 'tax_rate']
orange = county_df.loc[county_df['county'] == 'Orange', 'tax_rate']
ventura = county_df.loc[county_df['county'] == 'Ventura', 'tax_rate']

plt.figure(figsize=(16, 8))
plt.suptitle('Distribution of Tax Rates by County')

# Same three panels as before, but with the x-axis narrowed to 0-0.1 so the
# bulk of each distribution is visible.
panels = [('Los Angelas', la), ('Orange', orange), ('Ventura', ventura)]
for position, (label, rates) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.distplot(rates)
    plt.title(label)
    plt.xlim(0, 0.1)
    plt.ylim(0, 350)

plt.show()

The ad hoc part includes plotting the distributions of individual variables and using those plots to identify outliers and if those should be handled (and if so, how), identify unit scales to identify how to best scale the numeric data, as well as finding erroneous or invalid data that may exist in your dataframe.

### visualizing Data for pre-exploration

In [None]:
df.info()

### A lot of these parameters aren't viable features for our model, so I whittled it down to 4 possible features and our target variable

In [None]:
bed_bath_and_beyond = df[['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet', 'roomcnt','taxvaluedollarcnt']]

In [None]:
bed_bath_and_beyond.info()

In [None]:
explore.plot_variable_pairs(bed_bath_and_beyond)

Add a data dictionary in your notebook that defines all fields used in your model and your analysis, and answers the question: why did you use the fields you used, e.g. why did you use bedroom_field1 over bedroom_field2, not why did you use number of bedrooms!

#### splitting our data

In [None]:
train, test = split_scale.split_my_data(df)

### Data Exploration
Goal: Address each of the questions you posed in your planning and brainstorming and any others you have come up with along the way through visual or statistical analysis.

When you have completed this step, you will have the findings from your analysis that will be used in your final report, answers to specific questions your customer has asked, and information to move forward toward building a model.

Run at least 1 t-test and 1 correlation test (but as many as you need!)
Visualize all combinations of variables in some way(s).
What independent variables are correlated with the dependent?
Which independent variables are correlated with other independent variables?
Make sure to summarize your takeaways and conclusions. That is, the data science zillow team doesn't want to see just a bunch of dataframes/numbers/charts without any explanation, you should explain in the notebook what these dataframes/numbers/charts mean.

### using ***pearson's r*** to test correlation

#### testing for bathroom count & property value

In [None]:
# Pearson correlation between bathroom count and property value.
# `y` is reused by the following correlation cells.
x = df['bathroomcnt']
y = df['taxvaluedollarcnt']

corr, p = stats.pearsonr(x, y)
(corr, p)

#### Here we visualize our findings

In [None]:
# Scatter of bathroom count against property value. The original bar chart
# was commented out, which left this visualisation step without any figure;
# a scatter plot matches the other correlation visuals in this notebook.
plt.scatter(x, y, c='steelblue')
plt.show()

#### testing for correlation between bedroom count and property value

In [None]:
x = df.bedroomcnt

corr, p = stats.pearsonr(x, y)
corr, p

#### Visuals for our correlation

In [None]:
# Scatter of bedroom count against property value. The original bar chart
# was commented out, which left this visualisation step without any figure;
# a scatter plot matches the other correlation visuals in this notebook.
plt.scatter(x, y, c='slateblue')
plt.show()

#### testing for correlation between square footage and property value

In [None]:
x = df.calculatedfinishedsquarefeet

corr, p = stats.pearsonr(x, y)
corr, p

#### visuals

In [None]:
plt.scatter(x, y, c='firebrick')
plt.show()

#### testing correlation between room count and property value

In [None]:
x = df.roomcnt

corr, p = stats.pearsonr(x, y)
corr, p

#### visuals

In [None]:
plt.scatter(x, y, c="darkseagreen")
plt.show()

### Modeling
Goal: develop a regression model that performs better than a baseline.

In [None]:
# Restrict both splits to the four candidate features plus the target.
model_columns = [
    'bathroomcnt',
    'bedroomcnt',
    'calculatedfinishedsquarefeet',
    'roomcnt',
    'taxvaluedollarcnt',
]
train = train[model_columns]
test = test[model_columns]

In [None]:
train.taxvaluedollarcnt.sort_values(ascending=False)

In [None]:
# Separate the feature columns from the target column for each split.
# The target is kept as a single-column DataFrame (double brackets).
target = 'taxvaluedollarcnt'
X_train, y_train = train.drop(columns=[target]), train[[target]]
X_test, y_test = test.drop(columns=[target]), test[[target]]

#### developing baseline model

In [None]:
# Collect the actual training values, then add a baseline that predicts the
# mean training value for every row.
predictions = pd.DataFrame({'actual': y_train['taxvaluedollarcnt']})
predictions['baseline'] = y_train['taxvaluedollarcnt'].mean()

In [None]:
predictions.head()

#### Evaluate baseline model,

In [None]:
evaluate.baseline_mean_errors(predictions.actual)

##### These numbers are terrible...

### model # 1

### Now we're going to do a linear regression model using Recursive Feature Elimination from Scikit learn

- feature elimination

In [None]:
# Inspect the feature and target frames one after the other. Calling them as
# separate statements avoids echoing the meaningless `(None, None)` tuple
# that the original comma-joined expression produced as cell output.
X_train.info()
y_train.info()

In [None]:
# Initialize the linear regression estimator that RFE will rank features with.
lm = LinearRegression()

# Keep the 3 strongest features. n_features_to_select is passed by keyword
# because current scikit-learn releases reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=3)

# Fit the selector and reduce the training features to the selected columns.
X_rfe = rfe.fit_transform(X_train, y_train)

#### we have our 3 features from sklearn

In [None]:
# rfe.support_ is a boolean mask: True for every column RFE kept.
mask = rfe.support_

# Names of the retained columns, stored for use in later cells.
rfe_features = X_train.columns[mask]

# Report which features were chosen.
feature_summary = ', '.join(rfe_features)
print(f'selected {len(rfe_features)} features:', feature_summary)

In [None]:
# Fitting the data to model
lm.fit(X_rfe, y_train)

In [None]:
print("Linear Model:", lm)

print("intercept: ", lm.intercept_)

print("features: ", rfe_features)
print("coefficients: ", lm.coef_)

In [None]:
predictions['model1'] = lm.predict(X_rfe)

In [None]:
predictions.head()

#### now we're grabbing our regression errors

In [None]:
evaluate.regression_errors(predictions.actual, predictions.model1)

In [None]:
# Metrics for model 1 on the training data, presumably transcribed from the
# regression_errors output above, which reports values in scientific
# notation (e.g. 2.534779e-01). The original wrote these with `**`, which
# raises the number to a power instead of scaling by a power of ten and so
# printed the wrong figures.
r_squared = 2.534779e-1
rmse = 5.763501e5
print("Our R^2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. model 1 predictions (training data) with a fitted line.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=predictions.actual, y=predictions.model1)

In [None]:
evaluate.plot_residuals(predictions, predictions.actual, predictions.model1)

### Running model 1 on our test data

In [None]:
# Frame that collects the test-set actuals and each model's predictions.
model_predictions = pd.DataFrame()
model_predictions['actual'] = y_test['taxvaluedollarcnt']

# Reduce the test features with the fitted RFE selector instead of a
# hard-coded column list, so the model always receives exactly the features
# (and column order) it was trained on, whatever RFE happened to select.
model1 = lm.predict(rfe.transform(X_test))

model_predictions['model1'] = model1

#### evaluating our test data with the regression errors and plotting our residuals

In [None]:
evaluate.regression_errors(model_predictions.actual, model_predictions.model1)

In [None]:
# Metrics for model 1 on the test data, presumably transcribed from the
# regression_errors output above, which reports values in scientific
# notation. The original wrote these with `**`, which raises the number to a
# power instead of scaling by a power of ten and so printed wrong figures.
r_squared = 3.522962e-1
rmse = 4.828763e5
print("R^2:", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. model 1 predictions (test data) with a fitted line.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=model_predictions.actual, y=model_predictions.model1)

In [None]:
evaluate.plot_residuals(model_predictions, model_predictions.actual, model_predictions.model1)

### Model 2 we will do a linear regression model after selecting features from SelectKBest

In [None]:
# Build a univariate selector that keeps the two highest-scoring features
# according to the f_regression scoring function.
f_selector = SelectKBest(score_func=f_regression, k=2)

# Compute the per-feature scores from the training data.
f_selector.fit(X_train, y_train)

In [None]:
# Keep only the k best columns of the training features.
X_reduced = f_selector.transform(X_train)

# Boolean mask of the selected columns, then their names as a plain list.
f_support = f_selector.get_support()
f_feature = list(X_train.columns[f_support])

print(f'{len(f_feature)}', 'selected features')
print(f_feature)

In [None]:
lm = LinearRegression()
lm

In [None]:
# Fit the model on exactly the columns SelectKBest chose. Using f_feature
# instead of a hard-coded list keeps the model in sync with the selector if
# the data (and therefore the selected features) ever changes.
K_best_train = X_train[f_feature]
lm.fit(K_best_train, y_train)

In [None]:
print("Linear Model:", lm)

print("intercept: ", lm.intercept_)

print("features: ", f_feature)
print("coefficients: ", lm.coef_)

In [None]:
predictions['model2'] = lm.predict(K_best_train)

In [None]:
predictions.head()

In [None]:
evaluate.regression_errors(predictions.actual, predictions.model2)

In [None]:
# Metrics for model 2 on the training data, presumably transcribed from the
# regression_errors output above, which reports values in scientific
# notation. The original wrote these with `**`, which raises the number to a
# power instead of scaling by a power of ten and so printed wrong figures.
r_squared = 3.647595e-1
rmse = 5.316602e5
print("R^2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. model 2 predictions (training data) with a fitted line.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=predictions.actual, y=predictions.model2)

In [None]:
evaluate.plot_residuals(predictions, predictions.actual, predictions.model2)

### Ran on our test data

In [None]:
# Predict on the test rows using the same columns, in the same order, that
# the model was fitted on. Reading them from K_best_train avoids a second
# hard-coded feature list that could drift from the training features.
model2 = lm.predict(test[K_best_train.columns])

model_predictions['model2'] = model2

#### evaluating our regression errors & plotting our residuals

In [None]:
evaluate.regression_errors(model_predictions.actual, model_predictions.model2)

In [None]:
# Metrics for model 2 on the test data, presumably transcribed from the
# regression_errors output above, which reports values in scientific
# notation. The original wrote these with `**`, which raises the number to a
# power instead of scaling by a power of ten and so printed wrong figures.
r_squared = 5.064185e-1
rmse = 4.454601e5
print("R^2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. model 2 predictions (test data) with a fitted line.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=model_predictions.actual, y=model_predictions.model2)

In [None]:
evaluate.plot_residuals(model_predictions, model_predictions.actual, model_predictions.model2)

### model 3 we will be using number of bedrooms, bathrooms and square footage to predict property value

In [None]:
# Narrow the training features to bathrooms, bedrooms and square footage.
# NOTE(review): this rebinds X_train, so re-running the earlier RFE /
# SelectKBest cells after this one will only see these three columns.
X_train = X_train[['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']]

In [None]:
# initializing linear regression model from sklearn
lm = LinearRegression()
lm

In [None]:
# Fitting the data to model
lm.fit(X_train, y_train)

In [None]:
print("Linear Model:", lm)

print("intercept: ", lm.intercept_)

print("features: ", X_train.columns)
print("coefficients: ", lm.coef_)

In [None]:
predictions['model3'] = lm.predict(X_train)

predictions.head()

#### grab those regression errors

In [None]:
evaluate.regression_errors(predictions.actual, predictions.model3)

In [None]:
# Actual vs. model 3 predictions (training data) with a fitted line.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=predictions.actual, y=predictions.model3)

In [None]:
evaluate.plot_residuals(predictions, predictions.actual, predictions.model3)

In [None]:
# Metrics for model 3 on the training data, presumably transcribed from the
# regression_errors output above, which reports values in scientific
# notation. The original wrote these with `**`, which raises the number to a
# power instead of scaling by a power of ten and so printed wrong figures.
r_squared = 3.903000e-1
rmse = 5.208625e5
print("R^2: ", r_squared, "RMSE: ", rmse)

### running our model on our test data

In [None]:
# Score the held-out rows using bathrooms, bedrooms and square footage.
model3_columns = ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']
model3 = lm.predict(test[model3_columns])

model_predictions['model3'] = model3

#### grabbing our regression errors and plotting our residuals

In [None]:
evaluate.regression_errors(model_predictions.actual, model_predictions.model3)

In [None]:
# Metrics for model 3 on the test data, presumably transcribed from the
# regression_errors output above, which reports values in scientific
# notation. The original wrote these with `**`, which raises the number to a
# power instead of scaling by a power of ten and so printed wrong figures.
r_squared = 5.284769e-1
rmse = 4.317318e5
print("R^2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. model 3 predictions (test data) with a fitted line.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=model_predictions.actual, y=model_predictions.model3)

In [None]:
evaluate.plot_residuals(model_predictions, model_predictions.actual, model_predictions.model3)

### Going back to the tax distribution idea, maybe it would be a more accurate model if we split our data by county and modeled our data by county

#### I'm going to take my most accurate model, and use it on this data split by county

In [None]:
# Partition the full frame into one sub-frame per county.
la = df.loc[df['county'] == 'Los Angelas']
orange = df.loc[df['county'] == 'Orange']
ventura = df.loc[df['county'] == 'Ventura']

### train test data

In [None]:
la_train, la_test = split_scale.split_my_data(la)
orange_train, orange_test = split_scale.split_my_data(orange)
ventura_train, ventura_test = split_scale.split_my_data(ventura)

In [None]:
county_predictions = pd.DataFrame()
county_predictions['actual'] = la_train.taxvaluedollarcnt

### Los Angeles

In [None]:
# Features and target for the Los Angeles training rows.
la_feature_columns = ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']
X_train = la_train[la_feature_columns]
y_train = la_train['taxvaluedollarcnt']

# Fresh linear regression model, fitted on this county's training data only.
lm = LinearRegression()
lm.fit(X_train, y_train)

# Store the in-sample predictions next to the actual values.
county_predictions['LA'] = lm.predict(X_train)

county_predictions.head()

In [None]:
evaluate.regression_errors(county_predictions.actual, county_predictions.LA)

In [None]:
# Metrics for the Los Angeles model on its training data, presumably
# transcribed from the regression_errors output above, which reports values
# in scientific notation. The original wrote these with `**`, which raises
# the number to a power instead of scaling by a power of ten and so printed
# wrong figures.
r_squared = 3.885693e-1
rmse = 5.900845e5
print("R^2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. predicted values for Los Angeles (training data).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=county_predictions.actual, y=county_predictions.LA)

In [None]:
evaluate.plot_residuals(county_predictions, county_predictions.actual, county_predictions.LA)

### Los Angeles test data

In [None]:
test_results = pd.DataFrame()
test_results['actual'] = la_test.taxvaluedollarcnt

In [None]:
# Predict the held-out Los Angeles rows and store them next to the actuals.
la_test_features = la_test[['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']]
Los_angelas = lm.predict(la_test_features)
test_results['LA'] = Los_angelas

# Error metrics for the Los Angeles test predictions.
evaluate.regression_errors(test_results.actual, test_results.LA)

In [None]:
# Metrics for the Los Angeles model on its test data, presumably transcribed
# from the regression_errors output above, which reports values in
# scientific notation. The original wrote these with `**`, which raises the
# number to a power instead of scaling by a power of ten and so printed
# wrong figures.
r_squared = 7.581485e-1
rmse = 4.257386e5
print("R2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. predicted values for Los Angeles (test data).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=test_results.actual, y=test_results.LA)

In [None]:
evaluate.plot_residuals(test_results, test_results.actual, test_results.LA)

### Orange County

In [None]:
county_predictions = pd.DataFrame()
county_predictions['actual'] = orange_train.taxvaluedollarcnt

In [None]:
# Features and target for the Orange County training rows.
orange_feature_columns = ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']
X_train = orange_train[orange_feature_columns]
y_train = orange_train['taxvaluedollarcnt']

# Fresh linear regression model, fitted on this county's training data only.
lm = LinearRegression()
lm.fit(X_train, y_train)

# Store the in-sample predictions next to the actual values.
county_predictions['orange'] = lm.predict(X_train)

county_predictions.head()

In [None]:
evaluate.regression_errors(county_predictions.actual, county_predictions.orange)

In [None]:
# Metrics for the Orange County model on its training data, presumably
# transcribed from the regression_errors output above, which reports values
# in scientific notation. The original wrote these with `**`, which raises
# the number to a power instead of scaling by a power of ten and so printed
# wrong figures.
r_squared = 4.367566e-1
rmse = 4.294976e5
print("R2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. predicted values for Orange County (training data).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=county_predictions.actual, y=county_predictions.orange)

In [None]:
evaluate.plot_residuals(county_predictions, county_predictions.actual, county_predictions.orange)

#### orange Test data

In [None]:
test_results = pd.DataFrame()
test_results['actual'] = orange_test.taxvaluedollarcnt

In [None]:
# Predict the held-out Orange County rows. The result is stored under its
# own name: the original assigned it to `orange`, which overwrote the Orange
# County DataFrame and broke any re-run of the county train/test split.
orange_test_pred = lm.predict(
    orange_test[['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']]
)

test_results['orange'] = orange_test_pred

# Error metrics for the Orange County test predictions.
evaluate.regression_errors(test_results.actual, test_results.orange)

In [None]:
# Metrics for the Orange County model on its test data, presumably
# transcribed from the regression_errors output above, which reports values
# in scientific notation. The original wrote these with `**`, which raises
# the number to a power instead of scaling by a power of ten and so printed
# wrong figures.
r_squared = 7.219882e-1
rmse = 2.979973e5
print("R2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. predicted values for Orange County (test data).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=test_results.actual, y=test_results.orange)

In [None]:
evaluate.plot_residuals(test_results, test_results.actual, test_results.orange)

### Ventura County

In [None]:
county_predictions = pd.DataFrame()
county_predictions['actual'] = ventura_train.taxvaluedollarcnt

In [None]:
# Features and target for the Ventura County training rows.
ventura_feature_columns = ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']
X_train = ventura_train[ventura_feature_columns]
y_train = ventura_train['taxvaluedollarcnt']

# Fresh linear regression model, fitted on this county's training data only.
lm = LinearRegression()
lm.fit(X_train, y_train)

# Store the in-sample predictions next to the actual values.
county_predictions['ventura'] = lm.predict(X_train)

county_predictions.head()

In [None]:
evaluate.regression_errors(county_predictions.actual, county_predictions.ventura)

In [None]:
# Metrics for the Ventura County model on its training data, presumably
# transcribed from the regression_errors output above, which reports values
# in scientific notation. The original wrote these with `**`, which raises
# the number to a power instead of scaling by a power of ten and so printed
# wrong figures.
r_squared = 6.148503e-1
rmse = 1.958842e5
print("R2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. predicted values for Ventura County (training data).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=county_predictions.actual, y=county_predictions.ventura)

In [None]:
evaluate.plot_residuals(county_predictions, county_predictions.actual, county_predictions.ventura)

#### ventura test data

In [None]:
test_results = pd.DataFrame()
test_results['actual'] = ventura_test.taxvaluedollarcnt

In [None]:
# Predict the held-out Ventura County rows. The result is stored under its
# own name: the original assigned it to `ventura`, which overwrote the
# Ventura County DataFrame and broke any re-run of the county train/test
# split.
ventura_test_pred = lm.predict(
    ventura_test[['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']]
)

test_results['ventura'] = ventura_test_pred

# Error metrics for the Ventura County test predictions.
evaluate.regression_errors(test_results.actual, test_results.ventura)

In [None]:
# Metrics for the Ventura County model on its test data, presumably
# transcribed from the regression_errors output above, which reports values
# in scientific notation. The original wrote these with `**`, which raises
# the number to a power instead of scaling by a power of ten and so printed
# wrong figures.
r_squared = 5.582471e-1
rmse = 2.069989e5
print("R2: ", r_squared, "RMSE: ", rmse)

In [None]:
# Actual vs. predicted values for Ventura County (test data).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.regplot(x=test_results.actual, y=test_results.ventura)

In [None]:
evaluate.plot_residuals(test_results, test_results.actual, test_results.ventura)

Your notebook will contain various algorithms and/or hyperparameters tried, along with the evaluation code and results, before settling on the final algorithm.

Be sure to evaluate your model using the standard techniques: plotting the residuals, computing the evaluation metric (SSE, RMSE, and/or MSE), comparing to baseline, and plotting $y$ by $\hat{y}$.

model.py: will have the functions to fit, predict and evaluate the final model on the test data set.