# Homework 2
*Jinyi Zhou | u1424752 | May 29*

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sb

## Part 1: A/B Hypothesis Testing

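*Null hypothesis: A and B have the same click-through rate (there is no difference between them). Alternative hypothesis: the two click-through rates differ.*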

In [None]:
# Two-sided two-proportion z-test at the 1% significance level.
# H0: variants A and B have the same click rate; H1: the rates differ.
signif = 0.01
sampleSize = 1000  # number of trials (nobs) for each variant
aSample = 500      # successes counted for variant A
bSample = 550      # successes counted for variant B
clicks = np.array([aSample, bSample])
# One nobs entry per variant, rather than relying on scalar broadcasting.
trials = np.array([sampleSize, sampleSize])
stat, pValue = proportions_ztest(count=clicks, nobs=trials, alternative='two-sided')
print(stat)
print(pValue)

# A small p-value is evidence AGAINST the null hypothesis, so H0 is rejected
# only when pValue < signif; otherwise the data are consistent with H0.
if pValue < signif:
    better = "B" if bSample / sampleSize > aSample / sampleSize else "A"
    print(f"Reject the null hypothesis: the click rates differ and {better} performs better.")
else:
    print("Fail to reject the null hypothesis: no significant difference between A and B.")

In [None]:
# Two-sided two-proportion z-test at the 5% significance level.
# H0: variants A and B have the same click rate; H1: the rates differ.
signif = 0.05
sampleSize = 1000  # number of trials (nobs) for each variant
aSample = 500      # successes counted for variant A
bSample = 550      # successes counted for variant B
clicks = np.array([aSample, bSample])
# One nobs entry per variant, rather than relying on scalar broadcasting.
trials = np.array([sampleSize, sampleSize])
stat, pValue = proportions_ztest(count=clicks, nobs=trials, alternative='two-sided')
print(stat)
print(pValue)

# A small p-value is evidence AGAINST the null hypothesis, so H0 is rejected
# only when pValue < signif; otherwise the data are consistent with H0.
if pValue < signif:
    better = "B" if bSample / sampleSize > aSample / sampleSize else "A"
    print(f"Reject the null hypothesis: the click rates differ and {better} performs better.")
else:
    print("Fail to reject the null hypothesis: no significant difference between A and B.")

## Part 2: Regression of real estate data

*Import and clean the data:*

In [None]:
# Load both real-estate CSV files and stack them into a single frame.
data1 = pd.read_csv("realEstate1.csv")
data2 = pd.read_csv("realEstate2.csv")
data = pd.concat([data1, data2], ignore_index=True)

# Keep only the columns used in the analysis below.
keep_columns = [
    'Acres', 'Deck', 'GaragCap', 'Latitude', 'Longitude', 'LstPrice',
    'Patio', 'PkgSpacs', 'PropType', 'SoldPrice', 'Taxes', 'TotBed',
    'TotBth', 'TotSqf', 'YearBlt',
]
data = data[keep_columns]

# Keep listings with LstPrice strictly between 200,000 and 1,000,000.
# .copy() yields an independent frame, so the column assignment below does
# not write into a slice of the original (avoids SettingWithCopyWarning).
in_price_range = (data["LstPrice"] > 200000) & (data["LstPrice"] < 1000000)
data = data[in_price_range].copy()

# TotSqf holds strings with thousands separators; strip the commas and
# convert the column to integers.
data["TotSqf"] = data["TotSqf"].str.replace(",", "", regex=False).astype("int64")

# Drop rows whose Longitude is not negative or whose Taxes are 1,000,000 or
# more -- presumably data-entry errors (TODO confirm against the raw files).
data = data[(data["Longitude"] < 0) & (data["Taxes"] < 1000000)]

# Save the property-type labels before PropType is one-hot encoded.
propertyTypes = data["PropType"]
data = pd.get_dummies(data).reset_index(drop=True)
data


*Exploratory data analysis:*

*From the above results, we can see that the data set has 259 observations and 14 variables. After removing the unreasonable values, the remaining values look plausible.*

In [None]:
# Summary statistics for the columns of the cleaned frame.
data.describe()

In [None]:
# Bar chart of how many listings fall into each property type.
type_counts = propertyTypes.value_counts()
type_counts.plot.bar()

In [None]:
# Pairwise correlations of every column except the last three, which are
# omitted (presumably the PropType dummy columns -- TODO confirm).
corr = data[data.columns[:-3]].corr()

heatmap = plt.pcolor(corr, vmin=-1, vmax=1, cmap=plt.cm.bwr)
plt.colorbar(heatmap)
# pcolor draws cell i over the interval [i, i + 1], so shift the ticks by 0.5
# to centre each label on its row/column instead of on the cell edge.
tick_positions = np.arange(len(corr.columns)) + 0.5
plt.xticks(tick_positions, corr.columns, rotation=90);
plt.yticks(tick_positions, corr.columns);

In [None]:
# Pairwise scatter plots for a subset of the columns.
matrix_columns = [
    'Acres', 'LstPrice', 'SoldPrice', 'Taxes',
    'TotBed', 'TotBth', 'TotSqf', 'YearBlt',
]
pd.plotting.scatter_matrix(data[matrix_columns], figsize=(8, 8))
plt.show()

*Findings: Sold price, list price, and taxes are strongly correlated with one another. When the list price goes up, the other variables, such as taxes, also tend to go up.*

### Task 4: Geospatial Plot

In [None]:
# Longitude goes on the x-axis and latitude on the y-axis so the scatter is
# oriented like a map (east to the right, north at the top); points are
# coloured by SoldPrice.
data.plot.scatter(x="Longitude", y="Latitude", c="SoldPrice", cmap='bwr')

*Sold prices tend to be higher toward the north and east of the area, and they decrease as we move away from the north and east.*

### Task 5: Simple Linear Regression

In [None]:
# Ordinary least squares fit of SoldPrice on LstPrice.
list_price_model = smf.ols(formula='SoldPrice ~ LstPrice', data=data)
regression = list_price_model.fit()
print(regression.summary())

In [None]:
# Ordinary least squares fit of SoldPrice on TotSqf.
sqf_model = smf.ols(formula='SoldPrice ~ TotSqf', data=data)
sfReg = sqf_model.fit()
print(sfReg.summary())

In [None]:
# Ordinary least squares fit of SoldPrice on YearBlt.
year_model = smf.ols(formula='SoldPrice ~ YearBlt', data=data)
yearReg = year_model.fit()
print(yearReg.summary())

In [None]:
# Show the first rows of the cleaned frame.
data.head()

In [None]:
# SoldPrice against LstPrice with seaborn's fitted regression line.
sb.regplot(data=data, x='LstPrice', y='SoldPrice')

In [None]:
# SoldPrice against TotSqf with seaborn's fitted regression line.
sb.regplot(data=data, x='TotSqf', y='SoldPrice')

In [None]:
# SoldPrice against YearBlt with seaborn's fitted regression line.
sb.regplot(data=data, x='YearBlt', y='SoldPrice')

### Task 6: Multilinear Regression

In [None]:
# Ordinary least squares fit of SoldPrice on four predictors at once.
multi_model = smf.ols(
    formula="SoldPrice ~ Taxes + TotSqf + Acres + GaragCap",
    data=data,
)
multi_regression = multi_model.fit()
multi_regression.summary()

In [None]:
# Copy the fitted coefficients into a plain dict keyed by term name, then
# report the model's R-squared.
fitted_coefficients = multi_regression.params
param = dict(fitted_coefficients)
print(f"R2: {multi_regression.rsquared}")


- Often the price per square foot for a house is advertised. Is this what the coefficient for TotSqf is measuring? Provide an interpretation for the coefficient for TotSqf.


In [None]:
# Fitted TotSqf coefficient; param holds multi_regression.params as a dict.
print(param['TotSqf'])

- Estimate the value that each Garage space adds to a house.

In [None]:
# Fitted GaragCap coefficient; param holds multi_regression.params as a dict.
print(param['GaragCap'])

- Does latitude or longitude have an impact on house price? Explain.

*I think longitude does affect the price to some extent.*

- If we wanted to start a 'house flipping' company, we'd have to be able to do a better job of predicting the sold price than the list price does. How does your model compare?

*Our multilinear model does a worse job of predicting the sold price than the list price alone does.*

### Task 7: Incorporating a Categorical Variable

In [None]:
# Scatter of TotSqf against SoldPrice with one colour per property type.
# The original coloured points by the binary Condo dummy only, yet attached
# three legend labels to the two resulting handles, so the legend was wrong.
color_dict = {'Single Family': 'red', 'Condo': 'blue', 'Townhouse': 'green'}
PropType_Condo = data.PropType_Condo  # kept in case later cells use it

# Recover the property-type names from the one-hot columns in data.
dummy_prefix = 'PropType_'
prop_names = [col[len(dummy_prefix):] for col in data.columns
              if col.startswith(dummy_prefix)]

plt.figure(figsize=(8, 8))
for prop_name in prop_names:
    # == 1 matches both boolean and 0/1 integer dummy columns.
    of_type = data[data[dummy_prefix + prop_name] == 1]
    # A type missing from color_dict falls back to matplotlib's default colour.
    plt.scatter(x=of_type.SoldPrice, y=of_type.TotSqf, s=10,
                color=color_dict.get(prop_name), label=prop_name)
plt.xlabel('Sold Price')
plt.ylabel('Total Sqft')
plt.legend(title="Property Type")