Read the CSV file. Loading gets slow beyond 100k records, and Amir said it is OK to work with a subset, so only the first 100,000 rows are loaded here.

In [24]:
# starts getting slow when loading more than 100k records. Amir said ok to work with a subset
import pandas as pd
import numpy as np
import scipy.stats as st
# Absolute local path to the raw properties extract.
# NOTE(review): machine-specific path; adjust it when running elsewhere.
filename = 'C:/data/properties_2017.csv'

# Row cap for the load. read_csv parses only the first MAX_ROWS data rows;
# set MAX_ROWS to None to read the whole file.
MAX_ROWS = 100000
df = pd.read_csv(filename, nrows=MAX_ROWS)

Data Wrangling Step #1: replace True/False with 1 and 0

In [25]:
# Flag columns that are recoded as numeric indicators.
replace_true_false_col_list = [
    'fireplaceflag',
    'hashottuborspa',
    'taxdelinquencyflag',
]

# Recode in one pass: True becomes 1, then missing values (treated as False)
# become 0.
# NOTE(review): any other value in these columns is left as it is by these
# two calls -- confirm the columns only ever hold True or NaN.
df[replace_true_false_col_list] = (
    df[replace_true_false_col_list]
    .replace(True, 1)
    .fillna(0)
)

Data Wrangling Step #2: remove records with NaN values in taxvaluedollarcnt. This is my target feature moving forward and cannot have null values

In [26]:
# Columns that must be non-null for a row to be kept; rows missing any of
# these are dropped in the next cell.
drop_null_col_list = ['taxvaluedollarcnt']

In [27]:
# Keep only rows where every column in drop_null_col_list is present.
df = df.dropna(axis='index', how='any', subset=drop_null_col_list)

Data Wrangling Step #3: replace NaN with 0 for poolcnt in order to do Statistical Analysis #1

In [28]:
# Columns whose missing values are filled with 0 in the next cell.
replace_nan = ['poolcnt']

In [29]:
# Fill missing values with 0 in the columns listed in replace_nan
# (a missing pool count is treated as "no pool").
df[replace_nan] = df[replace_nan].fillna(value=0)

Data Wrangling Step #4: Divide taxvaluedollarcnt by 1000. This expresses all house values in thousands.

In [30]:
# Express the target in thousands.
# NOTE: this overwrites the column it reads, so the cell is not idempotent --
# running it a second time divides by 1000 again.
df['taxvaluedollarcnt'] = df['taxvaluedollarcnt'].div(1000)

Statistical Analysis #1: Is the mean housing price of a home with 3 or more garages the same as that of a house with a hot tub/spa? We test the null hypothesis that the mean housing price is the same in the two samples. 

Result: Based on my findings, the null hypothesis is rejected at the 1% significance level (99% confidence): houses with 3 or more garages have a different mean price than houses with a hot tub/spa.  

In [31]:
# exploratory analysis and seperating the data sets
#print(df.hashottuborspa.value_counts(dropna=False))
#print(df.garagecarcnt.value_counts(dropna=False))


over_three_garages = df[df['garagecarcnt']>=3]
hashottuborspa = df[df['hashottuborspa']==1]
over_three_garages_mean = np.mean(over_three_garages['taxvaluedollarcnt'])
hashottuborspa_mean = np.mean(hashottuborspa['taxvaluedollarcnt'])
print over_three_garages_mean
print hashottuborspa_mean


1380.39371327
1127.05131495


In [32]:
# first will try the frequentist approach using a t test
std_over_three_garages =  np.std(over_three_garages['taxvaluedollarcnt'])
std_hashottuborspa = np.std(hashottuborspa['taxvaluedollarcnt'])
numerator = over_three_garages_mean - hashottuborspa_mean
denominator = np.sqrt((std_over_three_garages**2/len(over_three_garages['taxvaluedollarcnt'])) + (std_hashottuborspa**2/len(hashottuborspa['taxvaluedollarcnt'])))
t = numerator / denominator
p_values_t = st.norm.sf(abs(t))*2

print 'the t-score is', t
print 'the p-value is', p_values_t

the t-score is 3.37559920142
the p-value is 0.000736551331998


In [33]:
## next will use a permutation (resampling) test to double check
# first need to set up the functions that will be used for the analysis

def diff_of_means(data_1, data_2):
    """Return the mean of data_1 minus the mean of data_2."""
    return np.mean(data_1) - np.mean(data_2)


def draw_perm_reps(data_1, data_2, func, size=1):
    """Generate `size` permutation replicates of a two-sample statistic.

    Each replicate pools and reshuffles the two samples with
    permutation_sample, then evaluates func(sample_1, sample_2) on the
    reshuffled pair.

    Returns a float array of length `size` holding the replicate values.
    """
    replicates = np.empty(size)

    for idx in range(size):
        shuffled_1, shuffled_2 = permutation_sample(data_1, data_2)
        replicates[idx] = func(shuffled_1, shuffled_2)

    return replicates

def permutation_sample(data1, data2):
    """Pool two data sets, shuffle the pool, and split it back in two.

    The first returned array has len(data1) elements; the second holds
    the remaining elements of the shuffled pool.
    """
    split_at = len(data1)

    # Shuffle the pooled observations using numpy's global random state.
    shuffled = np.random.permutation(np.concatenate((data1, data2)))

    return shuffled[:split_at], shuffled[split_at:]

In [34]:
# continue using the Bayesian approach

over_three_garages = df[df['garagecarcnt']>=3]
hashottuborspa = df[df['hashottuborspa']==1]
over_three_garages_mean = np.mean(over_three_garages['taxvaluedollarcnt'])
hashottuborspa_mean = np.mean(hashottuborspa['taxvaluedollarcnt'])


# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(over_three_garages['taxvaluedollarcnt'], hashottuborspa['taxvaluedollarcnt'])

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(over_three_garages['taxvaluedollarcnt'], hashottuborspa['taxvaluedollarcnt'],
                                 diff_of_means, size=10000)

# Compute p-value: p
p = float(np.sum(perm_replicates >= empirical_diff_means)) / float(len(perm_replicates))

# Print the result
print('p-value =', p)
print over_three_garages_mean
print hashottuborspa_mean

('p-value =', 0.0001)
1380.39371327
1127.05131495


Statistical Analysis #2: Is the mean housing price of houses with multiple stories different than houses with central air conditioning?

Result: Based on my findings, the null hypothesis cannot be rejected at the 1% significance level: there is no statistically significant difference between the mean housing prices of houses with multiple stories and houses with central air conditioning

In [106]:
# Build the two comparison groups.
# Homes with more than one story:
multiple_stories = df.loc[df['numberofstories'] > 1]

# Homes with central air conditioning, identified by airconditioningtypeid == 1:
central_ac = df.loc[df['airconditioningtypeid'] == 1]


25142


In [102]:
# will use the frequentist approach using a t test
mean_multiple_stories = np.mean(multiple_stories.taxvaluedollarcnt)
mean_central_ac = np.mean(central_ac.taxvaluedollarcnt)
std_multiple_stories =  np.std(multiple_stories['taxvaluedollarcnt'])
std_central_ac = np.std(central_ac['taxvaluedollarcnt'])
numerator = mean_multiple_stories - mean_central_ac
denominator = np.sqrt((std_multiple_stories**2/len(multiple_stories['taxvaluedollarcnt'])) + (std_central_ac**2/len(central_ac['taxvaluedollarcnt'])))
t = numerator / denominator
p_values_t = st.norm.sf(abs(t))*2 

print 'the t-score is', t
print 'the p-value is', p_values_t
print 'the mean housing price of houses with multiple stories is ', mean_multiple_stories
print 'the mean housing price of houses with central air conditioning is ', mean_central_ac

the t-score is 0.2990123474
the p-value is 0.764930624454
the mean housing price of houses with multiple stories is  526.401738893
the mean housing price of houses with central air conditioning is  523.618849734


In [103]:
## next will try the bayesian approach to double check. functions used are previously definied in Statistical Analysis #1

# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(multiple_stories['taxvaluedollarcnt'], central_ac['taxvaluedollarcnt'])

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(multiple_stories['taxvaluedollarcnt'], central_ac['taxvaluedollarcnt'],
                                 diff_of_means, size=10000)

# Compute p-value: p
p = float(np.sum(perm_replicates >= empirical_diff_means)) / float(len(perm_replicates))

# Print the result
print('p-value =', p)
print mean_multiple_stories
print mean_central_ac

('p-value =', 0.3785)
526.401738893
523.618849734


Statistical Analysis #3: Is the mean housing price of houses with decks different from the mean housing price of houses with pools? 

Result: Based on my findings, the null hypothesis cannot be rejected at the 1% significance level: there is no statistically significant difference between the mean housing prices of houses with pools and houses with decks

In [68]:
# Build the two comparison groups.
# Homes whose pool count is exactly 1:
pool = df.loc[df['poolcnt'] == 1]
# Homes whose deck type id is greater than 1.
# NOTE(review): presumably every real deck type id exceeds 1 -- confirm
# against the data dictionary.
deck = df.loc[df['decktypeid'] > 1]

In [74]:
# will use the frequentist approach using a t test
mean_pool = np.mean(pool.taxvaluedollarcnt)
mean_deck = np.mean(deck.taxvaluedollarcnt)
std_pool =  np.std(pool['taxvaluedollarcnt'])
std_deck = np.std(deck['taxvaluedollarcnt'])
numerator = mean_pool - mean_deck
denominator = np.sqrt((std_pool**2/len(pool['taxvaluedollarcnt'])) + (std_deck**2/len(deck['taxvaluedollarcnt'])))
t = numerator / denominator
p_values_t = st.norm.sf(abs(t))*2 

print 'the t-score is', t
print 'the p-value is', p_values_t
print 'the mean housing price of houses with a pool is ', mean_pool
print 'the mean housing price of houses with a deck is ', mean_deck

the t-score is 0.430349168986
the p-value is 0.666941665245
the mean housing price of houses with a pool is  660.862630741
the mean housing price of houses with a deck is  650.715133333


In [86]:
## next will try the bayesian approach to double check. functions used are previously definied in Statistical Analysis #1
pool = df[df.poolcnt == 1 ]
deck = df[df.decktypeid > 1 ]
mean_pool = np.mean(pool.taxvaluedollarcnt)
mean_deck = np.mean(deck.taxvaluedollarcnt)


# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(pool['taxvaluedollarcnt'], deck['taxvaluedollarcnt'])

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(pool['taxvaluedollarcnt'], deck['taxvaluedollarcnt'],
                                 diff_of_means, size=10000)

# Compute p-value: p
p = float(np.sum(perm_replicates >= empirical_diff_means)) / float(len(perm_replicates))

# Print the result
print('p-value =', p)
print mean_pool
print mean_deck

('p-value =', 0.4252)
660.862630741
650.715133333


Statistical Analysis #4: Is the mean housing price of houses with greater than the average number of bathrooms the same price as houses with greater than the average number of bedrooms? 

Result: Based on my findings, the null hypothesis is rejected at the 1% significance level (99% confidence): houses with more than the average number of bathrooms have a different mean price than houses with more than the average number of bedrooms. 

In [83]:
# exploratory analysis and settingn up datasets
avg_bathroomcnt = np.mean(df.bathroomcnt)
avg_bedroomcnt  = np.mean(df.bedroomcnt)
above_avg_bathroomcnt = df[df.bathroomcnt > avg_bathroomcnt]
above_avg_bedroomcnt = df[df.bedroomcnt > avg_bedroomcnt]


print avg_bathroomcnt
print avg_bedroomcnt

2.23445193776
3.12374751019


In [84]:
# will use the frequentist approach using a t test
mean_above_avg_bathroomcnt = np.mean(above_avg_bathroomcnt.taxvaluedollarcnt)
mean_above_avg_bedroomcnt = np.mean(above_avg_bedroomcnt.taxvaluedollarcnt)
std_above_avg_bathroomcnt =  np.std(above_avg_bathroomcnt['taxvaluedollarcnt'])
std_above_avg_bedroomcnt = np.std(above_avg_bedroomcnt['taxvaluedollarcnt'])
numerator = mean_above_avg_bathroomcnt - mean_above_avg_bedroomcnt
denominator = np.sqrt((std_above_avg_bathroomcnt**2/len(above_avg_bathroomcnt['taxvaluedollarcnt'])) + (std_above_avg_bedroomcnt**2/len(above_avg_bedroomcnt['taxvaluedollarcnt'])))
t = numerator / denominator
p_values_t = st.norm.sf(abs(t))*2

print 'the t-score is', t
print 'the p-value is', p_values_t
print 'the mean housing price of houses with above average bathrooms is ', mean_above_avg_bathroomcnt
print 'the mean housing price of houses with above average bedrooms is ', mean_above_avg_bedroomcnt

the t-score is 9.92487521911
the p-value is 3.24508363931e-23
the mean housing price of houses with above average bathrooms is  658.250168622
the mean housing price of houses with above average bedrooms is  595.7339425


In [85]:
## next will try the bayesian approach to double check. functions used are previously definied in Statistical Analysis #1

# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(above_avg_bathroomcnt['taxvaluedollarcnt'], above_avg_bedroomcnt['taxvaluedollarcnt'])

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(above_avg_bathroomcnt['taxvaluedollarcnt'], above_avg_bedroomcnt['taxvaluedollarcnt'],
                                 diff_of_means, size=10000)

# Compute p-value: p
p = float(np.sum(perm_replicates >= empirical_diff_means)) / float(len(perm_replicates))

# Print the result
print('p-value =', p)
print mean_above_avg_bathroomcnt
print mean_above_avg_bedroomcnt

('p-value =', 0.0)
658.250168622
595.7339425
