<center> <h1> Statistical Analysis for San Francisco Building Permits </h1> </center>  

The goal of this report is to perform statistical analysis on the dataset and find correlations between independent variables and with dependent variables. 

In [1]:
import datetime as dt 
import numpy as np
import pandas as pd
import seaborn as sb 
import matplotlib.pyplot as plt 
import scipy.stats as stats
import matplotlib.style as style
import statsmodels.api as sm

In [2]:
df = pd.read_csv(filepath_or_buffer='Building_Permits_Cleaned.csv',
                 header=0, low_memory=False, index_col='Unnamed: 0')

In [3]:
def to_category(columns, dataframe):
    """Convert a list of columns, from a dataframe, to a category datatype"""
    for column in columns: 
        dataframe[column] = dataframe[column].astype('category')

def to_integer(columns, dataframe):
    """Convert columns from a dataframe to an int64 datatype"""
    for column in columns: 
        dataframe[column] = dataframe[column].astype('int64')

In [4]:
cols = ['permit_creation_date', 'current_status_date','filed_date',
        'issued_date', 'completed_date', 'first_construction_document_date',
        'permit_expiration_date']

for col in cols:
    df[col] = pd.to_datetime(df[col])

    
columns = ['permit_type_definition', 'current_status', 'fire_only_permit', 
           'existing_use', 'proposed_use', 'existing_construction_type',
           'proposed_construction_type','existing_construction_type_description',
           'proposed_construction_type_description', 'tidf_compliance',
           'site_permit', 'neighborhoods', 'zipcode',
           'time_frame']

to_category(columns=columns, dataframe=df)

In [5]:
df.describe()

Unnamed: 0,permit_type,street_number,number_of_existing_stories,number_of_proposed_stories,estimated_cost,revised_cost,existing_units,proposed_units,plansets,latitude,longitude,time_range
count,198900.0,198900.0,156116.0,156032.0,160834.0,192834.0,147362.0,147989.0,161591.0,197200.0,197200.0,183960.0
mean,7.522323,1121.728944,5.705773,5.745043,168955.4,132856.2,15.666164,16.51095,1.27465,37.768751,-122.431812,26.054697
std,1.457451,1135.768948,8.613455,8.613284,3630386.0,3584903.0,74.476321,75.220444,22.407345,0.02373,0.028556,91.061716
min,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,37.70817,-122.510938,0.0
25%,8.0,235.0,2.0,2.0,3300.0,1.0,1.0,1.0,0.0,37.752218,-122.447257,0.0
50%,8.0,710.0,3.0,3.0,11000.0,7000.0,1.0,2.0,2.0,37.77344,-122.427673,0.0
75%,8.0,1700.0,4.0,4.0,35000.0,28707.5,4.0,4.0,2.0,37.788825,-122.409675,6.0
max,8.0,8400.0,78.0,78.0,537958600.0,780500000.0,1907.0,1911.0,9000.0,37.825017,-122.365853,1740.0


In [6]:
def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):
    from numpy import sqrt, abs, round
    from scipy.stats import norm
    pooledSE = sqrt(sd1**2/n1 + sd2**2/n2)
    z = ((X1 - X2) - mudiff)/pooledSE
    pval = 2*(1 - norm.cdf(abs(z)))
    return round(z, 3), round(pval, 4)

## Hypotheses to investigate:
**Null Hypothesis:** Permit type has no effect on approval time. In other words: Mean approval time = 26 days   
**Alternative Hypothesis:** Permit type has an effect on approval time. In other words: Mean approval time != 26 days

**Null Hypothesis:** Zipcode (region) has no effect on approval time. In other words: Mean approval time = 26 days  
**Alternative Hypothesis:** Zipcode (region) has an effect on approval time.  In other words: Mean approval time != 26 days  

**Null Hypothesis:** Proposed construction type has no effect on approval time. In other words: Mean approval time = 26 days    
**Alternative Hypothesis:** Proposed construction type has an effect on approval time. In other words: Mean approval time != 26 days 

**First Hypothesis Test**

In [7]:
one = df[df.permit_type == 1]
two = df[df.permit_type == 2]
three = df[df.permit_type == 3]
four = df[df.permit_type == 4]
five = df[df.permit_type == 5]
six = df[df.permit_type == 6]
seven = df[df.permit_type == 7]
eight = df[df.permit_type == 8]

In [8]:
one = one[one.time_range.notnull()]
two = two[two.time_range.notnull()]
three = three[three.time_range.notnull()]
four = four[four.time_range.notnull()]
five = five[five.time_range.notnull()]
six = six[six.time_range.notnull()]
seven = seven[seven.time_range.notnull()]
eight = eight[eight.time_range.notnull()]

###### Permit Type 1

In [9]:
alpha = 0.05
mean = 26
sample_mean = np.mean(one.time_range)
std = np.std(one.time_range)
n = len(one.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 478.60227272727275
The standard deviation is: 284.901739032821
The sample size is: 176
The z-score is: 21.07550375781369
The p-value is: 1.3347823018479708e-98
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 2

In [10]:
alpha = 0.05
mean = 26
sample_mean = np.mean(two.time_range)
std = np.std(two.time_range)
n = len(two.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 398.1430948419301
The standard deviation is: 317.69101894326656
The sample size is: 601
The z-score is: 28.717211281523863
The p-value is: 2.3263325303473036e-181
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 3

In [11]:
alpha = 0.05
mean = 26
sample_mean = np.mean(three.time_range)
std = np.std(three.time_range)
n = len(three.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 247.25225129780696
The standard deviation is: 207.08351043341662
The sample size is: 9439
The z-score is: 103.80186895349637
The p-value is: 0.0
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 4

In [12]:
alpha = 0.05
mean = 26
sample_mean = np.mean(four.time_range)
std = np.std(four.time_range)
n = len(four.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 52.72451102788182
The standard deviation is: 105.34316162190886
The sample size is: 2403
The z-score is: 12.43598829625382
The p-value is: 1.6666659269177442e-35
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 5

In [13]:
alpha = 0.05
mean = 26
sample_mean = np.mean(five.time_range)
std = np.std(five.time_range)
n = len(five.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 97.3076923076923
The standard deviation is: 104.98855953453257
The sample size is: 78
The z-score is: 5.998486779734139
The p-value is: 1.9916473109801197e-09
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 6

In [14]:
alpha = 0.05
mean = 26
sample_mean = np.mean(six.time_range)
std = np.std(six.time_range)
n = len(six.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 348.17506631299733
The standard deviation is: 336.6875482711004
The sample size is: 377
The z-score is: 18.57956520578397
The p-value is: 4.7030003364981436e-77
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 7

In [15]:
alpha = 0.05
mean = 26
sample_mean = np.mean(seven.time_range)
std = np.std(seven.time_range)
n = len(seven.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 45.348066298342545
The standard deviation is: 77.96064568271822
The sample size is: 362
The z-score is: 4.721896079337366
The p-value is: 2.336560689490869e-06
Reject the null hypothesis. Permit type does have an effect on approval time.


###### Permit Type 8

In [16]:
alpha = 0.05
mean = 26
sample_mean = np.mean(eight.time_range)
std = np.std(eight.time_range)
n = len(eight.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Permit type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 10.870757195468087
The standard deviation is: 42.73928156074344
The sample size is: 170524
The z-score is: -146.17824079859943
The p-value is: 0.0
Reject the null hypothesis. Permit type does have an effect on approval time.


**Second Hypothesis Test**

In [17]:
df.time_range.mean()

26.054696673189824

In [18]:
west = df[df.region == "west"]
north = df[df.region == "north"]
east = df[df.region == "east"]
south = df[df.region == "south"]
central = df[df.region == "central"]
unknown = df[df.region == "unknown"]

In [19]:
west = west[west.notnull()]
north = north[north.notnull()]
east = east[east.notnull()]
south = south[south.notnull()]
central = central[central.notnull()]
unknown = unknown[unknown.notnull()]

###### West

In [20]:
#### West
alpha = 0.05
mean = 26
sample_mean = np.mean(west.time_range)
std = np.std(west.time_range)
n = len(west.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Region does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 22.76887700185916
The standard deviation is: 88.80738941133406
The sample size is: 32892
The z-score is: -6.598565432335644
The p-value is: 4.151552475052067e-11
Reject the null hypothesis. Region does have an effect on approval time.


###### East

In [21]:
#### East
alpha = 0.05
mean = 26
sample_mean = np.mean(east.time_range)
std = np.std(east.time_range)
n = len(east.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Region does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 27.380936507936507
The standard deviation is: 90.83171609787149
The sample size is: 68313
The z-score is: 3.9736340713100495
The p-value is: 7.07842844402755e-05
Reject the null hypothesis. Region does have an effect on approval time.


###### South

In [22]:
#### South
alpha = 0.05
mean = 26
sample_mean = np.mean(south.time_range)
std = np.std(south.time_range)
n = len(south.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Region does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 29.127747494285213
The standard deviation is: 95.96740615457011
The sample size is: 24645
The z-score is: 5.116492471185624
The p-value is: 3.112697038251377e-07
Reject the null hypothesis. Region does have an effect on approval time.


###### North

In [23]:
#### North
alpha = 0.05
mean = 26
sample_mean = np.mean(north.time_range)
std = np.std(north.time_range)
n = len(north.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Region does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 25.103924502244215
The standard deviation is: 90.23561939104182
The sample size is: 28391
The z-score is: -1.6732350110967684
The p-value is: 0.09428104054227561
Fail to reject the null hypothesis.


###### Central

In [24]:
#### Central
alpha = 0.05
mean = 26
sample_mean = np.mean(central.time_range)
std = np.std(central.time_range)
n = len(central.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Region does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 25.976926939492845
The standard deviation is: 92.0594140851939
The sample size is: 42943
The z-score is: -0.051937756864091696
The p-value is: 0.9585782892340396
Fail to reject the null hypothesis.


###### Third Hypothesis Test 

In [25]:
df['proposed_construction_type_description'].value_counts()

wood frame (5)    114382
unknown            43162
constr type 1      27841
constr type 3       9360
constr type 2       3778
constr type 4        377
Name: proposed_construction_type_description, dtype: int64

In [26]:
constr_type_1 = df[df['proposed_construction_type_description'] == 'constr type 1']
constr_type_2 = df[df['proposed_construction_type_description'] == 'constr type 2']
constr_type_3 = df[df['proposed_construction_type_description'] == 'constr type 3']
constr_type_4 = df[df['proposed_construction_type_description'] == 'constr type 4']
wood_frame = df[df['proposed_construction_type_description'] == 'wood frame (5)']
unknown = df[df['proposed_construction_type_description'] == 'unknown']

In [27]:
constr_type_1 = constr_type_1[constr_type_1.notnull()]
constr_type_2 = constr_type_2[constr_type_2.notnull()]
constr_type_3 = constr_type_3[constr_type_3.notnull()]
constr_type_4 = constr_type_4[constr_type_4.notnull()]
wood_frame = wood_frame[wood_frame.notnull()]
unknown = unknown[unknown.notnull()]

In [28]:
#### Construction Type 1
alpha = 0.05
mean = 26
sample_mean = np.mean(constr_type_1.time_range)
std = np.std(constr_type_1.time_range)
n = len(constr_type_1.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Proposed construction type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 22.58874392623484
The standard deviation is: 70.70944997317272
The sample size is: 27841
The z-score is: -8.049692269136994
The p-value is: 8.30024686457201e-16
Reject the null hypothesis. Proposed construction type does have an effect on approval time.


In [29]:
#### Construction Type 2
alpha = 0.05
mean = 26
sample_mean = np.mean(constr_type_2.time_range)
std = np.std(constr_type_2.time_range)
n = len(constr_type_2.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Proposed construction type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 34.23252412986253
The standard deviation is: 86.71571445851102
The sample size is: 3778
The z-score is: 5.8353402270609385
The p-value is: 5.368091501180551e-09
Reject the null hypothesis. Proposed construction type does have an effect on approval time.


In [30]:
#### Construction Type 3
alpha = 0.05
mean = 26
sample_mean = np.mean(constr_type_3.time_range)
std = np.std(constr_type_3.time_range)
n = len(constr_type_3.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Proposed construction type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 36.258513750731424
The standard deviation is: 99.99753299655781
The sample size is: 9360
The z-score is: 9.925058688282876
The p-value is: 3.2391212411203996e-23
Reject the null hypothesis. Proposed construction type does have an effect on approval time.


In [31]:
#### Construction Type 4
alpha = 0.05
mean = 26
sample_mean = np.mean(constr_type_4.time_range)
std = np.std(constr_type_4.time_range)
n = len(constr_type_4.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Proposed construction type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 33.88823529411765
The standard deviation is: 82.89620996340895
The sample size is: 377
The z-score is: 1.8476336219303562
The p-value is: 0.06465536239705377
Fail to reject the null hypothesis.


In [32]:
#### Wood Frame
alpha = 0.05
mean = 26
sample_mean = np.mean(wood_frame.time_range)
std = np.std(wood_frame.time_range)
n = len(wood_frame.time_range)
print("The sample mean is: {}".format(sample_mean))
print("The standard deviation is: {}".format(std))
print("The sample size is: {}".format(n))

z = (sample_mean - mean)/(std/np.sqrt(n))
print("The z-score is: {}".format(z))

p_value = stats.norm.cdf(x=-z)*2
print("The p-value is: {}".format(p_value))

if p_value <= alpha:
    print("Reject the null hypothesis. Proposed construction type does have an effect on approval time.")

if p_value > alpha:
    print("Fail to reject the null hypothesis.")

The sample mean is: 32.451668501939004
The standard deviation is: 103.32312280053164
The sample size is: 114382
The z-score is: 21.11802812003508
The p-value is: 5.431525646568726e-99
Reject the null hypothesis. Proposed construction type does have an effect on approval time.
