In [1]:
from datetime import datetime as dt 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Read the cleaned permits table; the first CSV column becomes the row index.
df = pd.read_csv('Building_Permits_Cleaned.csv', index_col=0)

In [3]:
# Relabel the 'undetermined' time-frame value as 'unknown'.
df.loc[df['time_frame'].eq('undetermined'), 'time_frame'] = 'unknown'

In [4]:
def to_category(columns, dataframe):
    """Cast each named column of ``dataframe`` to the ``category`` dtype.

    Parameters
    ----------
    columns : iterable of column labels to convert.
    dataframe : pandas DataFrame holding those columns.

    The frame is modified in place, one column at a time; nothing is returned.
    """
    target_dtype = 'category'
    for name in columns:
        series = dataframe[name]
        dataframe[name] = series.astype(target_dtype)

def to_integer(columns, dataframe):
    """Cast each named column of ``dataframe`` to the ``int64`` dtype.

    Parameters
    ----------
    columns : iterable of column labels to convert.
    dataframe : pandas DataFrame holding those columns.

    The frame is modified in place, one column at a time; nothing is returned.
    """
    target_dtype = 'int64'
    for name in columns:
        series = dataframe[name]
        dataframe[name] = series.astype(target_dtype)

In [5]:
# Parse every date-valued column into pandas datetimes.
date_columns = [
    'permit_creation_date',
    'current_status_date',
    'filed_date',
    'issued_date',
    'completed_date',
    'first_construction_document_date',
    'permit_expiration_date',
]
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column])

# Store the label-valued columns with the category dtype.
category_columns = [
    'permit_type_definition',
    'current_status',
    'fire_only_permit',
    'existing_use',
    'proposed_use',
    'existing_construction_type',
    'proposed_construction_type',
    'existing_construction_type_description',
    'proposed_construction_type_description',
    'tidf_compliance',
    'site_permit',
    'neighborhoods',
    'zipcode',
    'time_frame',
    'decision',
    'region',
]
to_category(columns=category_columns, dataframe=df)

In [6]:
# Remove the columns below from the working frame (rebinding instead of
# mutating in place, so the cell reads as a plain transformation).
df = df.drop(columns=[
    'permit_creation_date',
    'block',
    'lot',
    'street_number',
    'street_name',
    'unit',
    'tidf_compliance',
    'fire_only_permit',
    'first_construction_document_date',
    'permit_expiration_date',
    'address',
    'location',
    'site_permit',
])

In [7]:
# Set aside the three descriptive columns in their own frame.
df_cols = df.loc[:, [
    'permit_type_definition',
    'existing_construction_type_description',
    'proposed_construction_type_description',
]]

In [8]:
# Remove the descriptive text columns from the working frame.
df = df.drop(columns=[
    'permit_type_definition',
    'existing_construction_type_description',
    'proposed_construction_type_description',
])

In [9]:
# Keep only the rows whose time frame is known.  A boolean mask is used rather
# than dropping by index label, so rows that merely share an index label with
# an 'unknown' row can never be removed by accident.  `.copy()` gives an
# independent frame, so later column assignments do not warn about writing
# to a slice.
df = df[df['time_frame'] != 'unknown'].copy()

# 'time_frame' was converted to a categorical earlier in the notebook; forget
# the now-empty 'unknown' level so that one-hot encoding the column later does
# not emit an all-zero indicator for it.
df['time_frame'] = df['time_frame'].cat.remove_unused_categories()

In [10]:
# Permits whose status is 'withdrawn' or 'cancelled' get time_range = -10.
# NOTE(review): -10 looks like a sentinel rather than a measured duration,
# yet time_range is later used as the regression target — confirm intent.
df.loc[df['current_status'].isin(['withdrawn', 'cancelled']), 'time_range'] = -10

In [11]:
# Remove the remaining date columns and the other listed fields before
# modelling.
df = df.drop(columns=[
    'plansets',
    'supervisor_district',
    'revised_cost',
    'completed_date',
    'current_status_date',
    'description',
    'filed_date',
    'issued_date',
])

In [12]:
# Summarise the remaining columns: non-null count and dtype for each one.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186052 entries, 0 to 198899
Data columns (total 19 columns):
permit_type                   186052 non-null int64
current_status                186052 non-null category
number_of_existing_stories    145122 non-null float64
number_of_proposed_stories    145019 non-null float64
estimated_cost                148960 non-null float64
existing_use                  186052 non-null category
existing_units                137044 non-null float64
proposed_use                  186052 non-null category
proposed_units                137524 non-null float64
existing_construction_type    186052 non-null category
proposed_construction_type    186052 non-null category
neighborhoods                 186052 non-null category
zipcode                       186052 non-null category
latitude                      184399 non-null float64
longitude                     184399 non-null float64
time_range                    186052 non-null float64
time_frame          

Columns whose missing values need mean imputation: number_of_existing_stories, number_of_proposed_stories, estimated_cost, existing_units, proposed_units, latitude, longitude.
Categorical columns that need to be converted to numeric codes: current_status, existing_use, proposed_use, existing_construction_type, proposed_construction_type, neighborhoods,
zipcode, time_frame, decision, region.

In [13]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per category.
df = pd.get_dummies(df, columns=[
    'current_status',
    'existing_use',
    'proposed_use',
    'existing_construction_type',
    'proposed_construction_type',
    'neighborhoods',
    'zipcode',
    'time_frame',
    'decision',
    'region',
])

In [14]:
# Target vector: the time_range column as a NumPy array.
y = df['time_range'].values

# Feature matrix: every other column of the encoded frame.
# NOTE(review): the time_frame_* and decision_* indicator columns stay in X;
# if either was derived from time_range this leaks the target — confirm.
X = df.drop(columns='time_range').values

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22.  Its replacement, `sklearn.impute.SimpleImputer`, also
# defaults to mean imputation, so alias it to the old name; existing
# `Imputer()` calls then work on both old and new scikit-learn releases.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer


In [16]:
# Modelling pipeline: impute missing values, standardise the features, then
# fit a Lasso regression.  All three steps use their default settings.
imp, scaler, reg = Imputer(), StandardScaler(), Lasso()
pipeline = make_pipeline(imp, scaler, reg)

In [17]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, random_state=42, test_size=0.3)

# 5-fold cross-validation of the whole pipeline on the training portion only.
cv_score = cross_val_score(estimator=pipeline, X=Xtrain, y=ytrain, cv=5)
print(cv_score)

[0.76428774 0.77065914 0.77071026 0.76103407 0.75197107]


In [21]:
# `best_estimator_` only exists on fitted hyper-parameter searches such as
# GridSearchCV; a plain Pipeline is itself the estimator.  Fit it on the
# training split and report its score on the held-out rows.
pipeline.fit(Xtrain, ytrain)
pipeline.score(Xtest, ytest)

In [22]:
# cross_val_score fits clones of the pipeline, so the Lasso step held by
# `pipeline` has no `coef_` until the pipeline itself is fitted.  Fit on the
# training split, then read the coefficients from the fitted step
# (make_pipeline names each step after its lower-cased class name).
pipeline.fit(Xtrain, ytrain)
pipeline.named_steps['lasso'].coef_