# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split


# Load the raw housing listings and take a first look at them.
df = pd.read_csv('project-04/housing.csv')
df.head()

df.shape

# Number of columns per dtype. `DataFrame.get_dtype_counts()` was deprecated
# and later removed from pandas; `dtypes.value_counts()` is the replacement.
df.dtypes.value_counts()

df.isnull().any()

# zip is the only column with no missing values

df.isnull().sum().sum()

# there are 3662 pieces of missing data in the df

df.isnull().sum()

# likely the 280 are all the same rows, so we can find those and drop them

df['address'].head()

# lots of address rows with 'sign in for details', some NaN

# Keep only rows that have at least two non-null values.
df = df.dropna(thresh=2)

# Show every row whose price is NaN.
df.loc[df['price'].isnull()]

# Positional indexing: all rows (:) and the first five columns (0:5).
# `.iloc` replaces the deprecated/removed `.ix` indexer.
len(df.iloc[:, 0:5])

# Drop the rows that are empty across those five columns and compare the
# length, to see whether the number dropped was 280.
len(df.iloc[:, 0:5].dropna(how='all'))

df.shape

# that column was not needed
del df['more_info']

df.head()

df.isnull().sum()

df['price']

# How many rows carry 'Foreclosure' text in the bed_bath column? Checked to
# decide whether those rows are worth keeping. There are enough to keep.
len(df[df['bed_bath'].str.contains('Foreclosure', na=False)])

# Now how to drop the rows that have it.
#
# Idea: if df['price'] is null, drop the row unless bed_bath contains
# 'Foreclosure' (i.e. keep foreclosure rows even when price is NaN).
# In the interest of time this was not implemented; all rows with any NaN
# are dropped instead.

df.dropna(how='any', inplace=True)

df.head()

# get rid of the unnecessary characters in the price column

def erase(symbol):
    """Normalise a listing price string to a plain numeric string.

    Strips '$', ',' and '+', and expands a trailing 'K' (thousands) or
    'M' (millions) suffix. The suffix is expanded by shifting the decimal
    point rather than by appending zeros, so '$1.5M' becomes '1500000'
    instead of '1.5000000' (which would parse as 1.5).

    Args:
        symbol: Raw price text, e.g. '$450K', '$1.25M', '$1,250,000+'.

    Returns:
        A string suitable for ``float()``. Input whose suffix is not preceded
        by a plain decimal number falls back to the literal replacement of
        'K' -> '000' and 'M' -> '000000'.
    """
    cleaned = symbol.replace('$', '').replace(',', '').replace('+', '')

    for suffix, zeros in (('K', 3), ('M', 6)):
        if not cleaned.endswith(suffix):
            continue
        whole, _, fraction = cleaned[:-1].partition('.')
        if not (whole + fraction).isdigit():
            # Not a simple number before the suffix: use the literal fallback.
            break
        # Shift the decimal point `zeros` places to the right, textually, so
        # no floating-point rounding is introduced.
        fraction = fraction.ljust(zeros, '0')
        scaled = whole + fraction[:zeros]
        leftover = fraction[zeros:]
        if leftover.strip('0'):
            scaled += '.' + leftover
        return scaled

    return cleaned.replace('K', '000').replace('M', '000000')

# Strip currency symbols and magnitude suffixes from every price.
df['price'] = df['price'].map(erase)

df.head()

df['price'].head()

# The cleaned price strings are converted to floats.
df['price'] = df['price'].map(float)

df['price'].head()

df['bed_bath'].head()

# Next: clean up the bed_bath column.

def buh_bye(crap):
    """Strip unit labels and filler from a bed/bath/sqft description.

    The substitutions are applied one after another in the order listed,
    because later ones rely on what earlier ones leave behind (e.g. the
    blanket space removal runs after the labelled units are gone).

    Args:
        crap: Raw description text such as '3 bds · 2 ba · 1,500 sqft'.

    Returns:
        The text with commas, unit labels, '--', 'lot', 'ac' and spaces
        removed, and 'Studio' replaced by '0'.
    """
    substitutions = (
        (',', ''),
        (' bds', ''),
        ('  ba', ''),
        ('ba', ''),
        (' bd', ''),
        (' sqft', ''),
        ('--', ''),
        ('Studio', '0'),
        ('lot', ''),
        (' ', ''),
        ('ac', ''),
    )
    for old, new in substitutions:
        crap = crap.replace(old, new)
    return crap

# Run the label-stripping helper over every bed_bath entry.
df['bed_bath'] = df['bed_bath'].map(buh_bye)

df.head()

df['bed_bath'].head()

# Next: split each entry at the '·' separator.

def moving(stuff):
    """Return the first '·'-separated field of *stuff* as a float.

    Args:
        stuff: Cleaned description text, e.g. '3·2·1500'.

    Returns:
        The first field as a float, or 0 when the field is empty or not
        numeric, or when *stuff* is not a string.
    """
    try:
        return float(stuff.split('·')[0])
    # Only parsing failures are treated as "no value"; a bare `except` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return 0

def moving2(stuff):
    """Return the second '·'-separated field of *stuff* as a float.

    Args:
        stuff: Cleaned description text, e.g. '3·2·1500'.

    Returns:
        The second field as a float, or 0 when the field is missing, empty
        or not numeric, or when *stuff* is not a string.
    """
    try:
        return float(stuff.split('·')[1])
    # Only parsing failures are treated as "no value"; a bare `except` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return 0
        
def moving3(stuff):
    """Return the third '·'-separated field of *stuff* as a float.

    Args:
        stuff: Cleaned description text, e.g. '3·2·1500'.

    Returns:
        The third field as a float, or 0 when the field is missing, empty
        or not numeric, or when *stuff* is not a string.
    """
    try:
        return float(stuff.split('·')[2])
    # Only parsing failures are treated as "no value"; a bare `except` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return 0
    
# First '·' field of bed_bath -> bedrooms.
df['bedrooms'] = df['bed_bath'].map(moving)

df['bedrooms'].value_counts()

# Second '·' field of bed_bath -> bathrooms.
df['bathrooms'] = df['bed_bath'].map(moving2)

df['bathrooms'].value_counts()

# Third '·' field of bed_bath -> square_feet.
df['square_feet'] = df['bed_bath'].map(moving3)

df['square_feet'].head()

df.head(10)

df.dtypes

df['bedrooms'].value_counts()

# The bedroom data is wonky - some rows have an insane number of rooms - so it gets cleaned up next.

def too_many(bedrooms):
    """Return *bedrooms* if it is a plausible count, otherwise 0.

    A count is kept only when it lies in the inclusive range 1..10. The
    range must be tested with a chained comparison (logical AND): testing
    ``bedrooms >= 1 or bedrooms <= 10`` is true for every number, so no
    outlier would ever be zeroed.

    Args:
        bedrooms: Numeric bedroom count.

    Returns:
        *bedrooms* unchanged when 1 <= bedrooms <= 10, else 0 (NaN also
        yields 0 because every comparison with NaN is false).
    """
    if 1 <= bedrooms <= 10:
        return bedrooms
    return 0

# Zero out implausible bedroom counts.
df['bedrooms'] = df['bedrooms'].apply(too_many)

df['bedrooms'].value_counts()

# still looks a little wonky, but is better

# Check summary stats to decide what counts as 'high priced' housing.

df['price'].describe()

df.columns

# Housing in roughly the top quartile was chosen as 'high priced': a price at
# or above $500,000. Price becomes a binary categorical variable on that
# cut-off (1 = high priced, 0 = otherwise).
df['dummy_price'] = (df['price'] >= 500000).astype(int)

# Logistic regression: predict the high-price flag from the size features.

y = df['dummy_price']
x = df[['bedrooms', 'bathrooms', 'square_feet']]

lr = LogisticRegression()

# Fix the seed so the train/test partition, and therefore the scores reported
# below, are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

lr.fit(x_train, y_train)

# Accuracy on the data the model was fitted on.
lr.score(x_train, y_train)

# Accuracy on the held-out data.
lr.score(x_test, y_test)





    













