In [1]:
import pandas as pd
house = pd.read_csv('https://raw.githubusercontent.com/learn-co-students/dsc-v2-mod1-final-project-dc-ds-career-042219/master/kc_house_data.csv')

In [2]:
house.shape

(21597, 21)

### Inspect repeated ids to find out if they're duplicates

In [None]:
house.id.value_counts().head()

# house[house.id == 1825069031]

### Drop rows

In [7]:
def drop_anomolous_rows(df, colname, val):
    return df[df[colname] != val]

In [8]:
house = drop_anomolous_rows(house, 'bedrooms', 33)

In [9]:
house = drop_anomolous_rows(house, 'sqft_basement', "?")

In [10]:
house.shape

(21142, 21)

### Change nulls to unknown for categorical values so we can get dummies

In [11]:
def nan_to_unknown(df, colname):
    mask = df[colname].isna()
    df.loc[mask, colname] = 'unknown'
    return df

In [12]:
house = nan_to_unknown(house, 'waterfront')
# house = nan_to_unknown(house, 'yr_renovated')

### Bin yr_built and yr_renovated by decade

In [13]:
from sklearn.preprocessing import LabelEncoder

def bin_by_decade(df, colname):
    bins = pd.IntervalIndex.from_tuples([(1900, 1909), (1910, 1919), (1920, 1929), (1930, 1939), (1940, 1949), (1950, 1959), (1960, 1969), (1970, 1979), (1980, 1989), (1990, 1999), (2000, 2010), (2010, 2020)])
    series = pd.cut(df[colname], bins)
    cats = series.astype('category')
    
    return pd.get_dummies(cats)
        

In [14]:
yr_built_df = bin_by_decade(house, 'yr_built')
yr_renovated_df = bin_by_decade(house, 'yr_renovated')



In [15]:
yr_renovated_df.shape

(21142, 12)

### Get dummies on other discrete or categorical values

In [16]:
## Waterfront View Floors Bedrooms Bathrooms Condition Grade Zipcode

In [17]:
def dummies_df(df, col_list):
    dummy_df = pd.DataFrame({})
    for column in col_list:
        d = pd.get_dummies(df[column], prefix=column + "_")
        dummy_df = pd.concat([dummy_df, d], axis=1)
    return dummy_df

In [18]:
dummies_for_house = dummies_df(house, ['waterfront', 'view', 'floors', 'bedrooms', 'condition', 'grade', 'zipcode'])

In [19]:
dummies_for_house.shape

(21142, 111)

In [20]:
## Turn date into datetime object
pd.to_datetime(house.date)

0       2014-10-13
1       2014-12-09
2       2015-02-25
3       2014-12-09
4       2015-02-18
5       2014-05-12
7       2015-01-15
8       2015-04-15
9       2015-03-12
10      2015-04-03
11      2014-05-27
12      2014-05-28
13      2014-10-07
14      2015-03-12
15      2015-01-24
16      2014-07-31
17      2014-05-29
19      2015-04-24
20      2014-05-14
21      2014-08-26
22      2014-07-03
23      2014-05-16
24      2014-11-20
25      2014-11-03
26      2014-06-26
27      2014-12-01
28      2014-06-24
29      2015-03-02
30      2014-11-10
31      2014-12-01
           ...    
21566   2014-11-12
21567   2014-06-10
21568   2014-12-02
21569   2014-08-28
21570   2014-10-15
21571   2015-03-05
21572   2014-11-13
21573   2014-09-10
21574   2014-05-14
21575   2014-10-02
21576   2015-04-16
21577   2015-03-17
21578   2014-10-17
21579   2014-10-31
21580   2014-08-13
21582   2014-10-13
21583   2014-09-15
21584   2014-10-15
21585   2015-04-07
21586   2014-06-26
21587   2014-08-25
21588   2015

In [21]:
house.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21142 entries, 0 to 21596
Data columns (total 21 columns):
id               21142 non-null int64
date             21142 non-null object
price            21142 non-null float64
bedrooms         21142 non-null int64
bathrooms        21142 non-null float64
sqft_living      21142 non-null int64
sqft_lot         21142 non-null int64
floors           21142 non-null float64
waterfront       21142 non-null object
view             21081 non-null float64
condition        21142 non-null int64
grade            21142 non-null int64
sqft_above       21142 non-null int64
sqft_basement    21142 non-null object
yr_built         21142 non-null int64
yr_renovated     17388 non-null float64
zipcode          21142 non-null int64
lat              21142 non-null float64
long             21142 non-null float64
sqft_living15    21142 non-null int64
sqft_lot15       21142 non-null int64
dtypes: float64(7), int64(11), object(3)
memory usage: 3.5+ MB


In [22]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.heatmap(house.corr(), center=0)


<matplotlib.axes._subplots.AxesSubplot at 0x1a20643860>

In [None]:
house.corr()

In [25]:
clean_house.describe()

NameError: name 'clean_house' is not defined