In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import  train_test_split
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [8]:
class HousingPrice():
    '''
    Housing price prediction in SACRAMENTO

    Reads a real-estate transactions CSV, cleans it, one-hot encodes the
    categorical columns, fits a Ridge regression on a train split and stores
    the mean squared error measured on the held-out split.

    Typical use: ``hp = HousingPrice(path); hp.fit(); hp.predict()`` and then
    read ``hp.test_mse``.
    '''
    def __init__(self, fileName, alpha=0.3, test_size=0.2, random_state=48):
        '''
        Parameters
        ----------
        fileName : path or file-like object handed to ``pd.read_csv``
        alpha : Ridge regularisation strength (default 0.3)
        test_size : fraction of rows held out for testing (default 0.2)
        random_state : seed used for the train/test split (default 48)
        '''
        self.fileName = fileName
        self.alpha = alpha
        self.test_size = test_size
        self.random_state = random_state
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.test_mse = 0
        # Street types seen exactly once; filled in by data_wrangling()
        self.others = []

    def extract_road(self, x):
        '''
        Extract street type from each address

        The address is split on single spaces and the street type is taken as:
        * the third token from the end when a 'Unit' token is present
          (e.g. '12 MAIN AVE Unit 4' -> 'AVE'),
        * the second token from the end when the last token is all digits
          (e.g. '100 HWY 99' -> 'HWY'),
        * the last token otherwise (e.g. '3526 HIGH ST' -> 'ST').

        Addresses too short for a rule fall through to the next rule instead
        of raising IndexError.
        '''
        tokens = x.split(' ')

        if 'Unit' in tokens and len(tokens) >= 3:
            return tokens[-3]
        if tokens[-1].isdigit() and len(tokens) >= 2:
            return tokens[-2]
        return tokens[-1]

    def sacram(self, x):
        '''
        Transform other cities except SACRAMENTO into Others

        Returns 'SACRAMENTO' unchanged and the label 'others' for any other
        value (the comparison is exact and case-sensitive).
        '''
        if x == 'SACRAMENTO':
            return x
        return 'others'

    def other_transform(self, x):
        '''
        Transform street types with frequency 1 into others

        Relies on self.others, which data_wrangling() populates.
        '''
        if x in self.others:
            return 'others'
        return x

    def data_wrangling(self, fileName):
        '''
        Data cleaning and wrangling

        Parameters
        ----------
        fileName : path or file-like object handed to ``pd.read_csv``

        Returns
        -------
        pandas.DataFrame with the cleaned rows plus the derived columns
        'city_bi' and 'street_t'.

        Side effect: overwrites self.others with the street types that occur
        exactly once in the cleaned data.
        '''
        data = pd.read_csv(fileName)

        # Delete the data points whose type is 'Unkown'. The literal is kept
        # exactly as written; presumably the data file itself uses this
        # spelling -- TODO confirm against the CSV.
        data = data.query('type!="Unkown"')

        # Drop the first column by position, then zip and state by name.
        # NOTE(review): the positional drop assumes the first CSV column is
        # not a feature (e.g. a saved row index) -- confirm.
        data = data.iloc[:, 1:]
        data = data.drop(['zip', 'state'], axis=1)

        # Copy before assigning so the new column is written to an independent
        # frame rather than to a filtered slice (avoids SettingWithCopyWarning).
        data = data.copy()

        # Extract street type and create a new feature
        data['street_type'] = data.street.apply(self.extract_road)

        # Drop the first remaining column by position.
        # NOTE(review): this is expected to be 'street' -- confirm layout.
        data = data.iloc[:, 1:]

        # Drop rows where beds and baths are both 0, and rows where sq__ft is 0
        data = data.query('beds != 0 or baths != 0').query('sq__ft != 0')

        # Keep only rows with sq__ft below 5000; copy again because
        # more columns are assigned below.
        data = data.query('sq__ft<5000').copy()

        # Transform cities outside Sacramento into 'others'
        data['city_bi'] = data.city.apply(self.sacram)

        # Transform street types with frequency 1 into others
        street_count = data.street_type.value_counts()
        self.others = street_count[street_count == 1].keys().tolist()

        data['street_t'] = data.street_type.apply(self.other_transform)
        data = data.drop('street_type', axis=1)

        return data

    def data_split(self):
        '''
        Data transformation and split

        Cleans the data, one-hot encodes the categorical columns and stores
        the train/test split on self.X_train, self.X_test, self.y_train and
        self.y_test. Returns nothing.
        '''
        # Data Wrangling
        data = self.data_wrangling(self.fileName)

        # Get X and y
        X = data.drop('price', axis=1)
        y = data.price

        # One Hot encoding
        X_one_df = pd.get_dummies(
            X, columns=['city', 'type', 'sale_date', 'city_bi', 'street_t'])

        # DataFrame.as_matrix() no longer exists in current pandas; .values is
        # available in every version. Casting to float first guarantees a
        # numeric matrix even when the dummy columns are boolean.
        X_one = X_one_df.astype(float).values

        # Split the data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X_one, y, test_size=self.test_size, random_state=self.random_state)

    def fit(self):
        '''
        Fit a model

        Runs data_split() and trains a Ridge regression on the training
        split; the fitted estimator is stored on self.model.
        '''
        self.data_split()
        self.model = Ridge(alpha=self.alpha).fit(self.X_train, self.y_train)

    def predict(self):
        '''
        Make prediction on final test set

        Stores the mean squared error of the predictions on self.test_mse.
        Returns nothing.

        Raises
        ------
        AttributeError
            If called before fit().
        '''
        if self.model is None or self.X_test is None:
            raise AttributeError(
                'HousingPrice model is not fitted yet; call fit() before predict().')

        pred = self.model.predict(self.X_test)
        self.test_mse = mean_squared_error(self.y_test, pred)
        

In [9]:
# Build the pipeline object for the cleaned Sacramento transactions CSV
# (the bare file name is resolved against the notebook's working directory).
house_prediction = HousingPrice('sacramento_real_estate_transactions_Clean.csv')
# fit() cleans the data, makes the train/test split and trains the Ridge model.
house_prediction.fit()
# predict() scores the held-out split; it returns nothing and stores the
# result on house_prediction.test_mse.
house_prediction.predict()

In [10]:
# Mean squared error on the held-out test split. It is expressed in squared
# price units, so take its square root for an error on the same scale as price.
house_prediction.test_mse

5837489353.8191128