In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import re, glob

import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats



In [21]:
import re, glob

import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Path of data
path="/Users/jbrosamer/PonyPricerFiles/Batchs/DressageId1000*.csv"     
    
def all_data(path):
    """
    Takes in: with wildcarding of dataframes stored in .csv
    
    Returns: a dataframe of all 
    """
    files=glob.glob(path)
    dfs=[]
    frame=pd.DataFrame()
    for f in files:
        dfs.append(pd.read_csv(f))
    frame = pd.concat(dfs)
    frame = frame.reset_index().drop('index', axis = 1)
    return frame

def clean_col(df):
    """
    Takes in: dataframe from all_data

    Returns: a dataframe after converting null values
    """
    for col in df.columns:
        # Honda civics are 4 cylinders
        if col == 'cylinders':
            df.loc[pd.isnull(df[col]), col] = 4
        elif col == 'extra':
            df.loc[pd.isnull(df[col]), col] = 'normal'
        elif col == 'fuel':
            df.loc[pd.isnull(df[col]), col] = 'gas'
        elif col == 'drive':
            df.loc[pd.isnull(df[col]), col] = 'fwd'
        elif col == 'transmission':
            df.loc[pd.isnull(df[col]), col] = 'auto'
        elif col == 'type':
            df.loc[pd.isnull(df[col]), col] = 'unknown'
    return df
        
def encode(df):
    """
    Takes in: dataframe from clean_col
    
    Returns: a dataframe that LabelEncodes the categorical variables
    """
    columns = ['drive', 'extra', 'fuel',
               'odometer', 'title_stat',
               'transmission', 'type']
    for col in columns:
        le = LabelEncoder()
        le.fit(df[col])
        df[col] = le.transform(df[col])
    
    final_cols = ['year', 'cylinders', 'drive', 'extra',
                  'fuel', 'odometer', 'title_stat',
                  'transmission', 'type', 'price']
    # Order columns with price as the last column
    df = df[final_cols]
    return df

class Model():
    
    def __init__(self, df, params, test_size = 0.3):
        self.df = df
        self.params = params
        self.test_size = 0.3
        
    def split(self):
        np.random.seed(1)
        self.df = self.df.reindex(np.random.permutation(self.df.index))
        self.df = self.df.reset_index().drop('index', axis = 1)
        X = self.df.as_matrix(self.df.columns[:-1])
        y = self.df.as_matrix(['price'])[:,0]
        X_train, X_test, y_train, y_test = train_test_split(
                                                X, y,
                                                test_size=self.test_size,
                                                )
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
    def kfold_cv(self, n_folds = 3):
        """
        Takes in: number of folds
        
        Prints out RMSE score and stores the results in self.results
        """

        cv = KFold(n = self.X_train.shape[0], n_folds = n_folds)
        gbr = GradientBoostingRegressor(**self.params)
        self.rmse_cv = []
        self.results = {'pred': [],
                   'real': []}
        
        for train, test in cv:
            gbr.fit(self.X_train[train], self.y_train[train])
            pred = gbr.predict(self.X_train[test])
            error = mean_squared_error(pred, self.y_train[test])**0.5
            self.results['pred'] += list(pred)
            self.results['real'] += list(self.y_train[test])
            self.rmse_cv += [error]
        print 'RMSE Scores:', self.rmse_cv
        print 'Mean RMSE:', np.mean(self.rmse_cv)
        
    def plot_results(self):
        """
        Plots results from CV
        """
        plt.style.use('ggplot')
        fig, ax = plt.subplots(figsize = (12,10))

        ax.scatter(self.results['real'], self.results['pred'], color = (0.6,0.0,0.2),
                   label = 'Model Predictions',
                   s = 100, alpha = 0.4)
        ax.plot(np.arange(0, 50000),np.arange(0, 50000), color = 'black',
                   label = 'Perfect Prediction Line',
                   lw = 4, alpha = 0.5, ls = 'dashed')

        ax.set_xlabel('Actual Price ($)',fontsize = 20)
        ax.set_ylabel('Predicted Price ($)', fontsize = 20)
        ax.set_title('Results from KFold Cross-Validation', fontsize = 25)
        ax.set_xlim(0,30000)
        ax.set_ylim(0,30000)
        ax.legend(loc=2, fontsize = 16)
        ax.tick_params(labelsize =20)
    
    def validate(self):
        """
        Validate Model on Test set
        """
        gbr = GradientBoostingRegressor(**self.params)
        gbr.fit(self.X_train, self.y_train)
        self.preds = gbr.predict(self.X_test)
        self.rmse = mean_squared_error(self.preds, self.y_test)**0.5
        print 'RMSE score:', self.rmse
        
        plt.style.use('ggplot')
        fig, ax = plt.subplots(figsize = (12,10))

        ax.scatter(self.y_test, self.preds, color = (0.6,0.0,0.2),
                   label = 'Model Predictions',
                   s = 100, alpha = 0.4)
        ax.plot(np.arange(0, 50000),np.arange(0, 50000), color = 'black',
                   label = 'Perfect Prediction Line',
                   lw = 4, alpha = 0.5, ls = 'dashed')

        ax.set_xlabel('Actual Price ($)',fontsize = 20)
        ax.set_ylabel('Predicted Price ($)', fontsize = 20)
        ax.set_title('Results from Test Set', fontsize = 25)
        ax.set_xlim(0,30000)
        ax.set_ylim(0,30000)
        ax.legend(loc=2, fontsize = 16)
        ax.tick_params(labelsize =20)
df_test = all_data(path)

In [38]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,breeds,price,color,location,age,zip,height,temp,warmblood,for sale,for lease,registered,skills
0,2021017,"['Andalusian', 'Related Searches by Breed']",8500,Grey,Crescent Valley British Columbia V0G 1H0,2,0,0 Inches,11,False,True,False,True,"['Champion Pedigree', 'Dressage', 'Halter', 'S..."
1,2020662,"['Friesian CrossHalf Arabian', 'Related Search...",5000,Bay,Stanwood Washington 98292,1,98292,0 Inches,11,False,True,False,True,"['Dressage', 'Sporthorse']"
2,1996336,"['Welsh Cob', 'Related Searches by Breed']",8500,Bay,Carlisle Massachusetts 01741,14,1741,14.3 hh,5,False,True,False,True,[]
3,2020354,"['AQHA Quarter Horse Appendix', 'Related Searc...",9500,Bay,Port Orchard Washington 98366,13,98366,16.0 hh,3,False,True,False,True,"['Dressage', 'Eventing', 'Hunt Seat Equitation..."
4,2020339,"['Dutch Warmblood', 'Related Searches by Breed']",18000,Bay,Waddell Arizona 85355,14,85355,17.0 hh,5,False,True,False,True,"['Dressage', 'Eventing', 'Trail Riding, Recrea..."


In [None]:
df_test = all_data(path)
df = df_test.copy()
df = clean_col(df)
df = encode(df)
params_gbr = {'loss': 'ls',
              'learning_rate': 0.02,
              'n_estimators': 500,
              'max_depth': 6,
              'min_samples_split': 2,
              'min_samples_leaf': 13,
              'subsample': 0.7
             }
b = Model(df, params = params_gbr)
b.split()
b.kfold_cv(n_folds = 3)

In [56]:
breed=pd.Series(index=df_test.index)
height=pd.Series(index=df_test.index)
print type(df_test['breeds'][1])


<type 'str'>


In [52]:
for i in df_test.index:
    print df_test[i]['breeds'][0]
    breed[i]=df_test[i]['breeds'][0]
    if "hh" in df_test[i]['height']:
        height[i]=float(df_test[i]['height'].split('.')[0])*4+float(df_test[i]['height'].split('.')[0])

KeyError: 0

In [None]:
b.plot_results()
plt.savefig('app/static/img/cv_results.png')

In [None]:
b.validate()
plt.savefig('app/static/img/test_results.png')

In [None]:
slope, intercept, r_value, p_value, std_err = stats.linregress(b.results['real'],b.results['pred'])
print "r-squared for cv:", r_value**2 
print "slope for cv:", slope

In [None]:
slope, intercept, r_value, p_value, std_err = stats.linregress(b.y_test,b.preds)
print "r-squared for test set:", r_value**2 
print "slope for test:", slope

In [None]:
plt.style.use('ggplot')
df = df_test.copy()
df = clean_col(df)
idx = df.loc[df['price']>80000,:].index
df = df.drop(idx, axis = 0)
df_temp = df[['type','odometer', 'year', 'price']]
plot = sns.pairplot(df_temp, hue = 'type')
for ax in plot.axes.flat:
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, horizontalalignment='right')