In [1]:
from ydc.features.get_features import get_features
import pandas as pd
import numpy as np

from pprint import pprint

In [2]:
# get_features returns six objects; unpack them into notebook-level names.
features, df, box, combos, cells, neighbourhood = get_features(status=True)



In [3]:
def _error(df_feat, quality, idx, coeffs):
    if idx is not None:
        return (quality[idx] - (df_feat[idx] * coeffs).sum(axis=1)).abs().mean()
    else:
        return (quality - (df_feat * coeffs).sum(axis=1)).abs().mean()
    
def principal_values(A, n):
    """Return the reconstruction of ``A`` from its ``n`` leading singular triplets.

    Parameters
    ----------
    A : 2-D array-like
        Matrix to approximate.
    n : int
        Number of leading singular values/vectors to keep. Values larger than
        the number of singular values simply use all of them.

    Returns
    -------
    numpy.ndarray
        ``Ahat``, with the same shape as ``A``.

    Side effect: prints the number of components used and the mean squared
    reconstruction error.
    """
    A = np.asarray(A)

    # np.linalg.svd returns the singular values already sorted in descending
    # order, so the components do not need to be re-sorted.
    U, s, Vt = np.linalg.svd(A, full_matrices=False)

    # Keeping fewer components makes the reconstruction less accurate.
    # Scaling the rows of Vt by s is the same as multiplying by diag(s), but
    # avoids materialising the dense diagonal matrix.
    Ahat = np.dot(U[:, :n], s[:n, np.newaxis] * Vt[:n, :])
    print("Using first %s PCs, MSE = %.6G" %(n, np.mean((A - Ahat)**2)))

    return Ahat

def get_coeffs_by_cat(df_feat, df_busi, combos):
    """Fit one coefficient vector per category and score each fit.

    For every ``combo`` in ``combos`` the rows where
    ``df_busi['category'] == combo`` are passed to ``get_coeffs``. The
    resulting coefficients are then scored with ``_error`` on all rows, on the
    rows of that category, and on the remaining rows.

    Parameters
    ----------
    df_feat : pandas.DataFrame
        Feature table, row-aligned with ``df_busi``.
    df_busi : pandas.DataFrame
        Must provide the columns ``'category'`` and ``'real_stars'``.
    combos : iterable
        Category values to fit.

    Returns
    -------
    (coeffs, errors) : tuple of dict
        ``coeffs[combo]`` is the value returned by ``get_coeffs``;
        ``errors[combo]`` is a dict with the keys ``"all"``, ``"combo"`` and
        ``"other"``.
    """
    coeffs = {}
    errors = {}

    # Loop-invariant: the target column is the same for every category.
    real_stars = df_busi['real_stars']

    for combo in combos:
        idx = (df_busi['category'] == combo)

        c = get_coeffs(df_feat[idx], real_stars.loc[idx])
        coeffs[combo] = c

        errors[combo] = {
            "all": _error(df_feat, real_stars, None, c),
            "combo": _error(df_feat, real_stars, idx, c),
            # `~` is the boolean complement; unary minus on a boolean mask is
            # not a supported negation in numpy.
            "other": _error(df_feat, real_stars, ~idx, c)
        }

    return coeffs, errors
    
def get_coeffs(df_feat, quality):
    """Least-squares coefficients mapping the columns of ``df_feat`` onto ``quality``.

    Parameters
    ----------
    df_feat : pandas.DataFrame
        Feature table; its values form the design matrix.
    quality : pandas.Series
        Target values, one per row of ``df_feat``.

    Returns
    -------
    pandas.Series
        One coefficient per column of ``df_feat`` (indexed by column name).
        An empty ``df_feat`` yields all-zero coefficients without solving.
    """
    columns = df_feat.columns
    if df_feat.empty:
        return pd.Series(0, index=columns)

    # lstsq returns a tuple; only the solution vector (first item) is used.
    solution = np.linalg.lstsq(df_feat.values, quality.values)
    return pd.Series(solution[0], index=columns)

def find_best_location():
    """Placeholder with no implementation yet; always returns ``None``."""
    return None


In [5]:
# Baseline: predict the mean of 'real_stars' for every row and report the
# absolute error of that constant guess.
error = df['real_stars'].sub(df['real_stars'].mean()).abs()
print("Guessing a constant %f" % (df['real_stars'].mean(),))
print("error-mean: %f" % (error.mean(),))
print("error-std:  %f" % (error.std(),))

Guessing a constant 3.659542
error-mean: 0.733059
error-std:  0.547270


In [6]:
# One coefficient vector fitted on all rows, scored on those same rows.
c_all = get_coeffs(df_feat=features, quality=df['real_stars'])
e_all = _error(df_feat=features, quality=df['real_stars'], idx=None, coeffs=c_all)
print(e_all)

0.718404888177


In [26]:
from sklearn import linear_model

In [31]:
# Ridge regression on the feature matrix, scored on the data it was fitted on.
mdl = linear_model.Ridge(alpha=0.1)

A, b = features.values, df['real_stars'].values
mdl.fit(A, b)

error = np.abs(mdl.predict(A) - b)
print("error-mean: %f" % (error.mean(),))
print("error-std:  %f" % (error.std(),))

error-mean: 0.718405
error-std:  0.539900


In [28]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
# Degree-2 polynomial feature expansion followed by ridge regression.
model = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', Ridge(alpha=0.1)),
])

In [29]:
# Fit the pipeline on the feature matrix; the value returned by fit is kept as `mdl`.
A, b = features.values, df['real_stars'].values
mdl = model.fit(A, b)

In [31]:
# Absolute error of `mdl` on (A, b), i.e. on the data it was fitted on.
error = np.abs(mdl.predict(A) - b)
print("error-mean: %f" % (error.mean(),))
print("error-std:  %f" % (error.std(),))

error-mean: 0.698239
error-std:  0.530030


In [14]:
from ydc.tools.cache import cache_result
from ydc.tools.distances import haversine

def _radius(indices, df):
    """Half the haversine distance across the bounding box of the selected rows.

    The rows ``df.loc[indices]`` are reduced to their extreme latitudes and
    longitudes; the distance between two opposite corners of that box is
    treated as the diameter of a "circle" and halved.

    Parameters
    ----------
    indices : row labels accepted by ``df.loc``
    df : pandas.DataFrame
        Must provide the columns ``'latitude'`` and ``'longitude'``.
    """
    latitudes = df.loc[indices, 'latitude']
    longitudes = df.loc[indices, 'longitude']

    # Corner (min lon, max lat) to corner (max lon, min lat).
    # NOTE(review): assumes haversine takes (lon, lat, lon, lat) -- confirm
    # against ydc.tools.distances.
    diagonal = haversine(
        longitudes.min(), latitudes.max(),
        longitudes.max(), latitudes.min(),
    )
    return diagonal / 2
    
    

# @cache_result("pickles")
def nbhd_area(df, nbrs):
    """Build radius features for every entry of ``nbrs``.

    Each entry of ``nbrs`` is passed to ``_radius`` together with ``df``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'r'`` (the value from ``_radius``) and ``'r^2'`` (its
        square), indexed like the result of ``nbrs.apply``.
    """
    # Original author's note: tuple-valued entries make pandas build a MultiIndex.
    radius = nbrs.apply(lambda members: _radius(members, df))

    return pd.DataFrame(
        {'r': radius, 'r^2': radius ** 2},
        columns=['r', 'r^2'],
    )


In [22]:
# Radius features ('r', 'r^2') computed by nbhd_area, one row per entry of `neighbourhood`.
# TODO(review): give this variable a descriptive name once all of its users are known.
new_shit = nbhd_area(df, neighbourhood)

In [24]:
def try_model(mdl, idx, feat, df):
    """Fit ``mdl`` on ``feat[idx]`` against ``df['real_stars']`` and report the error.

    The original cell named this function ``try``, which is a reserved keyword
    and therefore a SyntaxError; it also left the body outside the function
    and read the notebook-level ``features`` instead of the ``feat`` argument.

    Parameters
    ----------
    mdl : estimator
        Object exposing ``fit(A, b)`` and ``predict(A)``.
    idx : selector
        Applied as ``feat[idx]``.
        NOTE(review): presumably a column selection; a boolean row mask would
        leave ``A`` and ``b`` with different lengths -- confirm with callers.
    feat : pandas.DataFrame
        Feature table.
    df : pandas.DataFrame
        Must provide the column ``'real_stars'``.

    Returns
    -------
    numpy.ndarray
        Absolute error per row; its mean and standard deviation are printed.
    """
    A = feat[idx].values
    b = df['real_stars'].values
    mdl.fit(A, b)

    error = abs(mdl.predict(A) - b)
    print("error-mean: %f" % error.mean())
    print("error-std:  %f" % error.std())
    return error

array([[  7.12736244e-01,   5.07992954e-01],
       [  3.96041207e+00,   1.56848638e+01],
       [  1.48141160e+00,   2.19458032e+00],
       ..., 
       [  8.31903615e-01,   6.92063625e-01],
       [  6.02266268e-02,   3.62724657e-03],
       [  5.92248621e-01,   3.50758429e-01]])