In [4]:
from ydc.tools.import_data import import_businesses, import_reviews
from ydc.features.get_features import get_features
import datetime as dt
from random import sample
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
import numpy as np

import pandas as pd

In [None]:
(features, df, box, combos, cells, n_ind, n_dist) = get_features(status=True)

In [None]:
def divide_good_and_bad(dataframe, combos, key_feature, percentage):
    """
    Take the given percentage of the dataframe and mark them as "good"
    Done for each category separately, sort by column "key_feature"
    
    """
    # Initialize dataframe sorted by stars and setup new column "good" with False as default
    dataframe = dataframe.sort(columns=key_feature, ascending=False)
    dataframe['good'] = False
    
    for combo in combos:
        # Every (super/sub) category combo
        total = (df['category']==combo).sum()
        
        # Find n (total number times percentage)
        num_good = int(round(total * percentage))
        dataframe['good']
        
        # Get index from best n
        idx = dataframe.loc[df['category']==combo, :].head(num_good).index
        
        # Set those best ones to true
        dataframe.loc[idx, 'good'] = True
    
    # Restore correct order (sorting scrambled it)
    return dataframe.sort_index()

def divide(dataframe, combos, key_feature, n_classes):
    """
    Divide into n classes
    Done for each category separately, sort by column "key_feature"
    0 is best, n_classes-1 worst
    """
    # Initialize dataframe sorted by stars and setup new column "good" with False as default
    dataframe = dataframe.sort(columns=key_feature, ascending=False)
    dataframe['good'] = n_classes-1
    
    for combo in combos:
        # Every (super/sub) category combo
        total = (df['category']==combo).sum()
        # Find number per class, they are all equally big
        n_each = int(round(total / n_classes))
        
        # Get index 
        idx = dataframe.loc[df['category']==combo, :].index.tolist()
        
        for n in range(n_classes-1):
            class_idx = idx[(n * n_each):((n + 1) * n_each)]
            dataframe.loc[class_idx, 'good'] = n
    
    # Restore correct order (sorting scrambled it)
    return dataframe.sort_index()

In [63]:
businesses = divide(df_res, combos, 'count', 4)

In [27]:
test = df_res.groupby("category")['city'].count()
combo = test.argmax()

In [11]:
def fitness_one(features, df_busi, combo, test_portion, test_rounds, c, gamma status=False):
    features = (features - features.mean()) / features.std()

    score = []
    idx = df_busi[df_busi['category'] == combo].index.tolist()

    for _ in range(test_rounds):
        # Take out 15% of data as random sample to test data with
        idx_sample = sample(idx, round(len(idx)*test_portion))
        idx_train = [index for index in idx if index not in idx_sample]

        subcat_clf = svm.SVC(cache_size=2000, C=c, gamma=gamma, kernel='rbf')

        feat_train = features.loc[idx_train, :].values
        quality_train = df_busi.loc[idx_train, 'good'].values  

        feat_sample = features.loc[idx_sample, :].values
        quality_sample = df_busi.loc[idx_sample, 'good'].values  

        subcat_clf.fit(feat_train, quality_train)
        score.append(subcat_clf.score(feat_sample, quality_sample))

    return np.mean(score)

In [14]:
fitness_one(features, businesses, combo, 0.1, 10, 0.1, 0.1)

TypeError: fitness_one() takes from 5 to 6 positional arguments but 7 were given

In [18]:
def fitness(features, df_busi, combos, test_portion, test_rounds, status=False):
    tot_results = []
    for combo in combos:
        try:
            weight = df_busi.loc[df_busi['category']==combo, 'city'].count()
            score = fitness_one(features, df_busi, combo, test_portion, test_rounds, status)
            tot_results.append((score, weight))
        except:
            pass
    
    tot_score = 0
    tot_weight = 0    
    for item in tot_results:
        if item[1]==0:
            continue  # ignore 
        tot_score += item[0] * item[1]
        tot_weight += item[1]
        
    return tot_score/tot_weight
        

In [19]:
fitness(features, businesses, combos, 0.4, 5)

0.58519942046426543

In [13]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
%matplotlib inline

C_range = np.logspace(-3, 3, 5)
gamma_range = np.logspace(-6, -2, 5)

y = target
X = data

param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, y)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

# plot the scores of the grid
# grid_scores_ contains parameter settings and scores
# We extract just the scores
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(gamma_range))

# Draw heatmap of the validation accuracy as a function of gamma and C
#
# The score are encoded as colors with the hot colormap which varies from dark
# red to bright yellow. As the most interesting scores are all located in the
# 0.92 to 0.97 range we use a custom normalizer to set the mid-point to 0.92 so
# as to make it easier to visualize the small variations of score values in the
# interesting range while not brutally collapsing all the low score values to
# the same color.

plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(C_range)), C_range)
plt.title('Validation accuracy')
plt.show()

In [112]:
# Machine
clf = svm.SVC(kernel='rbf', C=1, gamma=0.001)
scaler = StandardScaler()

# Data
businesses = divide(df_res, combos, 'count', 2)
idx = (businesses['category'] == combo)
data = features.loc[idx, :].values
target = businesses.loc[idx, 'good'].values

# Go
cv = StratifiedShuffleSplit(target, n_iter=5, test_size=0.2, random_state=42)
data = scaler.fit_transform(data)
scores = cross_validation.cross_val_score(clf, data, target, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.62 (+/- 0.01)


In [118]:
from sklearn.ensemble import RandomForestClassifier
businesses = divide(df_res, combos, 'count', 2)

# Random forest
clf = RandomForestClassifier(n_estimators=10, max_features=1)
scaler = StandardScaler()

# Data
idx = (businesses['category'] == combo)
data = features.loc[idx, :].values
target = businesses.loc[idx, 'good'].values

# Go
cv = StratifiedShuffleSplit(target, n_iter=5, test_size=0.2, random_state=42)
data = scaler.fit_transform(data)
scores = cross_validation.cross_val_score(clf, data, target, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.58 (+/- 0.01)


In [114]:
clf = RandomForestClassifier(n_estimators=10)
clf.fit(data, target)
feat_names = features.columns
feat_weights = clf.feature_importances_
feat_series = pd.Series(feat_weights, index=feat_names)
feat_series.sort(ascending=False)
feat_series[:10]

reviews_mean                    0.058204
reviews_sum                     0.057223
reviews_std                     0.055087
reviews_median                  0.042378
stars_std                       0.042105
stars_sum                       0.041601
weighted review-count           0.041269
stars_mean                      0.039915
reviews_max                     0.037571
neighbourhood_radius_squared    0.036353
dtype: float64

In [4]:
test = import_businesses()

KeyError: 'review_count_last_year'

In [3]:
import_businesses()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,real_stars,review_count_last_year
0,{'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{'Wednesday': {'open': '08:00', 'close': '17:0...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,9,3.5,AZ,business,3.600000,4
1,"{'Good For Groups': True, 'Accepts Credit Card...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350519,-79.886930,Clancy's Pub,[],True,4,3.5,PA,business,3.500000,3
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.356896,-80.015910,Cool Springs Golf Center,[],False,5,2.5,PA,business,2.600000,3
3,{},HZdLhv6COCleJMo7nPl-RA,"[Shopping, Home Services, Internet Service Pro...",Pittsburgh,"301 S Hills Vlg\nPittsburgh, PA 15241","{'Friday': {'open': '10:00', 'close': '21:00'}...",40.357620,-80.059980,Verizon Wireless,[],True,3,3.5,PA,business,3.666667,2
4,"{'Good For Groups': True, 'Wi-Fi': 'no', 'Has ...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Wednesday': {'open': '10:00', 'close': '19:0...",40.408735,-79.866351,Emil's Lounge,[],True,11,4.5,PA,business,4.700000,5
5,"{'Good For Groups': True, 'Wi-Fi': 'free', 'Ha...",KayYbHCt-RkbGcPdGOThNg,"[Bars, American (Traditional), Nightlife, Rest...",Carnegie,"141 Hawthorne St\nGreentree\nCarnegie, PA 15106",{},40.415517,-80.067534,Alexion's Bar & Grill,[Greentree],True,15,4.0,PA,business,3.916667,6
6,{},b12U9TFESStdy7CsTtcOeg,"[Auto Repair, Automotive]",Carnegie,"718 Hope Hollow Rd\nCarnegie, PA 15106",{},40.394588,-80.084454,Flynn's E W Tire Service Center,[],True,5,1.5,PA,business,1.600000,4
7,{'Good for Kids': True},Sktj1eHQFuVa-M4bgnEh8g,"[Active Life, Mini Golf]",Carnegie,"920 Forsythe Rd\nCarnegie\nCarnegie, PA 15106",{},40.405404,-80.076267,Forsythe Miniature Golf & Snacks,[Carnegie],True,4,4.0,PA,business,4.000000,2
8,{},3ZVKmuK2l7uXPE6lXY4Dbg,"[Home Services, Contractors]",Carnegie,"8 Logan St\nCarnegie\nCarnegie, PA 15106",{},40.406324,-80.090357,Quaker State Construction,[Carnegie],True,3,2.5,PA,business,2.333333,3
9,"{'Good For Groups': True, 'Delivery': False, '...",wJr6kSA5dchdgOdwH6dZ2w,"[Burgers, Breakfast & Brunch, American (Tradit...",Carnegie,"2100 Washington Pike\nCarnegie, PA 15106","{'Friday': {'open': '08:00', 'close': '02:00'}...",40.387732,-80.092874,Kings Family Restaurant,[],True,8,3.5,PA,business,3.571429,2
