In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
from ydc.tools import import_data, review_analysis
from ydc.tools.supercats import add_supercats
from ydc.tools.cache import cache_result
from ydc.features import get_features
from operator import itemgetter
from haversine import haversine
from random import sample
%matplotlib inline

In [2]:
# reviews = import_data.import_reviews(fields=['business_id', 'stars', 'date', 'real_date'])

In [3]:
# businesses = import_data.import_businesses()

In [2]:
(features, df, box, combos, cells, n_ind, n_dist) = get_features.get_features(status=True, new_cache=True)

Successfully imported reviews with columns ['business_id' 'date' 'stars' 'real_date']


In [6]:
def divide_good_and_bad(dataframe, combos, key_feature, percentage):
    """
    Take the given percentage of the dataframe and mark them as "good"
    Done for each category separately, sort by column "key_feature"
    
    """
    # Initialize dataframe sorted by stars and setup new column "good" with False as default
    dataframe = dataframe.sort(columns=key_feature, ascending=False)
    dataframe['good'] = False
    
    for combo in combos:
        # Every (super/sub) category combo
        total = (df['category']==combo).sum()
        
        # Find n (total number times percentage)
        num_good = int(round(total * percentage))
        dataframe['good']
        
        # Get index from best n
        idx = dataframe.loc[df['category']==combo, :].head(num_good).index
        
        # Set those best ones to true
        dataframe.loc[idx, 'good'] = True
    
    # Restore correct order (sorting scrambled it)
    return dataframe.sort_index()

In [7]:
businesses = divide_good_and_bad(df, combos, 'stars', 0.3)

In [9]:
check = businesses.groupby('category')['good']
check.sum()/check.count()

category
(-1, -1)    0.299213
(0, -1)     0.301887
(0, 0)      0.300281
(0, 1)      0.299786
(0, 2)      0.297297
(0, 3)      0.299507
(1, -1)     0.304348
(1, 0)      0.299762
(1, 1)      0.301887
(1, 2)      0.300280
(1, 3)      0.299539
(1, 4)      0.299614
(1, 5)      0.296296
(2, -1)     0.299296
(2, 0)      0.299966
(2, 1)      0.300023
(2, 2)      0.299754
(2, 3)      0.300056
(3, -1)     0.298851
(3, 0)      0.300112
(3, 1)      0.300195
(3, 2)      0.299349
(3, 3)      0.300191
(3, 4)      0.300437
(3, 5)      0.300341
(4, -1)     0.312500
(4, 0)      0.299867
(4, 1)      0.299938
(4, 2)      0.297101
(4, 3)      0.301370
(5, -1)     0.285714
(5, 0)      0.299127
(5, 1)      0.300314
(5, 2)      0.296296
(5, 3)      0.300000
(5, 4)      0.300000
(6, -1)     0.300000
(6, 0)      0.299715
(6, 1)      0.300668
(6, 2)      0.298578
(6, 3)      0.299712
(6, 4)      0.300000
(7, -1)     0.285714
(7, 0)      0.300116
(7, 1)      0.500000
(7, 2)      0.301370
(7, 3)      0.299862
(7, 

In [None]:
# Initialize support vector machine
clf = svm.SVC()
# Fit using "good" as determined before
clf.fit(features.values, businesses['good'].values)

In [None]:
clf.score(features.values, businesses['good'].values)

In [9]:
# choose only (1,3) fast food
subcat_filter = df['category'] == (1,3)
subcat_clf = svm.SVC()
subcat_clf.fit(features[subcat_filter].values, businesses[subcat_filter]['good'].values)
score1 = subcat_clf.score(features[subcat_filter].values, businesses[subcat_filter]['good'].values)
print('for training data: {}'.format(score1))
score2 = subcat_clf.score(features.values, businesses['good'].values)
print('for whole data-set: {}'.format(score2))

for training data: 0.967741935483871
for whole data-set: 0.699323850896861


In [44]:
res = fitness(features, businesses, combos, 0.4, 5)


Percentage:
Automotive/Car Wash: 0.699656.4
Home Services/Professional Services: 0.675862.4
Pets/Pet Stores: 0.702857.4
Shopping/Fashion: 0.691822.4
Food/Desserts: 0.683748.4
Shopping/Sporting Goods: 0.688043.4
Restaurants/Fast Food: 0.663328.4
Automotive/Uncategorized: 0.700000.4
Event Planning & Services/Transportation: 0.692857.4
Health & Medical/Doctors: 0.670727.4
Active Life/Fitness & Instruction: 0.692118.4
Home Services/Uncategorized: 0.666667.4
Beauty & Spas/Uncategorized: 0.600000.4
Automotive/Motorcycle Repair: 0.717241.4
Beauty & Spas/Skin Care: 0.704444.4
Home Services/Local Services: 0.677591.4
Pets/Pet Services: 0.690761.4
Shopping/Books, Mags, Music & Video: 0.699522.4
Food/Uncategorized: 0.696000.4
Food/Grocery: 0.694790.4
Event Planning & Services/Arts & Entertainment: 0.679327.4
Home Services/Mass Media: 0.700000.4
Health & Medical/Chiropractors: 0.718033.4
Restaurants/Breakfast & Brunch: 0.677419.4
Health & Medical/Optometrists: 0.718750.4
Beauty & Spas/Hair Salons:

In [13]:
#print("Accuracy per category:")
results = {}
for combo, name in combos.items():
    try:
        idx = df[df['category'] == combo].index.tolist()
        
        # Take out 15% of data as random sample to test data with
        idx_sample = sample(idx, round(len(idx)*0.4))
        idx_train = [index for index in idx if index not in idx_sample]
        
        subcat_clf = svm.SVC()

        # Multiply stars by 2 to get full numbers which scikit will use as class identifier (it doesnt like float for that)
        feat_train = features.loc[idx_train, :].values
        quality_train = businesses.loc[idx_train, 'stars'].values * 2  
        
        feat_sample = features.loc[idx_sample, :].values
        quality_sample = businesses.loc[idx_sample, 'stars'].values * 2  
        
        
        subcat_clf.fit(feat_train, quality_train)
        score1 = subcat_clf.score(feat_sample, quality_sample)
        #print('{}: {:.2f}%'.format(name, score1 * 100))
        
        # Compare to just a constant guess
        c = round(df.loc[df['category']==combo, 'stars'].mean() * 2)
        res = df.ix[idx].loc[(df['stars']*2 == c), 'stars'].count()
        score2 =  res / df.ix[idx].loc[:, 'stars'].count()
        
        results[combo] = [score1, score2]
        
    except Exception as e:
        print(e)
        results[combo] = [np.nan, np.nan]    
        
resframe = pd.DataFrame(results)
resframe.transpose()

The number of classes has to be greater than one; got 1


Unnamed: 0,Unnamed: 1,0,1
-1,-1,0.162562,0.194882
0,-1,0.190476,0.188679
0,0,0.266355,0.260056
0,1,0.272727,0.134904
0,2,0.322034,0.378378
0,3,0.352217,0.22069
1,-1,0.055556,0.173913
1,0,0.355159,0.124504
1,1,0.132075,0.196226
1,2,0.27591,0.12549


In [28]:
idx_tot = df[df['category'] == combo].index

In [32]:
sample_size = round(len(idx_tot) * 0.15)

In [33]:
from random import sample

In [37]:
idx = df[df['category'] == (2,1)].index.tolist()
        
# Take out 15% of data as random sample to test data with
idx_sample = sample(idx, round(len(idx)*0.15))
idx_train = [index for index in idx if index not in idx_sample]
        

In [46]:
quality_train = businesses.loc[idx_train, 'stars'].values * 2  
        

In [None]:
# "Genetic" feature search
# Step 1: Get Population
# Step 2: Calculate fitness
# Then select fittest and start again

In [7]:
# Step 1: Polulation function
from random import random

def _switch(field, chance):
    """Changes bool with chance as specified"""
    if random() < chance:
        return not(field)
    return field

def mutate(old, chance, n_output):
    output = []
    for _ in range(n_output):
        output.append([_switch(item, chance) for item in old])
    return output

In [4]:
# Step 2, Fitness function
def fitness(features, df_busi, combos, test_portion, test_rounds, status=False):
    if status:
        print("Percentage:")
    all_scores = []
    for combo, name in combos.items():
        try:
            score = []
            idx = df_busi[df_busi['category'] == combo].index.tolist()

            for _ in range(test_rounds):
                # Take out 15% of data as random sample to test data with
                idx_sample = sample(idx, round(len(idx)*test_portion))
                idx_train = [index for index in idx if index not in idx_sample]

                subcat_clf = svm.SVC(cache_size=2000, kernel="rbf")
                
                # Multiply stars by 2 to get full numbers which scikit will use as class identifier (it doesnt like float for that)
                feat_train = features.loc[idx_train, :].values
                quality_train = df_busi.loc[idx_train, 'good'].values  

                feat_sample = features.loc[idx_sample, :].values
                quality_sample = df_busi.loc[idx_sample, 'good'].values  

                subcat_clf.fit(feat_train, quality_train)
                score.append(subcat_clf.score(feat_sample, quality_sample))

            mean_score = np.mean(score)
            all_scores.append((mean_score, len(idx)))  # Tuple score - weight  
            
            if status:
                print("%s: %f.4" % (name, mean_score))

        except Exception as e:
            pass  # Ignore this for now

    tot_weight = 0
    tot_score = 0
    for score, weight in all_scores:
        tot_score += score*weight
        tot_weight += weight
    
    if tot_weight==0:
        return 0
    return (tot_score/tot_weight)

In [29]:
from operator import itemgetter
# Set up: Get starting point
num_feat = len(features.columns)

# Chance: 1/num_feats so in exceptation always one feature mutates
chance = 1/num_feat

counter = 0
generations = {}
generations[counter] = [([False] * num_feat, 0)]

all_features = features.copy(deep=True)

In [38]:
import datetime as dt
n_new = 5  # Try with 5 new candidates per old one
# Run on timer
end = dt.datetime.now() + dt.timedelta(hours=6)
while True:
    candidates = generations[counter]  # They will also be in the list so that only children that are better survive
    for item in generations[counter]:
        new_features = mutate(item[0], chance, n_new)
        for feats in new_features:
            if not(any(feats)):
                continue # Go on only if at least one feature is selected
            score = fitness(all_features.loc[:, feats], businesses, combos, 0.1, 5)
            candidates.append((feats, score))

    # Now get 5 best candidates
    new_generation = sorted(candidates, key=itemgetter(1))[:5]
    counter += 1
    generations[counter] = new_generation
    
    if dt.datetime.now() > end:
        break

KeyboardInterrupt: 

In [37]:
for item in generations[counter]:
    print(item[1])

0.69276282582
0.693717069187
0.695333020002
0.697621079839
0.697656117001


In [39]:
import pickle
with open("gen.pkl", 'wb') as f:
    pickle.dump(generations, f)

In [10]:
res = {}
for column in features.columns:
    print("Now column '%s'" % column, end="\r")
    score = fitness(features.loc[:, [column]], businesses, combos, 0.1, 5)
    res[column] = score



In [13]:
res

{'(-1, -1)': 0.69896816585363064,
 '(0, -1)': 0.70061766344530674,
 '(0, 0)': 0.69884234955590607,
 '(0, 1)': 0.69396624480286295,
 '(0, 2)': 0.70139202222521002,
 '(0, 3)': 0.6986176964893227,
 '(1, -1)': 0.69988187652571254,
 '(1, 0)': 0.69955088356155537,
 '(1, 1)': 0.69635642777227513,
 '(1, 2)': 0.6979885166983707,
 '(1, 3)': 0.69730872517279974,
 '(1, 4)': 0.70002316945425369,
 '(1, 5)': 0.69893794223337025,
 '(2, -1)': 0.70301305814963511,
 '(2, 0)': 0.6963006544055419,
 '(2, 1)': 0.70105809096451321,
 '(2, 2)': 0.70150148201420093,
 '(2, 3)': 0.70265738451972515,
 '(2, 4)': 0.70013225960335657,
 '(3, -1)': 0.69428955352434807,
 '(3, 0)': 0.69871253780968423,
 '(3, 1)': 0.70059638358750831,
 '(3, 2)': 0.70177307898149399,
 '(3, 3)': 0.69744668323199643,
 '(3, 4)': 0.6962908398095452,
 '(4, -1)': 0.69745470084105732,
 '(4, 0)': 0.70104379913438408,
 '(4, 1)': 0.69694089480025223,
 '(4, 2)': 0.69807038960729229,
 '(4, 3)': 0.69992626122993773,
 '(5, -1)': 0.70031981803922039,
 '(5

In [8]:
test = businesses.groupby("category")['city'].count()
combo = test.argmax()

In [9]:
def fitness_one(features, df_busi, combo, test_portion, test_rounds, status=False):
    features = (features - features.mean()) / features.std()

    score = []
    idx = df_busi[df_busi['category'] == combo].index.tolist()

    for _ in range(test_rounds):
        # Take out 15% of data as random sample to test data with
        idx_sample = sample(idx, round(len(idx)*test_portion))
        idx_train = [index for index in idx if index not in idx_sample]

        subcat_clf = svm.SVC(cache_size=2000, kernel="rbf")

        # Multiply stars by 2 to get full numbers which scikit will use as class identifier (it doesnt like float for that)
        feat_train = features.loc[idx_train, :].values
        quality_train = df_busi.loc[idx_train, 'good'].values  

        feat_sample = features.loc[idx_sample, :].values
        quality_sample = df_busi.loc[idx_sample, 'good'].values  

        subcat_clf.fit(feat_train, quality_train)
        score.append(subcat_clf.score(feat_sample, quality_sample))

    return np.mean(score)

In [10]:
fitness_one(features, businesses, combo, 0.4, 5)

0.70329985652797711