In [6]:
from ydc.tools.import_data import import_businesses, import_reviews
from ydc.features.get_features import get_features
import datetime as dt
from random import sample
from sklearn import svm
import numpy as np

In [7]:
# Load the business data through the project's import helper.
# NOTE(review): new_cache=True presumably forces the cache to be rebuilt rather
# than reused — confirm in ydc.tools.import_data.
# NOTE(review): df_busi is not referenced again by the top-level cells visible
# here (the fitness functions only reuse the name for a parameter).
df_busi = import_businesses(new_cache=True)

In [8]:
# Load the reviews through the project's import helper. The column list printed
# below includes 'business_id' and 'real_date', which the following cells use.
df_rev = import_reviews(new_cache=True)

Successfully imported reviews with columns ['business_id' 'date' 'review_id' 'stars' 'text' 'type' 'user_id' 'votes'
 'real_date']


In [9]:
# Most recent 'real_date' per business_id.
# NOTE(review): latest_date is not referenced again in the cells visible here.
latest_date = df_rev.groupby('business_id')['real_date'].max()

In [10]:
def one_year(date_series, days=360):
    """
    Count the entries of a date series that fall inside a trailing window.

    The window ends at the latest date in the series and reaches `days` days
    back; only dates strictly later than that cut-off are counted.

    date_series -- pandas Series of dates / timestamps
    days        -- window length in days. Defaults to 360, the value that used
                   to be hard-coded (note: not 365, despite the function name).

    Returns the number of entries inside the window.
    """
    cutoff = date_series.max() - dt.timedelta(days=days)
    return (date_series > cutoff).sum()

In [11]:
# Number of reviews each business received in the window before its own most
# recent review (see one_year for the window length).
# groupby(...).apply is used instead of .agg({'count': one_year}) because:
#   * the dict-renaming form of agg has been removed from pandas, and
#   * agg handed the integer back as a timestamp (per the original note here),
#     which was then decoded with Timestamp.nanosecond. That field only holds
#     0-999, so any count of 1000 or more was silently truncated.
result = df_rev.groupby('business_id')['real_date'].apply(one_year)
result.name = 'count'  # the join further down uses this as the column name

# Kept so that anything still referring to `grouped` finds a 'count' column.
grouped = result.to_frame()

In [12]:
# Build the feature data with the project helper; it returns a 7-tuple.
# The cells visible here use `features` (classifier input), `df` (joined on
# 'business_id' and grouped by 'category') and `combos` (iterated as category
# values).
# NOTE(review): the meaning of box, cells, n_ind and n_dist is not evident from
# this notebook — see ydc.features.get_features.
(features, df, box, combos, cells, n_ind, n_dist) = get_features(status=True, new_cache=True)

Successfully imported reviews with columns ['business_id' 'date' 'stars' 'real_date']


In [13]:
# Attach the per-business review count: `result` is indexed by business_id and
# is matched against df's 'business_id' column. DataFrame.join is a left join
# by default, so businesses without an entry in `result` get NaN.
df_res = df.join(result, on='business_id')

In [14]:
def divide_good_and_bad(dataframe, combos, key_feature, percentage):
    """
    Mark the top share of the rows in every category as "good".

    For each category in `combos`, the rows of that category are ranked by
    `key_feature` (highest first) and the best `percentage` of them get
    good=True; every other row gets good=False.

    dataframe   -- DataFrame with a 'category' column and a `key_feature`
                   column. It is not modified; a new frame is returned.
    combos      -- iterable of category values to process
    key_feature -- name of the column to rank by
    percentage  -- fraction (0..1) of each category to mark as good

    Returns a copy of `dataframe`, sorted by index, with a boolean 'good'
    column added.
    """
    # Work on a sorted copy (best key_feature first); every row starts as not
    # good. sort_values replaces DataFrame.sort(columns=...), which has been
    # removed from pandas.
    ranked = dataframe.sort_values(by=key_feature, ascending=False)
    ranked['good'] = False

    for combo in combos:
        # Rows of this (super/sub) category combo — taken from the frame that
        # was passed in, not from a notebook-level global.
        in_combo = ranked['category'] == combo

        # n = category size times percentage, rounded to a whole number
        num_good = int(round(in_combo.sum() * percentage))

        # The frame is sorted, so the first n rows of the category are its best n
        best_idx = ranked.loc[in_combo].head(num_good).index

        # Set those best ones to true
        ranked.loc[best_idx, 'good'] = True

    # Return in index order (sorting scrambled it)
    return ranked.sort_index()

In [15]:
# Label the top half (0.5) of every category, ranked by the 'count' column, as good.
businesses = divide_good_and_bad(df_res, combos, 'count', 0.5)
# Rows per category, counted through the non-null 'city' entries.
test = df_res.groupby("category")['city'].count()

In [16]:
# Pick the category with the most rows. idxmax returns the index *label* (the
# category value); Series.argmax returns an integer position in current pandas,
# which is not a valid value of the 'category' column.
combo = test.idxmax()

In [17]:
def fitness_one(features, df_busi, combo, test_portion, test_rounds, status=False):
    """
    Estimate how well an RBF SVM separates good from bad rows of one category.

    The features are standardised, then for `test_rounds` rounds a random
    `test_portion` of the category is held out, an SVM is fitted on the
    remainder and scored on the held-out part.

    features     -- DataFrame of numeric features, indexed like df_busi
    df_busi      -- DataFrame with 'category' and 'good' columns
    combo        -- category value to evaluate
    test_portion -- fraction (0..1) of the category held out per round
    test_rounds  -- number of random train/test splits
    status       -- accepted for interface compatibility; currently unused

    Returns the mean of the per-round classifier scores.
    """
    # Standardise every feature column. A constant column has std 0, which
    # would turn it into NaN; dividing by 1 instead leaves it at 0.
    spread = features.std().replace(0, 1)
    features = (features - features.mean()) / spread

    score = []
    idx = df_busi[df_busi['category'] == combo].index.tolist()
    # int() keeps the sample size an integer even where round() returns a float
    num_test = int(round(len(idx) * test_portion))

    for _ in range(test_rounds):
        # Hold out a random test_portion of the category as test data
        idx_sample = sample(idx, num_test)
        # Set membership keeps building the training split linear in len(idx)
        held_out = set(idx_sample)
        idx_train = [index for index in idx if index not in held_out]

        subcat_clf = svm.SVC(cache_size=2000, C=0.1, kernel='rbf')

        feat_train = features.loc[idx_train, :].values
        quality_train = df_busi.loc[idx_train, 'good'].values

        feat_sample = features.loc[idx_sample, :].values
        quality_sample = df_busi.loc[idx_sample, 'good'].values

        subcat_clf.fit(feat_train, quality_train)
        score.append(subcat_clf.score(feat_sample, quality_sample))

    return np.mean(score)

In [18]:
# Score the category selected above, holding out 40% of it in each of 10 random
# splits. No random seed is set in the cells visible here, so the value shown
# below will vary between runs.
fitness_one(features, businesses, combo, 0.4, 10)

0.61155732679337826

In [18]:
def fitness(features, df_busi, combos, test_portion, test_rounds, status=False):
    """
    Weighted mean of the per-category fitness_one scores.

    Every category in `combos` is scored with fitness_one and weighted by its
    number of non-null 'city' entries. Categories with weight 0, and
    categories whose scoring raises an exception, are left out.

    features     -- feature DataFrame, passed through to fitness_one
    df_busi      -- DataFrame with 'category', 'city' and 'good' columns
    combos       -- iterable of category values
    test_portion -- fraction held out per split, passed through to fitness_one
    test_rounds  -- number of splits per category, passed through to fitness_one
    status       -- if true, print a line for every category that is skipped
                    because scoring failed; also passed through to fitness_one

    Returns the weighted mean score, or NaN when no category could be scored.
    """
    tot_score = 0
    tot_weight = 0
    for combo in combos:
        try:
            weight = df_busi.loc[df_busi['category'] == combo, 'city'].count()
            if weight == 0:
                continue  # nothing to weigh in, so don't bother scoring it
            score = fitness_one(features, df_busi, combo, test_portion, test_rounds, status)
        except Exception as err:
            # Best effort: a category that cannot be scored is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            if status:
                print('Skipping category %r: %s' % (combo, err))
            continue
        tot_score += score * weight
        tot_weight += weight

    if tot_weight == 0:
        # No category contributed; avoid dividing by zero
        return float('nan')
    return tot_score / tot_weight
        

In [19]:
# Weighted fitness over all categories: 40% held out per split, 5 splits per category.
fitness(features, businesses, combos, 0.4, 5)

0.58519942046426543