In [1]:
import graphlab

In [2]:
# Load the two event-level datasets from CSV into SFrames. Each one is later
# passed through get_grouped_sf, which reads the 'session_id', 'word' and
# 'timestamp' columns (plus 'app' when present).
apps_data_sf = graphlab.SFrame.read_csv("apps.csv", verbose=False)
apps_data_sf2 = graphlab.SFrame.read_csv("apps2.csv", verbose=False)

This non-commercial license of GraphLab Create for academic use is assigned to jenarvaezg@gmail.com and will expire on December 12, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1504475917.log


In [3]:
# Point GraphLab Canvas at the notebook ('ipynb' target) before asking the
# first dataset to display itself.
graphlab.canvas.set_target('ipynb')

apps_data_sf.show()

In [4]:
def aggregate_by_session_id(sf):
    """Collapse an event-level SFrame into one row per session.

    Parameters
    ----------
    sf : graphlab.SFrame
        Must contain 'session_id', 'word' and 'timestamp' columns. An 'app'
        column is optional; when present it is carried over to the output.

    Returns
    -------
    graphlab.SFrame
        One row per distinct session_id with the columns, in this order:
        'session_id', 'time_diff_mean', 'time_diff_var', 'words' and, if the
        input had it, 'app' (taken from the first row of the session).
        An input with no rows yields an empty SFrame with no columns.

    Notes
    -----
    A session with a single event has no consecutive pairs, so its
    'time_diff_mean' and 'time_diff_var' are NaN (numpy's result for an empty
    list).
    """
    import numpy as np

    has_app = "app" in sf.column_names()

    # Collect plain Python lists and build the SFrame once at the end instead
    # of appending a one-row SFrame per session.
    session_ids = []
    time_diff_means = []
    time_diff_vars = []
    words_per_session = []
    apps = []

    for session_id in sf['session_id'].unique():
        sf_group = sf[sf['session_id'] == session_id]

        # Read each column once rather than fetching whole rows by index.
        group_words = list(sf_group['word'])
        timestamps = list(sf_group['timestamp'])
        n_events = len(timestamps)

        # Words are kept in reverse row order, and each difference is
        # "row i minus row i-1", walking from the last row to the second.
        words = group_words[::-1]
        time_diffs = [timestamps[i] - timestamps[i - 1]
                      for i in range(n_events - 1, 0, -1)]

        session_ids.append(session_id)
        time_diff_means.append(np.mean(time_diffs))
        time_diff_vars.append(np.var(time_diffs))
        words_per_session.append(words)

        if has_app:
            apps.append(sf_group['app'][0])

    out_sf = graphlab.SFrame()
    if not session_ids:
        # No sessions: return the column-less empty SFrame.
        return out_sf

    out_sf['session_id'] = session_ids
    out_sf['time_diff_mean'] = time_diff_means
    out_sf['time_diff_var'] = time_diff_vars
    out_sf['words'] = words_per_session
    if has_app:
        out_sf['app'] = apps

    return out_sf

def get_normalized_tf_idf(sf):
    """Divide each TF-IDF weight of a session row by the session's word count.

    Parameters
    ----------
    sf : dict-like
        A single row (despite the name) exposing 'words', a sized sequence,
        and 'tf_idf', a mapping of word -> weight.

    Returns
    -------
    dict
        word -> weight / len(row['words']).
    """
    # float() forces true division even if a weight is an integer under
    # Python 2.
    n = float(len(sf['words']))

    # .items() rather than .iteritems() so the helper runs on Python 2 and 3.
    return {key: value / n for key, value in sf['tf_idf'].items()}

def get_swipe_percentage(bow):
    """Return the fraction of word occurrences whose token contains "->".

    Parameters
    ----------
    bow : dict
        Bag of words mapping token -> occurrence count.

    Returns
    -------
    float
        Occurrences of tokens containing "->" divided by all occurrences,
        or 0.0 when the bag holds no occurrences at all.
    """
    total = 0
    swipes = 0
    # .items() rather than .iteritems() so the helper runs on Python 2 and 3.
    for word, times in bow.items():
        total += times
        if "->" in word:
            swipes += times

    # An empty bag would otherwise divide by zero.
    if total == 0:
        return 0.0
    return swipes / float(total)
        
    
def get_grouped_sf(sf):
    """Build the per-session feature table from an event-level SFrame.

    The events are first aggregated per session, then these columns are added
    in order: 'bow' (word counts of 'words'), 'tf_idf' (computed from 'bow'),
    'normalized_tf_idf' (per-row get_normalized_tf_idf) and
    'swipe_percentage' (get_swipe_percentage applied to 'bow').
    """
    sessions = aggregate_by_session_id(sf)
    text = graphlab.text_analytics

    bag_of_words = text.count_words(sessions['words'])
    sessions['bow'] = bag_of_words

    sessions['tf_idf'] = text.tf_idf(sessions['bow'])

    # Row-wise apply: the helper needs both 'words' and 'tf_idf' of each row.
    sessions['normalized_tf_idf'] = sessions.apply(get_normalized_tf_idf)

    # Column-wise apply: the helper only needs the bag of words.
    sessions['swipe_percentage'] = sessions['bow'].apply(get_swipe_percentage)

    return sessions


In [5]:
from itertools import product, compress
def combinations(items):
    """Lazily yield every subset of ``items`` as a set, the empty set first.

    Subsets follow the order of the binary masks produced by
    ``product([0, 1], repeat=len(items))``, so the last item toggles fastest.
    """
    masks = product([0, 1], repeat=len(items))
    return (
        set(item for item, keep in zip(items, mask) if keep)
        for mask in masks
    )

In [6]:
# Turn each event-level dataset into its per-session feature table.
grouped_words_sf = get_grouped_sf(apps_data_sf)
grouped_words_sf2 = get_grouped_sf(apps_data_sf2)

# Split each table with fraction 0.8 and a fixed seed so the split is repeatable.
train, test = grouped_words_sf.random_split(0.8, seed=1)
train2, test2 = grouped_words_sf2.random_split(0.8, seed=1)

# With classifier approach

In [19]:
def get_best_features_via_loocv(sf, features=None, target='app'):
    """Pick the feature subset with the best leave-one-out accuracy.

    For every non-empty subset of ``features`` a logistic classifier is
    trained once per row of ``sf`` (on all other rows) and scored on the
    held-out row, i.e. (2**len(features) - 1) * len(sf) model fits in total.
    Progress is printed for every subset.

    Parameters
    ----------
    sf : graphlab.SFrame
        Per-session table containing the feature columns and ``target``.
    features : list of str, optional
        Candidate feature columns. Defaults to the five session features
        'normalized_tf_idf', 'tf_idf', 'time_diff_mean', 'time_diff_var'
        and 'swipe_percentage'.
    target : str, optional
        Name of the label column (default 'app').

    Returns
    -------
    list of str
        The first subset reaching the highest accuracy (rounded to two
        decimals); an empty list if no subset scores above 0.

    Raises
    ------
    ValueError
        If ``sf`` has fewer than two rows, since leave-one-out needs at
        least one row to train on and one to hold out.
    """
    if features is None:
        features = ['normalized_tf_idf', 'tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']

    total = len(sf)
    if total < 2:
        # Fail early with a clear message instead of dividing by zero below.
        raise ValueError("leave-one-out cross validation needs at least 2 rows, got %d" % total)

    best_accuracy = 0
    best_feature_list = []

    for features_set in combinations(features):
        features_list = list(features_set)
        if not features_list:
            # combinations() also yields the empty subset; nothing to train on.
            continue
        n_right = 0

        # LOO cross validation
        for i in range(total):
            held_out = sf[i:i + 1]
            training_rows = sf[:i].append(sf[i + 1:])
            m = graphlab.logistic_classifier.create(training_rows, target=target,
                                                    verbose=False, features=features_list,
                                                    validation_set=None)
            if held_out[0][target] == m.predict(held_out)[0]:
                n_right += 1

        accuracy = round(n_right / float(total) * 100, 2)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Features used: %s accuracy: %s %% ( %s out of %s )"
              % (features_list, accuracy, n_right, total))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature_list = features_list
            print("Best feature set so far!")

    print("All done, best feature list is %s" % (best_feature_list,))
    return best_feature_list

    

In [13]:
# Select the feature subset for each dataset by leave-one-out CV on its training split only.
features1 = get_best_features_via_loocv(train)
features2 = get_best_features_via_loocv(train2)

# Models fitted on the training splits; these are the ones scored against test/test2.
model = graphlab.logistic_classifier.create(train, target="app", verbose=False, features=features1, validation_set=None)
model2 = graphlab.logistic_classifier.create(train2, target="app", verbose=False, features=features2, validation_set=None)

# Models fitted on every session of each dataset, reusing the feature subsets chosen above.
full_model = graphlab.logistic_classifier.create(grouped_words_sf, target="app", verbose=False, features=features1, validation_set=None)
full_model2 = graphlab.logistic_classifier.create(grouped_words_sf2, target="app", verbose=False, features=features2, validation_set=None)

Features used: ['swipe_percentage'] accuracy: 72.53 % ( 66 out of 91 )
Best feature set so far!
Features used: ['time_diff_var'] accuracy: 53.85 % ( 49 out of 91 )
Features used: ['time_diff_var', 'swipe_percentage'] accuracy: 76.92 % ( 70 out of 91 )
Best feature set so far!
Features used: ['time_diff_mean'] accuracy: 70.33 % ( 64 out of 91 )
Features used: ['time_diff_mean', 'swipe_percentage'] accuracy: 80.22 % ( 73 out of 91 )
Best feature set so far!
Features used: ['time_diff_var', 'time_diff_mean'] accuracy: 70.33 % ( 64 out of 91 )
Features used: ['time_diff_var', 'time_diff_mean', 'swipe_percentage'] accuracy: 76.92 % ( 70 out of 91 )
Features used: ['tf_idf'] accuracy: 95.6 % ( 87 out of 91 )
Best feature set so far!
Features used: ['swipe_percentage', 'tf_idf'] accuracy: 94.51 % ( 86 out of 91 )
Features used: ['time_diff_var', 'tf_idf'] accuracy: 95.6 % ( 87 out of 91 )
Features used: ['time_diff_var', 'swipe_percentage', 'tf_idf'] accuracy: 95.6 % ( 87 out of 91 )
Features

In [14]:
# Persist the two full-dataset models.
# NOTE(review): the first path ends in a bare underscore ("app_model_") while
# the second is "app_model_2" - confirm the first name is intended.
full_model.save("app_model_")
full_model2.save("app_model_2")

In [15]:
# Score the dataset-1 training-split model on dataset 1's held-out split.
model.evaluate(test)

{'accuracy': 0.7857142857142857,
 'auc': 0.9916666666666666,
 'confusion_matrix': Columns:
 	target_label	str
 	predicted_label	str
 	count	int
 
 Rows: 5
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |   facebook   |      tinder     |   2   |
 |    tinder    |      tinder     |   4   |
 |   whatsapp   |     whatsapp    |   3   |
 |   facebook   |     facebook    |   4   |
 |   facebook   |     whatsapp    |   1   |
 +--------------+-----------------+-------+
 [5 rows x 3 columns],
 'f1_score': 0.7948051948051947,
 'log_loss': 0.5808187084628839,
 'precision': 0.8055555555555555,
 'recall': 0.8571428571428571,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 	class	int
 
 Rows: 300003
 
 Data:
 +-----------+----------------+-----+---+---+-------+
 | threshold |      fpr       | tpr | p | n | class |
 +-----------+----------------+-----+---+---+-------+
 |    

In [16]:
# Score the dataset-2 training-split model on dataset 2's held-out split.
model2.evaluate(test2)

{'accuracy': 0.9230769230769231,
 'auc': 0.9592592592592593,
 'confusion_matrix': Columns:
 	target_label	str
 	predicted_label	str
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |   facebook   |     facebook    |   6   |
 |   whatsapp   |     whatsapp    |   3   |
 |   whatsapp   |      tinder     |   1   |
 |    tinder    |      tinder     |   3   |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9047619047619048,
 'log_loss': 0.4230283949355617,
 'precision': 0.9166666666666666,
 'recall': 0.9166666666666666,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 	class	int
 
 Rows: 300003
 
 Data:
 +-----------+----------------+-----+---+---+-------+
 | threshold |      fpr       | tpr | p | n | class |
 +-----------+----------------+-----+---+---+-------+
 |    0.0    |      1.0       | 1.0 | 6 | 7 |   0 

In [17]:
# Cross-dataset check: the model fitted on all of dataset 1, scored on all of dataset 2.
full_model.evaluate(grouped_words_sf2)

{'accuracy': 0.7582417582417582,
 'auc': 0.887109622622701,
 'confusion_matrix': Columns:
 	target_label	str
 	predicted_label	str
 	count	int
 
 Rows: 8
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |   whatsapp   |     facebook    |   3   |
 |   facebook   |      tinder     |   3   |
 |    tinder    |     facebook    |   9   |
 |   facebook   |     facebook    |   30  |
 |    tinder    |     whatsapp    |   4   |
 |   facebook   |     whatsapp    |   3   |
 |    tinder    |      tinder     |   22  |
 |   whatsapp   |     whatsapp    |   17  |
 +--------------+-----------------+-------+
 [8 rows x 3 columns],
 'f1_score': 0.7584304584304583,
 'log_loss': 1.97203328690083,
 'precision': 0.7675396825396826,
 'recall': 0.7706349206349207,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 	class	int
 
 Rows: 300003
 
 Data:
 +-----------+----------------+--------

In [18]:
# Cross-dataset check: the model fitted on all of dataset 2, scored on all of dataset 1.
full_model2.evaluate(grouped_words_sf)

{'accuracy': 0.780952380952381,
 'auc': 0.9442857142857143,
 'confusion_matrix': Columns:
 	target_label	str
 	predicted_label	str
 	count	int
 
 Rows: 8
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |    tinder    |      tinder     |   34  |
 |   whatsapp   |      tinder     |   2   |
 |   facebook   |     whatsapp    |   12  |
 |    tinder    |     whatsapp    |   1   |
 |   facebook   |     facebook    |   16  |
 |   whatsapp   |     whatsapp    |   32  |
 |   whatsapp   |     facebook    |   1   |
 |   facebook   |      tinder     |   7   |
 +--------------+-----------------+-------+
 [8 rows x 3 columns],
 'f1_score': 0.7623931623931623,
 'log_loss': 1.142729628980789,
 'precision': 0.8143284187059837,
 'recall': 0.780952380952381,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 	class	int
 
 Rows: 300003
 
 Data:
 +-----------+----------------+--------