In [1]:
import graphlab

In [2]:
# Load apps.csv into an SFrame. Later cells read its 'session_id', 'word' and
# 'timestamp' columns, plus 'app' when that column is present.
apps_data_sf = graphlab.SFrame.read_csv("apps.csv", verbose=False)

This non-commercial license of GraphLab Create for academic use is assigned to jenarvaezg@gmail.com and will expire on December 12, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1503246824.log


In [3]:
# Send GraphLab Canvas output to the notebook ('ipynb' target) for the
# .show() calls in this notebook.
graphlab.canvas.set_target('ipynb')

# Visual overview of the data loaded from apps.csv.
apps_data_sf.show()

In [4]:
def aggregate_by_session_id(sf):
    """Collapse an event-level SFrame into one row per session.

    Args:
        sf: SFrame with 'session_id', 'word' and 'timestamp' columns and,
            optionally, an 'app' column. Rows of a session are consumed in
            the order they appear in ``sf``.
            NOTE(review): the time differences are only meaningful if rows
            are already in chronological order within each session --
            confirm against how apps.csv is produced.

    Returns:
        SFrame with one row per distinct session_id and the columns
        'session_id', 'time_diff_mean', 'time_diff_var', 'words' and, when
        the input has it, 'app' (taken from the session's first row).
        'words' lists the session's words from the last event to the first.
        A session with a single event has no time differences, so its
        'time_diff_mean' and 'time_diff_var' are NaN.
        An input without sessions yields an empty SFrame with no columns.
    """
    import numpy as np

    has_app = "app" in sf.column_names()

    # Accumulate plain Python lists and build the output columns once at the
    # end, instead of appending a one-row SFrame per session.
    session_ids = []
    time_diff_means = []
    time_diff_vars = []
    words_per_session = []
    apps = []

    for session_id in sf['session_id'].unique():
        sf_group = sf[sf['session_id'] == session_id]

        # Materialise the needed columns once; indexing an SFrame row by row
        # performs a separate lookup for every event.
        words = list(sf_group['word'])
        timestamps = list(sf_group['timestamp'])

        # Walk from the last event back to the first so the word order and
        # the order of the differences match the row-by-row traversal.
        words.reverse()
        time_diffs = [timestamps[i] - timestamps[i - 1]
                      for i in range(len(timestamps) - 1, 0, -1)]

        session_ids.append(session_id)
        # np.mean / np.var of an empty list evaluate to NaN (single-event
        # session); float() keeps the stored values plain Python floats.
        time_diff_means.append(float(np.mean(time_diffs)))
        time_diff_vars.append(float(np.var(time_diffs)))
        words_per_session.append(words)

        if has_app:
            apps.append(sf_group['app'][0])

    out_sf = graphlab.SFrame()
    if not session_ids:
        # No sessions at all: return the empty, column-less SFrame.
        return out_sf

    out_sf['session_id'] = session_ids
    out_sf['time_diff_mean'] = time_diff_means
    out_sf['time_diff_var'] = time_diff_vars
    out_sf['words'] = words_per_session
    if has_app:
        out_sf['app'] = apps

    return out_sf
        

In [5]:
def get_normalized_tf_idf(sf):
    """Divide each TF-IDF weight of a session by the session's word count.

    Args:
        sf: a single row given as a mapping with a 'words' entry (sequence of
            the session's words) and a 'tf_idf' entry (dict of word ->
            weight). Despite the parameter name this is one row, as passed
            by a row-wise ``SFrame.apply``.

    Returns:
        A new dict mapping each word of ``sf['tf_idf']`` to its weight
        divided by ``len(sf['words'])``.

    Raises:
        ZeroDivisionError: if 'words' is empty while 'tf_idf' is not.
    """
    # float() forces true division even for integer weights under Python 2.
    n_words = float(len(sf['words']))

    # .items() works on both Python 2 and Python 3 dicts.
    return {word: weight / n_words for word, weight in sf['tf_idf'].items()}

def get_swipe_percentage(bow):
    """Return the fraction of word occurrences in ``bow`` that are swipes.

    A word is counted as a swipe when it contains the substring "->".

    Args:
        bow: dict mapping word -> number of occurrences.

    Returns:
        Swipe occurrences divided by all occurrences, as a float. An empty
        bag (or one whose counts sum to zero) yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total_count = 0
    swipe_count = 0
    # .items() works on both Python 2 and Python 3 dicts.
    for word, times in bow.items():
        total_count += times
        if "->" in word:
            swipe_count += times

    if total_count == 0:
        # Nothing was counted, so there is no meaningful ratio to compute.
        return 0.0
    return swipe_count / float(total_count)

In [6]:
# One row per session: word list, timing statistics and (if present) the app label.
grouped_words_sf = aggregate_by_session_id(apps_data_sf)

# Text features: bag-of-words counts, then TF-IDF weights derived from those counts.
session_bow = graphlab.text_analytics.count_words(grouped_words_sf['words'])
grouped_words_sf['bow'] = session_bow
grouped_words_sf['tf_idf'] = graphlab.text_analytics.tf_idf(session_bow)

# Row-wise: TF-IDF weights divided by the session's word count.
grouped_words_sf['normalized_tf_idf'] = grouped_words_sf.apply(get_normalized_tf_idf)

# Column-wise: share of word occurrences that contain "->".
grouped_words_sf['swipe_percentage'] = session_bow.apply(get_swipe_percentage)


In [7]:
# Visual overview of the per-session feature table built in the previous cell.
grouped_words_sf.show()

# With classifier approach

In [12]:
# Leave-one-out evaluation of a logistic classifier, repeated for every
# candidate feature set. `total` and `feature_sets` are reused by the
# nearest-neighbour cell further down.
total = len(grouped_words_sf)

feature_sets = [
    ['normalized_tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage'],
    ['normalized_tf_idf', 'swipe_percentage'],
    ['tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage'],
    ['tf_idf', 'swipe_percentage'],
    ['tf_idf'],
    ['normalized_tf_idf', 'tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage'],
]

for features in feature_sets:
    n_right = 0
    for i in range(total):
        # Hold out session i (as a one-row SFrame) and train on all the others.
        held_out = grouped_words_sf[i:i + 1]
        training_rows = grouped_words_sf[:i].append(grouped_words_sf[i + 1:])
        loo_model = graphlab.logistic_classifier.create(
            training_rows, target='app', verbose=False,
            features=features, validation_set=None)
        if loo_model.predict(held_out)[0] == held_out['app'][0]:
            n_right += 1

    # Report: feature set, accuracy in percent, then raw hit count and total.
    print(features)
    print(n_right / float(total) * 100)
    print("%s %s" % (n_right, total))
    

['normalized_tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']
80.701754386
46 57
['normalized_tf_idf', 'swipe_percentage']
80.701754386
46 57
['tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']
82.4561403509
47 57
['tf_idf', 'swipe_percentage']
82.4561403509
47 57
['tf_idf']
80.701754386
46 57
['normalized_tf_idf', 'tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']
82.4561403509
47 57


In [23]:
# Single hold-out check: train on roughly 80% of the sessions and evaluate on
# the remainder. The split is seeded so the reported metrics can be
# reproduced on a re-run; with this few sessions the test set is tiny, so the
# numbers vary a lot from one split to another.
SPLIT_SEED = 42
train, test = grouped_words_sf.random_split(0.8, seed=SPLIT_SEED)
features = ['tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage', 'normalized_tf_idf']

model = graphlab.logistic_classifier.create(train, target="app", verbose=False, features=features, validation_set=None)

model.evaluate(test)

{'accuracy': 0.9, 'auc': 0.9707936507936509, 'confusion_matrix': Columns:
 	target_label	str
 	predicted_label	str
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |    tinder    |      tinder     |   3   |
 |   facebook   |     facebook    |   4   |
 |   whatsapp   |     whatsapp    |   2   |
 |   facebook   |      tinder     |   1   |
 +--------------+-----------------+-------+
 [4 rows x 3 columns], 'f1_score': 0.9153439153439153, 'log_loss': 0.5370727102855438, 'precision': 0.9166666666666666, 'recall': 0.9333333333333332, 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 	class	int
 
 Rows: 300003
 
 Data:
 +-----------+-----+-----+---+---+-------+
 | threshold | fpr | tpr | p | n | class |
 +-----------+-----+-----+---+---+-------+
 |    0.0    | 1.0 | 1.0 | 5 | 5 |   0   |
 |   1e-05   | 0.8 | 1.0 | 5 | 5 |   0   |
 |   2e-05   | 0.8 

# With nearest neighbor approach

In [9]:
# Leave-one-out evaluation of a 1-nearest-neighbour lookup (brute force,
# euclidean distance) for every candidate feature set. Relies on
# `feature_sets` and `total` defined in the classifier cell.
for features in feature_sets:
    n_right = 0
    for i in range(len(grouped_words_sf)):
        # Hold out session i (as a one-row SFrame); index every other session.
        held_out = grouped_words_sf[i:i + 1]
        reference_rows = grouped_words_sf[:i].append(grouped_words_sf[i + 1:])
        knn_model = graphlab.nearest_neighbors.create(
            reference_rows, label='app', distance='euclidean',
            method='brute_force', features=features, verbose=False)
        # The label of the closest reference session is the prediction.
        nearest = knn_model.query(held_out, k=1, verbose=False)
        if nearest['reference_label'][0] == held_out['app'][0]:
            n_right += 1

    # Report: feature set, accuracy in percent, then raw hit count and total.
    print(features)
    print(n_right / float(total) * 100)
    print("%s %s" % (n_right, total))


['normalized_tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']
45.6140350877
26 57
['normalized_tf_idf', 'swipe_percentage']
64.9122807018
37 57
['tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']
45.6140350877
26 57
['tf_idf', 'swipe_percentage']
66.6666666667
38 57
['tf_idf']
66.6666666667
38 57
['normalized_tf_idf', 'tf_idf', 'time_diff_mean', 'time_diff_var', 'swipe_percentage']
45.6140350877
26 57


## The best approach seems to be the classifier with only tf_idf; will try again with double the data