In [82]:
import glob
import os
import pandas as pd
import copy
import datetime
import json
from functools import reduce
from time import time
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from xgboost import XGBClassifier

# CONSTS

In [2]:
DATA_PATH = r".\data\raw\*.csv"

TOP_DOMAIN_USAGES = 100
DOMAIN_COUNT_THRESHOLD = 1

NUMBER_OF_QUERIES_EACH_CHUNK = 750
NUMBER_OF_USERS = 15

LEARNING_CHUNKS = 50
TESTING_CHUNKS = 40
USEABLE_CHUNKS = LEARNING_CHUNKS + TESTING_CHUNKS

COMMON_DOMAIN_COUNT = 100

In [3]:
users_data = []
for file in glob.glob(DATA_PATH):
    df = pd.DataFrame()
    df = df.from_csv(file)
    user_id = int(os.path.basename(file).split('.')[0].split('_user')[1])
    df['user'] = pd.Series([user_id] * len(df.index), index=df.index)
    users_data.append(df)

  after removing the cwd from sys.path.
  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):


# Explore Data

In [4]:
len(users_data)

15

In [5]:
print("number of columns: {}\n columns: {}\n".format(len(users_data[0].columns), users_data[0].columns))

number of columns: 27
 columns: Index(['frame.time_epoch', 'frame.time_relative', 'ip.dst', 'ip.src',
       'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport',
       'dns.qry.type', 'dns.qry.name', 'dns.resp.type', 'dns.resp.name',
       'dns.resp.ttl', 'dns.a', 'dns.aaaa', 'dns.cname', 'dns.flags',
       'dns.flags.response', 'dns.flags.rcode', 'dns.count.queries',
       'dns.count.answers', 'dns.count.auth_rr', 'dns.count.add_rr',
       'dns.soa.mname', 'dns.srv.name', 'dns.time', 'user'],
      dtype='object')



In [6]:
def calculate_domains_stats(): 
    sum = 0
    min = 99999999
    max = 0
    for user in users_data:
        sum += len(user.index)
        if min > len(user.index):
            min = len(user.index)
        if max < len(user.index):
            max = len(user.index)
        # print("rows: {}".format(len(user.index)))
    print("rows avg: {}".format(sum / len(users_data)))
    print("rows min: {}".format(min))
    print("rows max: {}".format(max))
calculate_domains_stats()

rows avg: 210466.86666666667
rows min: 141471
rows max: 328846


# Explore Domains

In [7]:
domains_per_user = []
for s in [user['dns.qry.name'].unique() for user in users_data]:
    domains_per_user.append(set(s))

In [8]:
known_domains = pd.Series(reduce(lambda x, y: list(set(x).union(y)), domains_per_user))
known_domains = known_domains.unique()
print("Number of different domains in all data : {}".format(len(known_domains)))

Number of different domains in all data : 47044


In [9]:
# avg of unique domains per user:
sum_domains = reduce(lambda x, y: x + y, [len(user['dns.qry.name'].unique()) for user in users_data])
min_domains = reduce(lambda x, y: min(x, y), [len(user['dns.qry.name'].unique()) for user in users_data])
max_domains = reduce(lambda x, y: max(x, y), [len(user['dns.qry.name'].unique()) for user in users_data])
print("Average of domains without duplication per user : {}".format(sum_domains / len(users_data)))
print("Min of domains without duplication per user : {}".format(min_domains))
print("Max of domains without duplication per user : {}".format(max_domains))

Average of domains without duplication per user : 4236.533333333334
Min of domains without duplication per user : 2534
Max of domains without duplication per user : 6006


In [10]:
# avg time for pcap per user
users_sniff_time = [datetime.timedelta(seconds=int(user['frame.time_relative'].max())) for user in users_data]
print("Average of sniff time per user : {}".format(
    reduce(lambda x, y: x + y, users_sniff_time) / len(users_sniff_time)))
print("Min of sniff time per user : {}".format(min(users_sniff_time)))
print("Max of sniff time per user : {}".format(max(users_sniff_time)))

Average of sniff time per user : 36 days, 5:47:32.066667
Min of sniff time per user : 30 days, 19:32:27
Max of sniff time per user : 57 days, 21:39:32


In [11]:
# domain used only by specific user
def calc_unique_domains():
    user_sum = 0
    for index, user in enumerate(domains_per_user):
        user_domains = copy.deepcopy(user)
        for u in domains_per_user[:index]:
            user_domains -= u

        for u in domains_per_user[index + 1:]:
            user_domains -= u

        user_sum += len(user_domains)
        print("Unique domain for user {} is {}".format(index, len(user_domains)))
    
    print("Average of domains only used by specific users : {}".format(user_sum / len(users_data)))
calc_unique_domains()

Unique domain for user 0 is 1086
Unique domain for user 1 is 4152
Unique domain for user 2 is 2334
Unique domain for user 3 is 5351
Unique domain for user 4 is 1981
Unique domain for user 5 is 3503
Unique domain for user 6 is 2620
Unique domain for user 7 is 1856
Unique domain for user 8 is 2949
Unique domain for user 9 is 1909
Unique domain for user 10 is 5092
Unique domain for user 11 is 1718
Unique domain for user 12 is 1535
Unique domain for user 13 is 4435
Unique domain for user 14 is 2667
Average of domains only used by specific users : 2879.2


In [12]:
# top domain usages:
all_domains = []
for s in [user['dns.qry.name'].unique() for user in users_data]:
    all_domains.extend(s)
all_domains

['android.clients.google.com',
 'urlauth.ksmobile.net',
 'streamer.bizportal.co.il',
 'app.woorlds.com',
 'logjam.cyngn.com',
 'ws.pushapps.mobi',
 'www.quickode.com',
 'www.googleapis.com',
 'ssl.gstatic.com',
 'docs.google.com',
 'clients3.google.com',
 'csi.gstatic.com',
 'lh5.googleusercontent.com',
 'lh6.googleusercontent.com',
 'lh3.googleusercontent.com',
 'up.cm.ksmobile.com',
 'edge-mqtt.facebook.com',
 'news.google.com',
 'mclients.googleapis.com',
 'touch.www.linkedin.com',
 'graph.facebook.com',
 'a.config.skype.com',
 'inbox.google.com',
 'api.crittercism.com',
 'gemiuscookie.hit.gemius.pl',
 'upoll.umengcloud.com',
 'play.googleapis.com',
 'spengler.mobilethreat.net',
 'appinfocdn.ksmobile.net',
 'settings.crashlytics.com',
 'decide.mixpanel.com',
 'infoc2.duba.net',
 'device-api.urbanairship.com',
 '1e58de17.api.splkmobile.com',
 '130.231.215.54.in-addr.arpa',
 'utop.umengcloud.com',
 'adash.m.taobao.com',
 'secure-drm.imrworldwide.com',
 'clients4.google.com',
 'svcs.dr

In [14]:
common_domains = {}
for domain in known_domains:
    common_domains[domain] = len([user for user in domains_per_user if domain in user])

common_domains_below_threshold = {key: value for key, value in common_domains.items() if
                                  value <= DOMAIN_COUNT_THRESHOLD}
common_domains_without_dot = {key: value for key, value in common_domains.items() if '.' not in key}
common_domains_above_threshold = {key: value for key, value in common_domains.items() if value > 1}
suspicious_domains = set(common_domains_without_dot.keys()) & set(common_domains_below_threshold.keys())
valid_domains = set(common_domains.keys()) - suspicious_domains

print("Number of suspicious domains - with less then {} usage and without dot: {}. Exmaples:\n{}\n"
      .format(DOMAIN_COUNT_THRESHOLD, len(suspicious_domains), list(suspicious_domains)[:5]))

Number of suspicious domains - with less then 1 usage and without dot: 35236. Exmaples:
['hjijplvcblqun', 'tdwiqlwplrjqj', 'ujckfbdepynvpxk', 'seuyrwcems', 'sezuecy']



In [21]:
def get_domain_usage_count():
    domains_usage_count = {}
    all_packets = users_data[0].copy()
    for user in users_data[1:]:
        relevant_user_packet = user[:LEARNING_CHUNKS * NUMBER_OF_QUERIES_EACH_CHUNK] # comment this if you want to use queries from all data
        all_packets = all_packets.append(relevant_user_packet, ignore_index=True)

    all_relevant_packets = all_packets
    all_relevant_domains = all_relevant_packets['dns.qry.name'].value_counts().index
    domains_usage_in_relevant_packets = all_relevant_packets['dns.qry.name'].value_counts()

    if True:
        for domain in valid_domains:
            domains_usage_count[domain] = 0
            if domain in all_relevant_domains:
                domains_usage_count[domain] += int(domains_usage_in_relevant_packets[domain])

        with open("./data/domains_usage_count_only_train_data.json", 'w') as h:
            json.dump(domains_usage_count, h)
    else:
        with open("./data/domains_usage_count.json", 'r') as h:
            domains_usage_count = json.load(h)
    
    return domains_usage_count

In [22]:
domains_usage_count = get_domain_usage_count()
domains_usage_count

{'us.my.alibaba.com': 0,
 'www1063.mdotm.com': 0,
 'r6---sn-q4f7dm7l.googlevideo.com': 0,
 'mmi115.whatsapp.net': 4,
 'chatdepot.twitch.tv': 2,
 'mekomit.co.il': 4,
 'psapp-start.dl.playstation.net': 2,
 '65524.0.3.5.20703.rst15.r.skype.net': 4,
 'p4-dghageiot3y4m-5jgjgszwv6cyg5na-482792-i1-v6exp3-ds.metric.gstatic.com': 0,
 'seretil.me': 2,
 'www.middleeasteye.net': 0,
 'e13.whatsapp.net': 875,
 'p4-cqaofin4hk7p6-chkef5svkhbn2kl6-275209-i1-v6exp3-v4.metric.gstatic.com': 0,
 'cdn-1.convertexperiments.com': 6,
 'cdn1.clkmon.com': 2,
 'app.sketch.sc.sonymobile.com': 0,
 'igy-il.disqus.com': 0,
 'mmi218.whatsapp.net': 30,
 'videos-b-16.ak.instagram.com': 18,
 'r20---sn-q0c7dn7k.gvt1.com': 0,
 'r4---sn-q0c7dn7k.googlevideo.com': 4,
 's.ad132m.com': 2,
 '136.178.215.54.in-addr.arpa': 110,
 'mmv283.whatsapp.net': 4,
 'www.underwar.co.il': 0,
 'mgid.com': 2,
 'justwhookid.disqus.com': 0,
 'www.fisheye.co.il': 2,
 'www.maslulim-israel.co.il': 0,
 'tapestry.tapad.com': 14,
 'load.passionfruitad

In [25]:
domains_usage_count_df = pd.DataFrame.from_dict(
    {'domains': list(domains_usage_count.keys()), 'usage': list(domains_usage_count.values())})
domains_usage_count_df = domains_usage_count_df.sort_values('usage', ascending=False)

# Avg of unique domains per user from valid domains:
print("Number of valid domains {}. Examples:".format(len(valid_domains)))
print(domains_usage_count_df.head())

Number of valid domains 11808. Examples:
                          domains  usage
10816          graph.facebook.com  40112
5905               www.google.com  31716
5951              app.woorlds.com  29473
6486   android.clients.google.com  26122
9812     settings.crashlytics.com  25002


In [24]:
print("\n====Statistics with only valid domains:====\n")
# Average of domains without duplication per user : 4236.
domains_without_duplication_per_user = []
for user in users_data:
    domains_without_duplication_per_user.append(len((set(user['dns.qry.name']) - suspicious_domains)))

print("Average of domains without duplication per user : {}"
      .format(reduce(lambda x, y: x + y, domains_without_duplication_per_user) / len(users_data)))
print("Min of domains without duplication per user : {}".format(min(domains_without_duplication_per_user)))
print("Max of domains without duplication per user : {}".format(max(domains_without_duplication_per_user)))


====Statistics with only valid domains:====

Average of domains without duplication per user : 1887.4666666666667
Min of domains without duplication per user : 758
Max of domains without duplication per user : 2981


In [26]:
# Domain used only by specific user
unique_domains_per_user_valid = []
for index, user in enumerate(domains_per_user):
    user_domains = copy.deepcopy(user)
    user_domains -= suspicious_domains
    for u in domains_per_user[:index]:
        user_domains -= u

    for u in domains_per_user[index + 1:]:
        user_domains -= u

    unique_domains_per_user_valid.append(len(user_domains))
    # print("Unique domain for user {} is {}".format(index, len(user_domains)))

print("Average of domains only used by specific users : {}"
      .format(reduce(lambda x, y: x + y, unique_domains_per_user_valid) / len(users_data)))
print("Min of domains only used by specific user : {}".format(min(unique_domains_per_user_valid)))
print("Max of domains only used by specific user : {}".format(max(unique_domains_per_user_valid)))

Average of domains only used by specific users : 530.1333333333333
Min of domains only used by specific user : 74
Max of domains only used by specific user : 1224


In [27]:
# Valid domains which all users use:
valid_domains_used_by_all_users = set()
for domain in valid_domains:
    everyone_use_this_domain = True
    for user in domains_per_user:
        if domain not in user:
            everyone_use_this_domain = False
            break
    if everyone_use_this_domain:
        valid_domains_used_by_all_users.add(domain)

In [36]:
# Remove domain used by all users from domains_usage_count_df
everyone_domains_usage_count_df = pd.DataFrame(columns=['domains', 'usage'])
for domain in valid_domains_used_by_all_users:
    everyone_domains_usage_count_df = pd.concat(
        [everyone_domains_usage_count_df, domains_usage_count_df.loc[domains_usage_count_df['domains'] == domain]])
    domains_usage_count_df.drop(domains_usage_count_df.loc[domains_usage_count_df['domains'] == domain].index,
                                inplace=True)

everyone_domains_usage_count_df = everyone_domains_usage_count_df.sort_values('usage', ascending=False)

In [37]:
domains_usage_count_df

Unnamed: 0,domains,usage
5951,app.woorlds.com,29473
329,api.facebook.com,23487
1043,edge-mqtt.facebook.com,15444
11415,up.cm.ksmobile.com,12723
2481,logjam.cyngn.com,9774
1845,push.parse.com,9586
7817,device-api.urbanairship.com,7657
5174,api.appsflyer.com,6927
6979,2.android.pool.ntp.org,6708
1586,api.dropbox.com,6625


In [38]:
valid_domains

{'us.my.alibaba.com',
 'www1063.mdotm.com',
 'r6---sn-q4f7dm7l.googlevideo.com',
 'mmi115.whatsapp.net',
 'chatdepot.twitch.tv',
 'mekomit.co.il',
 'psapp-start.dl.playstation.net',
 '65524.0.3.5.20703.rst15.r.skype.net',
 'p4-dghageiot3y4m-5jgjgszwv6cyg5na-482792-i1-v6exp3-ds.metric.gstatic.com',
 'seretil.me',
 'www.middleeasteye.net',
 'e13.whatsapp.net',
 'p4-cqaofin4hk7p6-chkef5svkhbn2kl6-275209-i1-v6exp3-v4.metric.gstatic.com',
 'cdn-1.convertexperiments.com',
 'cdn1.clkmon.com',
 'app.sketch.sc.sonymobile.com',
 'igy-il.disqus.com',
 'mmi218.whatsapp.net',
 'videos-b-16.ak.instagram.com',
 'r20---sn-q0c7dn7k.gvt1.com',
 'r4---sn-q0c7dn7k.googlevideo.com',
 's.ad132m.com',
 '136.178.215.54.in-addr.arpa',
 'mmv283.whatsapp.net',
 'www.underwar.co.il',
 'mgid.com',
 'justwhookid.disqus.com',
 'www.fisheye.co.il',
 'www.maslulim-israel.co.il',
 'tapestry.tapad.com',
 'load.passionfruitads.com',
 'p4-bnaiwa6hzndlg-txx4r5tsa4ljnx7d-962642-s1-v6exp3-v4.metric.gstatic.com',
 'p-behacdn.

In [39]:
suspicious_domains

{'hjijplvcblqun',
 'tdwiqlwplrjqj',
 'ujckfbdepynvpxk',
 'seuyrwcems',
 'sezuecy',
 'twsfcslpepma',
 'iugbjexbj',
 'hsegvrkakuf',
 'yvgvzmr',
 'zbmqccedqby',
 'znkdajf',
 'jjaelrex',
 'uncfeqwnhnrb',
 'wcxjxhdde',
 'cdmvmnilsdd',
 'yrixwykz',
 'drvzwmh',
 'ebbtfmkbjhsmnfd',
 'muautqtcoisjwf',
 'dipxrjucdnw',
 'exdxgzw',
 'qyupdniyvn',
 'gkfmzsc',
 'pzrrjht',
 'mueqhgccffx',
 'gyozrvefavwnmk',
 'nciyuaxbitt',
 'pghhteo',
 'gytsrouiz',
 'dqzlldnrtxoeki',
 'xsexyqeh',
 'oyyjmuvhqznbf',
 'zqhyhrtwaxckrnn',
 'vkbtbanpdkogd',
 'clgqpblwnnpqws',
 'ubngwok',
 'omdtwtmemzpd',
 'wfwekuuxtgpbgpd',
 'ggtwdbcnoav',
 'otlzpskvnt',
 'zkkcjcidykyz',
 'pzmcehktui',
 'vvhydbvtrnussiu',
 'yyxrghvizyh',
 'rucygethzhdpekw',
 'cyldfogky',
 'szgvrhcn',
 'bswerjis',
 'hudgzdshnysvgdo',
 'phqbqusdmgke',
 'dyralfgbtm',
 'jxuuosirigshgxo',
 'xybbuhxmjoxkf',
 'esdzdaubvyogn',
 'gydpjevxmwqn',
 'oddborqhlda',
 'aygltbbekvjhzg',
 'ehwxqtleait',
 'qvpcxxoxnxtmjf',
 'kviiguxaskr',
 'mkgnjrfyjxsnhac',
 'mkghgfkmdxod',

# Data To Chunks

In [42]:
users_queries = []
for user in users_data:
    users_queries.append(list(user.loc[user['dns.flags.response'] == 0]['dns.qry.name']))

users_queries_split_by_chunks = []
for user_queries in users_queries:
    number_of_chunks = len(user_queries)
    user_queries_split_by_chunks = []
    for i in range(int(number_of_chunks / NUMBER_OF_QUERIES_EACH_CHUNK) + 1):
        user_queries_split_by_chunks.append(
            user_queries[i * NUMBER_OF_QUERIES_EACH_CHUNK:(1 + i) * NUMBER_OF_QUERIES_EACH_CHUNK])
    users_queries_split_by_chunks.append(user_queries_split_by_chunks)

# Build Main dataframe

In [45]:
"""
***only queries

df:
user-id chunk-number


"""
df = pd.DataFrame(columns=['User', 'Chunk']).set_index(['User', 'Chunk'])

# common domains - number of apperance of most 50 domains(which not everyone use)
top_100_common_domains = list(domains_usage_count_df.iloc[:COMMON_DOMAIN_COUNT]['domains'])
top_100_common_domains = set(top_100_common_domains)
for user_id in range(len(users_queries_split_by_chunks)):
    for chunk_id in range(len(users_queries_split_by_chunks[user_id])):
        for domain in top_100_common_domains:
            df.loc[(user_id, chunk_id), domain] = users_queries_split_by_chunks[user_id][chunk_id].count(domain)

# remove data not needed:
df = df.reset_index()
df = df.loc[df['Chunk'] < USEABLE_CHUNKS]
df = df.set_index(['User', 'Chunk'])

# Create Classifiers

In [76]:
### Add labels ###:
for user_id in range(NUMBER_OF_USERS):
    df['label{}'.format(user_id)] = pd.Series([0] * USEABLE_CHUNKS * user_id + [1] * USEABLE_CHUNKS + [0] * USEABLE_CHUNKS * (NUMBER_OF_USERS - 1 - user_id), index=df.index)

In [84]:
### create train_x train_y
features = df.reset_index()

train_x = features.loc[features['Chunk'] < LEARNING_CHUNKS].set_index(['User', 'Chunk'])  # features without labels (first 50)
train_x = shuffle(train_x)
train_y = pd.DataFrame({"label{}".format(i): list(train_x.pop("label{}".format(i))) for i in range(NUMBER_OF_USERS) }) # only labels (first 50)
train_x = train_x.reset_index()
train_x.pop('User')
train_x.pop('Chunk')

0      48
1      24
2       2
3       0
4      24
5      38
6      16
7       5
8      43
9       7
10     17
11      9
12     33
13     31
14     44
15     39
16      5
17     32
18     45
19     20
20      9
21      1
22     44
23     46
24     30
25     22
26     20
27     10
28     17
29      7
       ..
720    40
721    18
722    28
723    47
724     3
725    41
726    21
727    19
728    32
729     4
730    34
731    17
732    14
733    49
734    47
735    46
736    15
737    36
738    22
739    29
740    43
741    38
742    22
743    12
744    47
745    12
746     3
747    11
748    23
749    11
Name: Chunk, Length: 750, dtype: int64

In [85]:
# Create test_x, test_y
test_x = features.loc[features['Chunk'] >= LEARNING_CHUNKS].set_index(['User', 'Chunk'])  # features with label (last 40)
test_y = pd.DataFrame({"label{}".format(i): list(test_x.pop("label{}".format(i))) for i in
                        range(NUMBER_OF_USERS)})  # only labels (last 40)
test_y2 = pd.DataFrame(columns=['User', 'Chunk']).set_index(['User', 'Chunk']) # only labels (last 40)
for user_id in range(NUMBER_OF_USERS):
    for chunk_id in range(LEARNING_CHUNKS, TESTING_CHUNKS + LEARNING_CHUNKS):
        test_y2.loc[(user_id, chunk_id), 'label'] = user_id
test_x = test_x.reset_index()
test_x.pop('User')
test_x.pop('Chunk')

0      50
1      51
2      52
3      53
4      54
5      55
6      56
7      57
8      58
9      59
10     60
11     61
12     62
13     63
14     64
15     65
16     66
17     67
18     68
19     69
20     70
21     71
22     72
23     73
24     74
25     75
26     76
27     77
28     78
29     79
       ..
570    60
571    61
572    62
573    63
574    64
575    65
576    66
577    67
578    68
579    69
580    70
581    71
582    72
583    73
584    74
585    75
586    76
587    77
588    78
589    79
590    80
591    81
592    82
593    83
594    84
595    85
596    86
597    87
598    88
599    89
Name: Chunk, Length: 600, dtype: int64

In [86]:
### train ###
models = []
t1 = time()
print("Started XGBClassifier in", t1)
for i in range(NUMBER_OF_USERS):
    model = XGBClassifier()
    model.fit(train_x, train_y['label{}'.format(i)])
    models.append(model)
t2 = time()
print('Finished XGBClassifier in ', t2, ' Total time ', t2 - t1, ' sec.')

Started XGBClassifier in 1560259744.989995
Finished XGBClassifier in  1560259749.174502  Total time  4.184506893157959  sec.


# Test Results

In [87]:
# check predictions:
preds = []
for i in range(NUMBER_OF_USERS):
    preds.append(models[i].predict_proba(test_x))

In [88]:
# check accuracy 1:
predictions = pd.DataFrame(columns=["label{}".format(x) for x in range(NUMBER_OF_USERS)])
for chunk_number in range(TESTING_CHUNKS * NUMBER_OF_USERS):
    predicted_user_id = np.argmax([preds[user_id][:,1][chunk_number] for user_id in range(NUMBER_OF_USERS)])
    new_col = {"label{}".format(x):0 for x in range(NUMBER_OF_USERS)}
    new_col["label{}".format(predicted_user_id)] = 1
    predictions = predictions.append(new_col, ignore_index=True)

predictions.eq(test_y.values).mean() # calculate accuracy for each user separately

label0     1.000000
label1     0.998333
label2     1.000000
label3     1.000000
label4     0.998333
label5     0.973333
label6     1.000000
label7     0.998333
label8     0.998333
label9     0.998333
label10    1.000000
label11    0.996667
label12    1.000000
label13    0.975000
label14    1.000000
dtype: float64

In [89]:
# check accuracy 2:
predictions2 = pd.DataFrame(columns=['User', 'Chunk']).set_index(['User', 'Chunk'])
for user_id in range(NUMBER_OF_USERS):
    for chunk_id in range(LEARNING_CHUNKS, TESTING_CHUNKS + LEARNING_CHUNKS):
        real_chunk_id = chunk_id-LEARNING_CHUNKS + user_id * TESTING_CHUNKS
        predicted_user_id = \
            np.argmax([preds[i][:,1][real_chunk_id] for i in range(NUMBER_OF_USERS)])
        predictions2.loc[(user_id, chunk_id), 'label'] = predicted_user_id

accuracy = accuracy_score(predictions2, test_y2)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 96.83%
