In [64]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [102]:
dataframe = pd.read_json('../scraper/posts.json')
posts_data = dataframe.drop(['post_id', 'thread_id', 'user_name', 'msg', 'reply_to'], axis=1)

posts_data

Unnamed: 0,root,user_id,user_level,is_newbie,is_not_push_post,like_count,dislike_count
0,False,252142,10,False,False,69,0
1,False,288093,10,True,False,20,0
2,False,19842,10,False,True,5,0
3,False,303977,10,True,False,3,2
4,False,61776,10,False,True,1,1
...,...,...,...,...,...,...,...
11315,False,41584,10,False,False,0,0
11316,False,49456,10,False,False,0,0
11317,False,294182,10,True,False,0,0
11318,False,185565,10,False,False,1,0


In [103]:
import networkx as nx
import json

G = nx.DiGraph()

with open('../scraper/posts.json') as f:
  posts = json.load(f)

for post in posts:
    # print(post)
    if post['root'] is True:
        G.add_node(post['user_id'])
    else:
        if post['is_not_push_post'] is True:
            G.add_edge(post['user_id'], post['reply_to'], weight=0.3)
        else:
            G.add_edge(post['user_id'], post['reply_to'], weight=1)
            
hub, aut = nx.hits(G)

In [104]:
hub = {int(k):v for k,v in hub.items()}
aut = {int(k):v for k,v in aut.items()}

posts_data['hub_score'] = posts_data['user_id'].map(hub)
posts_data['aut_score'] = posts_data['user_id'].map(aut)

KNOWN_TROLLS = ['41853','194398','76776','71393','30019','37596','63950','310982','282494','72457','326966','322121','322637','121095','169','213597','226126','245201','159448','149494','181778','324892','273582','118226','240250','316329','77788','219767','71341','12186','228352','75196','28435','25002','247732','322304','258598','133525','14941','27416','222907','48631','158008','289946','96230','40470','186800','30788','288523','254191','84585','85242','123870','89514','170281','103799','64699','49699','299923','71140','122676','61748','202464','288351','234658','277044','149978','6237','98232','69484','165028','328913','132380','146030','331780','266100','52773','275585','249366','41351','273474','39751','51615','326708','213952','1210','270563','241151','70493','335209','276916','232237','94819','78468','203248','100028','291185','95649','332154','83060','264328','244624','105890','5591','171664','23721','129619','326414','14965','51985','191271','253253','25771','153956','32583','72825','336250','58089','70734','124277','173333','155597','137604','276602','41818','162674','151378','295632','147880','314774','57035','88029','97104','216114','64295','28833','203010','264400','273157','238973','57405','28870','132128','199261','126609','65530','172349','173849','192248','261633','127675','34757','166655','228589','343059','144849']
KNOWN_TROLLS = list(map(int, KNOWN_TROLLS))

posts_data['troll'] = posts_data['user_id'].apply(lambda x: 0 if (x not in KNOWN_TROLLS) else 1)

posts_data['root'] = posts_data['root'].apply(lambda x: 1 if x else 0)
posts_data['is_newbie'] = posts_data['is_newbie'].apply(lambda x: 1 if x else 0)
posts_data['is_not_push_post'] = posts_data['is_not_push_post'].apply(lambda x: 1 if x else 0)

posts_data = posts_data.sample(frac=1).reset_index(drop=True)
posts_data = posts_data.drop(['user_id'], axis=1)

posts_data

Unnamed: 0,root,user_level,is_newbie,is_not_push_post,like_count,dislike_count,hub_score,aut_score,troll
0,0,10,0,0,0,0,2.072898e-04,0.000000,0
1,0,10,0,0,0,0,7.813763e-142,0.000000,0
2,0,10,0,0,0,0,5.602247e-04,0.000000,0
3,0,10,0,0,0,0,1.091462e-04,0.000051,0
4,0,10,0,0,0,0,6.144361e-04,0.000000,0
...,...,...,...,...,...,...,...,...,...
11315,0,10,0,0,0,0,3.529349e-04,0.000000,0
11316,0,10,0,0,0,0,4.961434e-04,0.000000,0
11317,0,10,0,0,2,0,2.717987e-05,0.000000,0
11318,0,10,0,0,0,0,3.696238e-04,0.001163,0


In [105]:
train, test = train_test_split(posts_data, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7244 train examples
1812 validation examples
2264 test examples


In [106]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('troll')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [108]:
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [109]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of targets:', label_batch )

Every feature: ['root', 'user_level', 'is_newbie', 'is_not_push_post', 'like_count', 'dislike_count', 'hub_score', 'aut_score']
A batch of targets: tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(64,), dtype=int32)


In [110]:
feature_columns = []

# numeric cols
for header in ['user_level', 'like_count', 'dislike_count', 'hub_score', 'aut_score']:
  feature_columns.append(feature_column.numeric_column(header))

feature_columns.append(feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list(
      'is_newbie', [0, 1])))

feature_columns.append(feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list(
      'is_not_push_post', [0, 1])))

feature_columns.append(feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list(
      'root', [0, 1])))


In [112]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [113]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [116]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(0.6),
  layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

Train for 227 steps, validate for 57 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x149d66a10>

In [117]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.9748233


NameError: name 'tfdocs' is not defined