In [1]:
import pandas as pd
import numpy as np
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import preprocessing
from scipy import sparse

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


## Load Data and Train/Test Split(s)

In [2]:
df=pd.read_json("../data/merged_troll_data.json")

In [3]:
df.shape

(332504, 8)

In [4]:
df.sample(5).head()

Unnamed: 0,content,followers,following,retweet,account_category,created_at,troll,orig_index
323796,RT @realDonaldTrump: Here are Hillary Clinton'...,3346,4064,1,NonTroll,2016-10-10 06:19:52,False,562329
328722,"He’s voting for me over Trump, but don’t tell ...",7318,6312,0,NonTroll,2016-08-02 01:37:11,False,52670
59203,The truth he spoke https://t.co/QoXu4wVo42,802,708,1,LeftTroll,2016-09-08 15:59:00,True,58881
208580,RT @realDonaldTrump: I was never a fan of Coli...,211,317,1,NonTroll,2016-09-15 04:30:08,False,16964
251418,RT @jennmcallister: Do women who support Trump...,226,378,1,NonTroll,2016-10-08 04:15:51,False,38252


In [5]:
ids=pd.read_json("../data/train_test_inds.json")

In [6]:
len(ids.random.train)

266003

## Prepare feature matrix

### Isolate matrices

In [7]:
def getxy(ids, feature_cols=['content', 'followers', 'following', 'retweet'], label_col=['troll']):
    return df[feature_cols].iloc[ids], df[label_col].iloc[ids]

In [8]:
# random
Xrand_train, yrand_train = getxy(ids.random.train)
Xrand_val, yrand_val = getxy(ids.random.val)
Xrand_test, yrand_test = getxy(ids.random.test)

# temporal
Xtemp_train, ytemp_train = getxy(ids.temporal.train)
Xtemp_val, ytemp_val = getxy(ids.temporal.val)
Xtemp_test, ytemp_test = getxy(ids.temporal.test)

In [9]:
Xrand_train.head()

Unnamed: 0,content,followers,following,retweet
204024,RT @businessinsider: OBAMA: The press doesn’t ...,14525,3311,1
45854,Review: Generation Startup https://t.co/lej8O8...,3086,2387,1
199686,RT @Kidrambler: @TomiLahren Vote for Gary John...,1117,3742,1
115712,in interpersonal relations with pple who are m...,936,582,1
245728,RT @PeterTownsend7: The Real #WarOnWomen #isi...,2891,1615,1


In [10]:
Xrand_train.shape, yrand_train.shape

((266003, 4), (266003, 1))

### Tokenize content

In [11]:
vocab_size=5000
tokenizer=feature_extraction.text.CountVectorizer(stop_words='english', max_features=vocab_size)
tokenizer=tokenizer.fit(df['content'])

In [12]:
Xrand_train_tok=tokenizer.transform(Xrand_train['content'])
Xrand_val_tok=tokenizer.transform(Xrand_val['content'])
Xrand_test_tok=tokenizer.transform(Xrand_test['content'])

Xtemp_train_tok=tokenizer.transform(Xtemp_train['content'])
Xtemp_val_tok=tokenizer.transform(Xtemp_val['content'])
Xtemp_test_tok=tokenizer.transform(Xtemp_test['content'])

In [13]:
Xrand_train_tok.shape # token matrix dim = n x vocab_size

(266003, 5000)

### Standardize followers/following

In [14]:
# one for each split
rand_scaler = preprocessing.StandardScaler().fit(Xrand_train[['followers','following']])
temp_scaler = preprocessing.StandardScaler().fit(Xtemp_train[['followers','following']])

In [15]:
print('rand means and scales: {}, {}'.format(rand_scaler.mean_, rand_scaler.scale_))
print('temp means and scales: {}, {}'.format(temp_scaler.mean_, rand_scaler.scale_))

rand means and scales: [8154.90645218 3016.03233422], [219679.05451009   7816.52064337]
temp means and scales: [8757.68069533 3020.22409146], [219679.05451009   7816.52064337]


They are very close. Could probably just use a single one, but I will use both anyways, in case it makes a difference.

In [16]:
col_to_std = ['followers', 'following']
Xrand_train[col_to_std]=rand_scaler.transform(Xrand_train[col_to_std])
Xrand_val[col_to_std]=rand_scaler.transform(Xrand_val[col_to_std])
Xrand_test[col_to_std]=rand_scaler.transform(Xrand_test[col_to_std])

Xtemp_train[col_to_std]=temp_scaler.transform(Xtemp_train[col_to_std])
Xtemp_val[col_to_std]=temp_scaler.transform(Xtemp_val[col_to_std])
Xtemp_test[col_to_std]=temp_scaler.transform(Xtemp_test[col_to_std])

In [17]:
Xrand_train[col_to_std].head()

Unnamed: 0,followers,following
204024,0.028997,0.037736
45854,-0.023074,-0.080475
199686,-0.032037,0.092876
115712,-0.032861,-0.311396
245728,-0.023962,-0.17924


### Binarize the boolean outcome

In [18]:
yrand_train.head()

Unnamed: 0,troll
204024,False
45854,True
199686,False
115712,True
245728,False


In [19]:
bool_to_bin = lambda x: 1 if x else 0
yrand_train['troll'] = yrand_train['troll'].apply(bool_to_bin)
yrand_train.head()

Unnamed: 0,troll
204024,0
45854,1
199686,0
115712,1
245728,0


In [20]:
yrand_val['troll'] = yrand_val['troll'].apply(bool_to_bin)
yrand_test['troll'] = yrand_test['troll'].apply(bool_to_bin)

ytemp_train['troll'] = ytemp_train['troll'].apply(bool_to_bin)
ytemp_val['troll'] = ytemp_val['troll'].apply(bool_to_bin)
ytemp_test['troll'] = ytemp_test['troll'].apply(bool_to_bin)

### Concatenate features

In [21]:
def concatenate_features(tok_matrix, data_df):
    """ concatenate the tokenized matrix (scipy.sparse) with other features """
    sparse_cols = sparse.csr_matrix(data_df[['followers', 'following', 'retweet']])
    combined = sparse.hstack([tok_matrix, sparse_cols])
    return combined

In [22]:
Xrand_train_combined = concatenate_features(Xrand_train_tok, Xrand_train)
Xrand_val_combined = concatenate_features(Xrand_val_tok, Xrand_val)
Xrand_test_combined = concatenate_features(Xrand_test_tok, Xrand_test)

Xtemp_train_combined = concatenate_features(Xtemp_train_tok, Xtemp_train)
Xtemp_val_combined = concatenate_features(Xtemp_val_tok, Xtemp_val)
Xtemp_test_combined = concatenate_features(Xtemp_test_tok, Xtemp_test)

## Train the model(s)

### Using only text

In [24]:
# random split
logit_rand = linear_model.LogisticRegression().fit(Xrand_train_tok, yrand_train['troll'])

In [25]:
logit_rand.score(Xrand_val_tok, yrand_val['troll'])

0.9000601503759399

In [26]:
logit_rand.score(Xrand_test_tok, yrand_test['troll'])

0.9054464527382635

In [28]:
# temporal split
logit_temp = linear_model.LogisticRegression().fit(Xtemp_train_tok, ytemp_train['troll'])

In [29]:
logit_temp.score(Xtemp_val_tok, ytemp_val['troll'])

0.8973533834586466

In [30]:
logit_temp.score(Xtemp_test_tok, ytemp_test['troll'])

0.8965444648281254

### Using all features

In [32]:
# random split
logit_rand_all = linear_model.LogisticRegression().fit(Xrand_train_combined, yrand_train['troll'])

In [35]:
logit_rand_all.score(Xrand_val_combined, yrand_val['troll'])

0.9673383458646616

In [36]:
logit_rand_all.score(Xrand_test_combined, yrand_test['troll'])

0.9657153168325765

In [37]:
# temporal split
logit_temp_all = linear_model.LogisticRegression().fit(Xtemp_train_combined, ytemp_train['troll'])

In [38]:
logit_temp_all.score(Xtemp_val_combined, ytemp_val['troll'])

0.953593984962406

In [39]:
logit_temp_all.score(Xtemp_test_combined, ytemp_test['troll'])

0.9577757059938047

## Summary of results


- Using only the text features, logistic regression is amazingly able to classify trolls with ~90.5% accuracy on the random test set and ~89.6% accuracy on the temporal test set.

- If we include followers/following/retweets, logistic regression is able to classify trolls with ~96.6% accuracy on the random test set and ~95.8% accuracy on the temporal test set.

- Amazingly, logistic regression fits the data in about 5 seconds on a 6-core machine.