In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import h5py
from tqdm import tqdm_notebook as tqdm

import numpy as np
import torch
import json
import pandas as pd
import glob
import os

import tweepy
import sys
import jsonpickle
from sklearn.model_selection import train_test_split

# Pytorch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD
from torch.autograd import Variable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from ._conv import register_converters as _register_converters


In [2]:
import sys
sys.path.insert(0, 'C:\\Users\\Abhimanyu\\Documents\\Coding\\Twitter_AC209a\\group\\troll_classification')
# print(sys.path)

In [3]:
# InferSent setup
from InferSent.models import *
model_version = 1
MODEL_PATH = "../InferSent/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = '../InferSent/dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=10000)

Vocab size : 10000


In [59]:
def GetEmbeddings(sentences, batch_size=128, verbose=False):
    embeddings = model.encode(sentences, bsize=batch_size, tokenize=False, verbose=verbose)
    if verbose: print('nb sentences encoded : {0}'.format(len(embeddings)))
    return embeddings

def GetBatch(df, feature_cols, response,  batch_size=128):
    '''
    Returns a batch of:
    (1) feature_cols
    (2) the 'content' column which contains text of the tweet
    (3) the response column
    '''
    
    df_size = len(df)
    for counter in range(int(len(df)/batch_size)+1):
        yield df[feature_cols].iloc[counter*batch_size:min((counter + 1)*batch_size, len(df))], \
        df['content'].iloc[counter*batch_size:min((counter + 1)*batch_size, len(df))], \
        df[response].iloc[counter*batch_size:min((counter + 1)*batch_size, len(df))]

## Read Data

In [48]:
in_file = '../data/merged_troll_data.json'
data_df = pd.read_json(in_file)

In [49]:
# Describe of whole dataframe
data_df.describe(include='all')

Unnamed: 0,content,followers,following,retweet,account_category,created_at,troll,orig_index
count,332504,332504.0,332504.0,332504.0,332504,332504,332504,332504.0
unique,264783,,,,3,206846,2,
top,RT @realDonaldTrump: Here is my statement. htt...,,,,NonTroll,2016-10-07 07:48:00,True,
freq,298,,,,166252,85,166252,
first,,,,,,2016-07-01 00:00:00,,
last,,,,,,2016-11-10 18:35:32,,
mean,,7935.872,3008.364465,0.74089,,,,177103.724776
std,,206628.5,7711.247498,0.438147,,,,165772.625588
min,,0.0,0.0,0.0,,,,0.0
25%,,612.0,541.0,0.0,,,,42472.0


In [50]:
data_df['troll'] = data_df['troll'].astype(int)

In [51]:
# Add dummy columns for categorical variables
print('Unique values for column Troll:', data_df.troll.unique())
print('Unique values for column Retweet:', data_df.retweet.unique())
print('Unique values for column Acccount Category:', data_df.account_category.unique())

Unique values for column Troll: [1 0]
Unique values for column Retweet: [1 0]
Unique values for column Acccount Category: ['LeftTroll' 'RightTroll' 'NonTroll']


In [52]:
dummy_cols = ['account_category']
for col in dummy_cols:
    data_df['Orig_' + col] = data_df[col]
    data_df = pd.get_dummies(data_df, columns=[col])
list(data_df.columns)

['content',
 'followers',
 'following',
 'retweet',
 'created_at',
 'troll',
 'orig_index',
 'Orig_account_category',
 'account_category_LeftTroll',
 'account_category_NonTroll',
 'account_category_RightTroll']

In [53]:
# Set aside columns to be used as features
feature_cols = [
 'followers',
 'following',
 'retweet',
 'account_category_LeftTroll',
 'account_category_NonTroll',
 'account_category_RightTroll']

In [54]:
# Read the indices file
index_file = '../data/train_test_inds.json'
idx_df = pd.read_json(index_file)
idx_df.head()

Unnamed: 0,random,temporal
test,"[39006, 13901, 54474, 53049, 47299, 59510, 236...","[88515, 145011, 308314, 163777, 165182, 165183..."
train,"[115587, 272344, 110764, 17462, 161923, 109189...","[54215, 140798, 157319, 47942, 140799, 157320,..."
val,"[282180, 316618, 291427, 234801, 273642, 66628...","[286589, 289599, 296694, 287370, 150894, 28757..."


In [101]:
def PrepareDataSplits(mode='random', batch_size=128):
    TEST_NUM = 0
    TRAIN_NUM = 1
    VAL_NUM = 2
    all_idx = list(idx_df[mode])
    sets = ['test', 'train', 'val']
    
    # Standardize continuous columns based on train set statistics
    all_dfs = {}
    all_dfs['train'] = data_df.iloc[all_idx[TRAIN_NUM][0:400]]
    all_dfs['val'] = data_df.iloc[all_idx[VAL_NUM][0:400]]
    all_dfs['test'] = data_df.iloc[all_idx[TEST_NUM][0:400]]
    
    cols_to_standardize = ['followers', 'following']
    for col in cols_to_standardize:
        train_mean = all_dfs['train'][col].mean()
        train_std = all_dfs['train'][col].std()
        for set_type in sets:
            all_dfs[set_type][col] = (all_dfs[set_type][col] - train_mean)/train_std
    
    # Compute embeddings, concatenate with other features, and write to h5py files
    feature_length = 4096 + len(feature_cols)
    for idx, set_type in enumerate(sets):
        embed_file = '../data/' + set_type + '_embeddings_mode_' + mode + '.h5py'
        num_tweets = len(all_idx[idx])
        pbar = tqdm(total=num_tweets)
        with h5py.File(embed_file, "a") as f:
            embed_dset = f.create_dataset('features', (num_tweets, feature_length))
            response_dset = f.create_dataset('is_troll', (num_tweets,))
            counter = 0
            for batch in GetBatch(all_dfs[set_type], feature_cols, 'troll', batch_size=batch_size):
                other_features = batch[0].values
                embeddings_  = GetEmbeddings(batch[1].values, batch_size=batch_size)
                is_troll_ = batch[2].values
                batch_len = other_features.shape[0]
                feature_vec = np.hstack((embeddings_, other_features))
                embed_dset[counter:(counter + batch_len)] = feature_vec
                response_dset[counter:(counter + batch_len)] = is_troll_
                counter += batch_len
                pbar.update(batch_len)

In [102]:
mode = 'random'
batch_size = 512
PrepareDataSplits(mode=mode, batch_size=batch_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


HBox(children=(IntProgress(value=0, max=33251), HTML(value='')))

HBox(children=(IntProgress(value=0, max=266003), HTML(value='')))

HBox(children=(IntProgress(value=0, max=33250), HTML(value='')))

In [105]:
# Read h5py file
def GetArrays(mode, set_type):
    embed_file = '../data/' + set_type + '_embeddings_mode_' + mode + '.h5py'
    with h5py.File(embed_file, "r") as f:
        keys = list(f.keys())
        X = np.array(f[keys[0]])
        y = np.array(f[keys[1]])
    return X, y

In [106]:
X_train, y_train = GetArrays(mode, 'train')
X_val, y_val = GetArrays(mode, 'val')

KeyboardInterrupt: 

In [None]:
def ShuffleArrays(X, y):
    arr = np.arange(X.shape[0])
    np.random.shuffle(arr)
    X = X[arr,:]
    y = y[arr]
    return X, y
def GetTrainBatches(X, y, batch_size=256):
    data_len = X.shape[0]
    for counter in range(int(data_len/batch_size)+1):
        yield X[counter*batch_size:min((counter + 1)*batch_size, data_len), :], \
        y[counter*batch_size:min((counter + 1)*batch_size, data_len)]

In [None]:
class TwitterNet(nn.Module):
    def __init__(self, num_classes):
        super(TwitterNet, self).__init__()
        self.fc1 = nn.Linear(4102,1024)
        self.drop1 = nn.Dropout()
        self.fc2 = nn.Linear(1024,512)
        self.drop2 = nn.Dropout()
        self.fc3 = nn.Linear(512,256)
        self.fc4 = nn.Linear(256,128)
        self.fc5 = nn.Linear(128,num_classes)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.drop1(x)
        x = F.relu(self.fc2(x))
        x = self.drop2(x)
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x

In [None]:
X_train = Variable(torch.FloatTensor(X_train))
y_train = Variable(torch.FloatTensor(y_train))
X_test = Variable(torch.FloatTensor(X_test))
y_test = Variable(torch.FloatTensor(y_test))

In [None]:
criterion = nn.CrossEntropyLoss()
model_net = TwitterNet(2).to(device)
optimizer = SGD(model_net.parameters(), lr = 0.1, momentum=0.9)
num_epochs = 15
batch_size = 256

In [None]:
# Check accuracy on untrained network (with randomly initialized weights)
torch.set_grad_enabled(False)
running_corrects = 0
for inputs, labels in GetTrainBatches(X_test, y_test, batch_size):
    inputs = inputs.to(device)
    labels = labels.to(device)
    outputs = model_net(inputs)
    _, preds = torch.max(outputs, 1)
    running_corrects += torch.sum(preds == labels.long().data)
val_acc = running_corrects.double()/X_test.shape[0]
torch.set_grad_enabled(True)
print('Validation Accuracy on untrained net is {:.2%}'.format(val_acc))

In [None]:
torch.set_grad_enabled(True)
for epoch in range(num_epochs):
    X_train, y_train = ShuffleArrays(X_train, y_train)
    running_loss = 0.0
    train_corrects = 0
    for inputs, labels in GetTrainBatches(X_train, y_train, batch_size):
        inputs = inputs.to(device)
        labels = labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward pass
        outputs = model_net(inputs)
        loss = criterion(outputs, labels.long())
        _, preds = torch.max(outputs, 1)
        train_corrects += torch.sum(preds == labels.long().data)
        
        # backward pass
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    # Validation set accuracy
    train_acc = train_corrects.double()/X_train.shape[0]
    torch.set_grad_enabled(False)
    running_corrects = 0
    for inputs, labels in GetTrainBatches(X_test, y_test, batch_size):
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model_net(inputs)
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels.long().data)
    val_acc = running_corrects.double()/X_test.shape[0]
    torch.set_grad_enabled(True)
    print('Loss after epoch {} is {:.3f}. Train Acc. is {:.2%} and Validation Acc. is {:.2%}'.\
          format(epoch+1, running_loss, train_acc, val_acc))

In [None]:
# Check accuracy on trained network
torch.set_grad_enabled(False)
running_corrects = 0
for inputs, labels in GetTrainBatches(X_test, y_test, batch_size):
    inputs = inputs.to(device)
    labels = labels.to(device)
    outputs = model_net(inputs)
    _, preds = torch.max(outputs, 1)
    running_corrects += torch.sum(preds == labels.long().data)
val_acc = running_corrects.double()/X_test.shape[0]
torch.set_grad_enabled(True)
print('Validation Accuracy on untrained net is {:.2%}'.format(val_acc))

In [None]:
np.sum(y_test[0:100]