## Data collection

In [3]:
import json

In [110]:
# Link to the dataset: https://www.kaggle.com/dataturks/dataset-for-detection-of-cybertrolls

# Location of the downloaded dataset. The file is parsed as JSON Lines:
# every non-empty line is decoded as one standalone JSON document.
dataset_path = './Desktop/DATASETS/Cyber-Trolls.json'

# A context manager guarantees the file handle is closed once loading ends.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads rejects them.
with open(dataset_path, 'r', encoding='utf-8') as dataset_file:
    data = [json.loads(line) for line in dataset_file if line.strip()]

In [24]:
# Start with two independent, empty containers: one for the tweet texts and
# one for their integer class labels.
tweets, labels = [], []

In [29]:
# Rebuild both lists from `data` rather than appending to existing ones, so
# re-running this cell cannot duplicate samples.
# Each record contributes its 'content' text and the first element of its
# annotation's 'label' field, converted to int.
tweets = [data_sample['content'] for data_sample in data]
labels = [int(data_sample['annotation']['label'][0]) for data_sample in data]

# The vectorizer cells further down read a variable named `corpus`.
# NOTE(review): the tweet texts are the only document collection built in this
# notebook, so `corpus` is taken to be that list — confirm.
corpus = tweets

## Tokenization, stopword removal, and stemming

In [33]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer

In [34]:
# Stemmer applied to every token that survives stopword filtering.
ps = PorterStemmer()

# English stopwords, held in a set for fast membership checks.
sw = set()
sw.update(stopwords.words('english'))

# Tokens are maximal runs of ASCII letters; digits, punctuation and
# whitespace act as separators and are discarded.
tokenizer = RegexpTokenizer(pattern='[a-zA-Z]+')

In [50]:
def filter_words(words):
    """Remove stopwords from `words` and stem every remaining token.

    Args:
        words: iterable of tokens; each is checked against the module-level
            stopword set `sw` exactly as given (no case folding here).

    Returns:
        A new list with the stemmed non-stopword tokens, in input order.
    """
    stemmed_tokens = []
    for token in words:
        if token in sw:
            # Stopword: dropped before stemming.
            continue
        stemmed_tokens.append(ps.stem(token))
    return stemmed_tokens

In [40]:
def myTokenizer(sentence):
    """Turn a raw sentence into a list of stemmed, stopword-free tokens.

    The sentence is lower-cased, split by the module-level `tokenizer`, and
    the resulting tokens are passed through `filter_words`.

    Args:
        sentence: the text to process (must support `.lower()`).

    Returns:
        Whatever `filter_words` returns for the extracted tokens.
    """
    lowered = sentence.lower()
    return filter_words(tokenizer.tokenize(lowered))

In [44]:
# Sanity check: show one tweet next to its tokenized form.
sample_tweet = tweets[1]
print("Original tweet:", sample_tweet)
print("Tokenized tweet:", myTokenizer(sample_tweet))

Original tweet:  She is as dirty as they come  and that crook Rengel  the Dems are so fucking corrupt it's a joke. Make Republicans look like  ...
Tokenized tweet: ['dirti', 'come', 'crook', 'rengel', 'dem', 'fuck', 'corrupt', 'joke', 'make', 'republican', 'look', 'like']


## Building a common vocabulary and vectorizing the documents

In [54]:
# Method 1: Using count vectorizer

from sklearn.feature_extraction.text import CountVectorizer

In [87]:
# Bag-of-words vectorizer that delegates tokenization to myTokenizer.
# token_pattern=None: the default regex is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit an
# "unused parameter" UserWarning at fit time.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)

In [88]:
# Learn the vocabulary from `corpus` and count term occurrences per document.
count_matrix = cv.fit_transform(corpus)
# Convert the fit_transform result into a dense array.
vectorized_corpus = count_matrix.toarray()

In [106]:
# Number of distinct terms the count vectorizer learned.
count_vocab_size = len(cv.vocabulary_)
print(count_vocab_size)

12723


In [99]:
# Method 2: Using tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [100]:
# TF-IDF vectorizer that delegates tokenization to myTokenizer.
# token_pattern=None: the default regex is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit an
# "unused parameter" UserWarning at fit time.
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer, token_pattern=None)

In [101]:
# Learn the vocabulary and IDF weights from `corpus`, then weight each document.
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Convert the fit_transform result into a dense array.
vc = tfidf_matrix.toarray()

In [107]:
# Number of distinct terms the TF-IDF vectorizer learned.
tfidf_vocab_size = len(tfidf_vectorizer.vocabulary_)
print(tfidf_vocab_size)

12723
