In [1]:
import re
import string
import numpy as np 
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression as lr

In [2]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs
df=pd.read_json("D:\B.E\sem7\Project work stage1\cyberbullying\Dataset for Detection of Cyber-Trolls.json",lines=True)

In [3]:
df["label"] = df.annotation.apply(lambda x: x.get('label'))
df["label"] = df.label.apply(lambda x: x[0])
df['label']=df['label'].astype(int)
df.head()

Unnamed: 0,content,annotation,extras,label
0,Get fucking real dude.,"{'notes': '', 'label': ['1']}",,1
1,She is as dirty as they come and that crook ...,"{'notes': '', 'label': ['1']}",,1
2,why did you fuck it up. I could do it all day...,"{'notes': '', 'label': ['1']}",,1
3,Dude they dont finish enclosing the fucking s...,"{'notes': '', 'label': ['1']}",,1
4,WTF are you talking about Men? No men thats n...,"{'notes': '', 'label': ['1']}",,1


In [4]:
df.drop(['extras'],inplace=True,axis=1)
df.head()

Unnamed: 0,content,annotation,label
0,Get fucking real dude.,"{'notes': '', 'label': ['1']}",1
1,She is as dirty as they come and that crook ...,"{'notes': '', 'label': ['1']}",1
2,why did you fuck it up. I could do it all day...,"{'notes': '', 'label': ['1']}",1
3,Dude they dont finish enclosing the fucking s...,"{'notes': '', 'label': ['1']}",1
4,WTF are you talking about Men? No men thats n...,"{'notes': '', 'label': ['1']}",1


In [5]:
X=df["content"]
y=df["label"]

In [6]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)



In [7]:
len(X_test)

4000

In [9]:
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

# Note that we build vocab using training data
for tweet in X_train: 
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab: 
            Vocab[word] = len(Vocab)
    
print("Total words in vocab are",len(Vocab))
display(Vocab)

Total words in vocab are 12140


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'hate': 3,
 'know': 4,
 'french': 5,
 '...': 6,
 'use': 7,
 'fuck': 8,
 'shit': 9,
 'even': 10,
 'right': 11,
 'older': 12,
 'men': 13,
 'hit': 14,
 'forget': 15,
 'extra': 16,
 '2': 17,
 'liter': 18,
 'gotta': 19,
 'dunno': 20,
 'saw': 21,
 'go': 22,
 'top': 23,
 'stair': 24,
 'seem': 25,
 'like': 26,
 'ass': 27,
 'slide': 28,
 'clue': 29,
 "i'v": 30,
 'stumbl': 31,
 'upon': 32,
 'laugh': 33,
 'might': 34,
 'yeah': 35,
 'suck': 36,
 '6': 37,
 'yr': 38,
 'realiz': 39,
 'dont': 40,
 'get': 41,
 'snow': 42,
 'ice': 43,
 'nobodi': 44,
 'drive': 45,
 'rain': 46,
 'let': 47,
 'alon': 48,
 'bitch': 49,
 'craaazzzi': 50,
 'p': 51,
 ':)': 52,
 'peopl': 53,
 'word': 54,
 'bother': 55,
 'constant': 56,
 'receiv': 57,
 'giver': 58,
 'make': 59,
 'prepar': 60,
 'prison': 61,
 'blagojevich': 62,
 'new': 63,
 'campaign': 64,
 'slogan': 65,
 'rhyme': 66,
 'damnit': 67,
 'start': 68,
 'short': 69,
 'stori': 70,
 'dentist': 71,
 'flight': 72,
 'delay': 73,

In [10]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    '''
    Input: 
        tweet - A string containing a tweet
        vocab_dict - The words dictionary
        unk_token - The special string for unknown tokens
        verbose - Print info durign runtime
    Output:
        tensor_l - A python list with
        
    '''  
    
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    # Process the tweet into a list of words
    # where only important words are kept (stop words removed)
    word_l = process_tweet(tweet)
    
    if verbose:
        print("List of words from the processed tweet:")
        print(word_l)
        
    # Initialize the list that will contain the unique integer IDs of each word
    tensor_l = []
    
    # Get the unique integer ID of the __UNK__ token
    unk_ID = vocab_dict[unk_token]
    
    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_ID}")
        
    # for each word in the list:
    for word in word_l:
        
        # Get the unique integer ID.
        # If the word doesn't exist in the vocab dictionary,
        # use the unique ID for __UNK__ instead.
        word_ID = vocab_dict[word] if word in vocab_dict else unk_ID
    ### END CODE HERE ###
        
        # Append the unique integer ID to the tensor list.
        tensor_l.append(word_ID) 
    
    return tensor_l

In [13]:
print("Actual tweet is\n", X_train[11284])
print("\nTensor of tweet:\n", tweet_to_tensor(X_train[11284], vocab_dict=Vocab))

Actual tweet is
 loves 1 Guy 1 Cup! She just admitted her lust for it! (@kristinaking ISO man who breaks glass in ass)

Tensor of tweet:
 [200, 312, 447, 312, 248, 192, 7045, 12134, 180, 1461, 2881, 27]


NameError: name 'val_pos' is not defined

In [12]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0.0),0)
    assert(x.shape == (1, 3))
    return x

In [62]:
tmp1 = extract_features(X_train[11284], freqs)
print(tmp1)

[[1.000e+00 1.421e+03 1.425e+03]]


In [64]:
x=np.zeros((len(X_train),3))


array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [74]:
j=0
for i in X_train:
    x[j,:]=extract_features(i,freqs)
    j+=1

In [78]:
clf=lr()

In [81]:
clf.fit(x,y_train)

LogisticRegression()

In [83]:
clf.predict([x[5,:]])

array([0])

In [94]:
def predict_tweet(tweet, freqs):
    x = extract_features(tweet,freqs)
    y_pred = clf.predict(x)
    return y_pred

In [95]:
result=[]
for i in tqdm(X_test):
    result.append(predict_tweet(i,freqs))

100%|█████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:05<00:00, 689.78it/s]


In [108]:
correct_prediction=0
wrong_prediction=0
for i in range(4000):
    if result[i]==y_teet[i]:
        correct_prediction+=1
    else:
        wrong_prediction+=1


In [109]:
correct_prediction

2620

In [110]:
wrong_prediction

1380

In [112]:
4000-2620

1380

In [113]:
1320/4000

0.33

In [115]:
2620/4000

0.655