## DATA EXPLORATION ##

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the tweets that this notebook treats as spam. Rows containing NaN are
# not removed here; that happens in the following cell.
# NOTE(review): the file name reads like an abbreviation of "legitimate" even
# though the frame is used as the spam set - confirm the right CSV is loaded.
spam_df = pd.read_csv("lgtmt_tweets.csv")

In [3]:
# Discard every row that has a missing value. The name is rebound to the
# filtered frame instead of mutating the loaded frame in place.
spam_df = spam_df.dropna()

In [4]:
# Keep only the tweet text column; the cells below iterate over this Series.
spam = spam_df['Tweet']

In [5]:
len(spam)

2353468

In [6]:
spam[:10]

0    MELBOURNE ENQUIRY: Seeking a variety of acts f...
1    THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...
2    THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...
3    THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...
4    Come to "The Burlesque Bootcamp - Sydney" Satu...
5    21st Century Pinups write about our girls perf...
6    The Burlesque Bootcamp – Coming to Sydney!:   ...
7    ATTN MELBOURNE: We have some group specials av...
8    Story on our lovely Vivi Valentine, on contrac...
9    and also Black Flamingo in Berlin with Catheri...
Name: Tweet, dtype: object

In [7]:
import re

### We will now look at how many tweets in the spam dataset contain URLs and mentions ###

In [8]:
# Count the spam tweets that contain the substring "http", used here as a
# proxy for "contains a URL".
# A vectorised, literal (regex=False) substring match replaces the Python-level
# loop, whose bare `except:` would also have hidden unrelated errors.
# `na=False` makes any non-string entry count as "no match" instead of raising.
urls = int(spam.str.contains("http", regex=False, na=False).sum())

print(urls)

1620227


In [9]:
# Percentage of spam tweets in which "http" was found.
spam_url_share = urls/len(spam)*100
print("Around {}% of spam tweets have URLs in them".format(spam_url_share))

Around 68.84423327616946% of spam tweets have URLs in them


In [10]:
# Count the spam tweets that contain "@", used here as a proxy for "mentions
# another account".
# A vectorised, literal (regex=False) substring match replaces the Python-level
# loop, whose bare `except:` would also have hidden unrelated errors.
# `na=False` makes any non-string entry count as "no match" instead of raising.
mentions = int(spam.str.contains("@", regex=False, na=False).sum())

print(mentions)
print("Around {}% of spam tweets have mentions in them".format(mentions/len(spam)*100))

411401
Around 17.48062858725931% of spam tweets have mentions in them


## *Lexical Richness* ##

### First we will try to tokenize the tweets and perform basic cleaning on them ##

In [11]:
from nltk.stem import WordNetLemmatizer 
import nltk
lm = WordNetLemmatizer()
import re

# converts symbols from nltk format to wordnet format.
# Taken from stackoverflow https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a POS tag produced by the NLTK tagger to a WordNet POS value.

    The decision is made on the first letter of ``treebank_tag``:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Every other tag (including an empty string) falls back to the literal
    ``'n'``.
    """
    # First letter of the tag -> name of the attribute to read from `wordnet`.
    # The attribute is only looked up once a prefix has actually matched.
    attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_prefix.get(treebank_tag[:1])
    if attr_name is None:
        return 'n'
    return getattr(wordnet, attr_name)

In [12]:
# Strip URL-like text from the first spam tweet. The pattern accepts an
# optional "http://" prefix, then a host-like run that contains a dot, then any
# trailing path/query characters.
re.sub(r'(http://)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '',spam[0])

'MELBOURNE ENQUIRY: Seeking a variety of acts for our end of year show. Payment is $120 per slot or $200 for 2.... '

In [13]:
# Same pattern on a bare host name: the "http://" prefix is optional, so the
# whole string is matched and removed.
re.sub(r'(http://)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '',"www.google.com")

''

In [14]:
# Delete a hand-picked set of punctuation characters and all digits from the
# first spam tweet. The URL is not stripped beforehand in this cell, so its
# remaining letters end up run together as a single token.
re.sub(r'[\.\\/\(\),\-!@#$%^&*~`:0-9\?]', '', spam[0])

'MELBOURNE ENQUIRY Seeking a variety of acts for our end of year show Payment is  per slot or  for  httpbitlyAhfF'

In [15]:
# URL-like text: an optional http:// or https:// scheme, a host-like run that
# contains a dot, then any trailing path/query characters. The scheme used to
# be `(http://)?` only, which left a stray "https" token behind for every
# https link.
_URL_RE = re.compile(
    r'(https?://)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
)

# Punctuation and digits removed from a tweet. The hyphen is escaped and every
# character is listed explicitly; the previous class contained `+=-\[`, which
# the regex engine reads as the range "=" .. "[" (it also covered ">", "?",
# "@" and A-Z). On the lower-cased text this class removes the same characters.
_SYMBOLS_RE = re.compile(r'[.\\/(),\-!@#$%^&*~`:0-9?+=\[\]{};\'"<>]')


def clean(tweet):
    """Normalise one tweet into a list of lemmatised tokens.

    Steps, in order: lower-case the text, remove URL-like substrings, remove
    the punctuation/digit characters in ``_SYMBOLS_RE``, split on whitespace,
    POS-tag the tokens with ``nltk.pos_tag`` and lemmatise each token with the
    POS value returned by ``get_wordnet_pos``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The lemmatised tokens; empty when nothing is left after cleaning.
    """
    text = tweet.lower()
    text = _URL_RE.sub('', text)
    text = _SYMBOLS_RE.sub('', text)
    # pos_tag yields (token, tag) pairs; the tag selects the lemmatiser's POS.
    tagged = nltk.pos_tag(text.split())
    return [lm.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]

In [16]:
# Distinct cleaned tokens seen across the first 1000 spam tweets.
spam_vocab = {token for raw_tweet in spam[:1000] for token in clean(raw_tweet)}

In [17]:
len(spam_vocab)

2499

In [18]:
# Type-token ratio (unique tokens / total tokens) for each of the first 1000
# spam tweets.
ttr = []
for raw_tweet in spam[:1000]:
    tokens = clean(raw_tweet)
    if not tokens:
        # Nothing is left after cleaning, so the ratio is undefined. Report the
        # empty token list and skip the tweet explicitly; the previous bare
        # `except:` did this by catching the division error, and would have
        # hidden any other failure as well.
        print(tokens)
        continue
    ttr.append(len(set(tokens)) / len(tokens))

[]
[]
[]


In [19]:
ttr[:10]

[0.8947368421052632,
 1.0,
 1.0,
 1.0,
 0.8571428571428571,
 1.0,
 0.6,
 1.0,
 0.9333333333333333,
 0.9230769230769231]

In [24]:
np.average(ttr)

0.954053551603292

### Now let's try the same on not-spam tweets ###

In [25]:
# Load the tweets that this notebook treats as not-spam ("ham").
# NOTE(review): the file is named 'spam_tweets.csv' yet is used as the ham
# set - confirm the right CSV is loaded.
ham = pd.read_csv('spam_tweets.csv')

In [27]:
# Discard every row that has a missing value. The name is rebound to the
# filtered frame instead of mutating the loaded frame in place.
ham = ham.dropna()

In [28]:
# Keep only the tweet text column of the ham frame; the cells below iterate
# over this Series.
tweets = ham['Tweet']

In [29]:
# Count the ham tweets that contain the substring "http", used here as a proxy
# for "contains a URL".
# A vectorised, literal (regex=False) substring match replaces the Python-level
# loop, whose bare `except:` would also have hidden unrelated errors.
# `na=False` makes any non-string entry count as "no match" instead of raising.
urls = int(tweets.str.contains("http", regex=False, na=False).sum())

print(urls)

673166


In [30]:
# Percentage of ham tweets in which "http" was found.
ham_url_share = urls/len(tweets)*100
print("Around {}% of ham tweets have URLs in them".format(ham_url_share))

Around 20.651221235387034% of ham tweets have URLs in them


In [31]:
# This is a big drop compared to almost 70% in spam tweets.

In [32]:
# Count the ham tweets that contain "@", used here as a proxy for "mentions
# another account".
# A vectorised, literal (regex=False) substring match replaces the Python-level
# loop, whose bare `except:` would also have hidden unrelated errors.
# `na=False` makes any non-string entry count as "no match" instead of raising.
mentions = int(tweets.str.contains("@", regex=False, na=False).sum())

print(mentions)
print("Around {}% of ham tweets have mentions in them".format(mentions/len(tweets)*100))

1489738
Around 45.70181652187278% of ham tweets have mentions in them


In [33]:
# It appears that spam tweets actually have fewer mentions than non-spam tweets.

In [34]:
# Type-token ratio (unique tokens / total tokens) for each of the first 1000
# ham tweets.
ttr = []
for raw_tweet in tweets[:1000]:
    tokens = clean(raw_tweet)
    if not tokens:
        # Nothing is left after cleaning, so the ratio is undefined. Report the
        # empty token list and skip the tweet explicitly; the previous bare
        # `except:` did this by catching the division error, and would have
        # hidden any other failure as well.
        print(tokens)
        continue
    ttr.append(len(set(tokens)) / len(tokens))

[]


In [35]:
np.average(ttr)

0.9664079833392205

In [36]:
# The average TTR seems to be only slightly higher in ham than in spam.

In [38]:
# Distinct cleaned tokens seen across the first 1000 ham tweets.
ham_vocab = {token for raw_tweet in tweets[:1000] for token in clean(raw_tweet)}

print(len(ham_vocab))

3725


### The ham vocabulary itself seems to be much richer than the spam vocabulary ###

In [39]:
# Relative size difference between the two vocabularies, derived from the sets
# built above rather than from hard-coded counts, so the figure stays correct
# when the sample size or the cleaning step changes.
spam_vocab_size = len(spam_vocab)
ham_vocab_size = len(ham_vocab)
print("Ham vocab is {}% larger than spam vocab".format((ham_vocab_size - spam_vocab_size) / spam_vocab_size * 100))

Ham vocab is 49.05962384953982% larger than spam vocab


### Next we can try to see the diversity and similarity between the words used in the two vocabs using a pretrained word2vec model ### 