#Part I: Load Data

In [1]:
import pandas as pd
import numpy as np
# Madhu


In [2]:
#from google.colab import drive
#drive.mount('/content/drive/')

#Part 2: Initial Data Exploration

In [3]:
df_raw = pd.read_csv('sentiment140.csv', encoding = "ISO-8859-1", header=None)
df_raw.columns = ["label", "time", "date", "query", "username", "text"]
df_raw.head()

Unnamed: 0,label,time,date,query,username,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Checking the data's output balance
# The label '4' denotes positive sentiment and '0' denotes negative sentiment
print(df_raw['label'].value_counts())

4    800000
0    800000
Name: label, dtype: int64


In [5]:
# We then change all the values of 4 to 1
df_raw['label'] = (df_raw['label'] == 4).astype(int)
df_raw['label'].value_counts()

1    800000
0    800000
Name: label, dtype: int64

In [6]:
# Ommiting every column except for the text and the label, as we won't need any of the other information
df = df_raw[['label', 'text']]
df.head()
# Seperating positive and negative rows
df_pos = df[df['label'] == 1]
df_neg = df[df['label'] == 0]
print(len(df_pos), len(df_neg))

800000 800000


##Print examples of positive and negative tweet

In [7]:
print('POSITIVE:\n')
for i in range(5):
    print(str(i) + ':  ' + df_pos['text'].iloc[i])

POSITIVE:

0:  I LOVE @Health4UandPets u guys r the best!! 
1:  im meeting up with one of my besties tonight! Cant wait!!  - GIRL TALK!!
2:  @DaRealSunisaKim Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart. 
3:  Being sick can be really cheap when it hurts too much to eat real food  Plus, your friends make you soup
4:  @LovesBrooklyn2 he has that effect on everyone 


In [8]:
print('NEGATIVE:\n')
for i in range(5):
    print(str(i) + ':  ' + df_neg['text'].iloc[i])

NEGATIVE:

0:  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
1:  is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
2:  @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
3:  my whole body feels itchy and like its on fire 
4:  @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 


In [9]:
# Add UCI dataset to sentiment140 dataset

text = []
label = []

f = open('imdb_labelled.txt', 'r')
for line in f:
    if line.find('  \t') != -1:
        label.append(line[-2])
        index = line.find('  \t')
        text.append(line[:index])
    elif line.find('   ') != -1:
        label.append(line[-2])
        index = line.find('   ')
        text.append(line[:index])
f.close()

f = open('yelp_labelled.txt', 'r')
for line in f:
    if line.find('\t') != -1:
        label.append(line[-2])
        index = line.find('\t')
        text.append(line[:index])
f.close()

f = open('amazon_cells_labelled.txt', 'r')
for line in f:
    if line.find('\t') != -1:
        label.append(line[-2])
        index = line.find('\t')
        text.append(line[:index])
f.close()

df_UCI = pd.DataFrame({"label":label, "text":text})
df.append(df_UCI, ignore_index = True) 


Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1602995,0,The screen does get smudged easily because it ...
1602996,0,What a piece of junk.. I lose more calls on th...
1602997,0,Item Does Not Match Picture.
1602998,0,The only thing that disappoint me is the infra...


#Part 3: Data Cleaning and Data Processing


## Tokenization

To clean the data, we will first use nltk.tokenize.casual module, which is a twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
1. The tuple regex_strings defines a list of regular expression strings.

2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of the class Tokenizer.

4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it is set to False, then the tokenizer will downcase everything except for emoticons.

In [10]:
from time import time
import random
from nltk.tokenize import TweetTokenizer

start_time = time()
# The redule_len parameter will allow a maximum of 3 consecutive repeating characters, while trimming the rest
# For example, it will transform the word: 'Helloooooooooo' to: 'Hellooo'
tk = TweetTokenizer(reduce_len=True)

data = []

# Separating our features (text) and our labels into two lists to smoothen our work
X = df['text'].tolist()
Y = df['label'].tolist()

#Building our data list, that is a list of tuples, where each tuple is a pair of the tokenized text and its corresponding label
for x, y in zip(X, Y):
    if y == 1:
        data.append((tk.tokenize(x), 1))
    else:
        data.append((tk.tokenize(x), 0))
        
# Printing the CPU time and the first 5 elements of our 'data' list
print('CPU Time:', time() - start_time)
data[:5]

CPU Time: 101.72077703475952


[(['@switchfoot',
   'http://twitpic.com/2y1zl',
   '-',
   'Awww',
   ',',
   "that's",
   'a',
   'bummer',
   '.',
   'You',
   'shoulda',
   'got',
   'David',
   'Carr',
   'of',
   'Third',
   'Day',
   'to',
   'do',
   'it',
   '.',
   ';D'],
  0),
 (['is',
   'upset',
   'that',
   'he',
   "can't",
   'update',
   'his',
   'Facebook',
   'by',
   'texting',
   'it',
   '...',
   'and',
   'might',
   'cry',
   'as',
   'a',
   'result',
   'School',
   'today',
   'also',
   '.',
   'Blah',
   '!'],
  0),
 (['@Kenichan',
   'I',
   'dived',
   'many',
   'times',
   'for',
   'the',
   'ball',
   '.',
   'Managed',
   'to',
   'save',
   '50',
   '%',
   'The',
   'rest',
   'go',
   'out',
   'of',
   'bounds'],
  0),
 (['my',
   'whole',
   'body',
   'feels',
   'itchy',
   'and',
   'like',
   'its',
   'on',
   'fire'],
  0),
 (['@nationwideclass',
   'no',
   ',',
   "it's",
   'not',
   'behaving',
   'at',
   'all',
   '.',
   "i'm",
   'mad',
   '.',
   'why',
   'a

##Lemmatization

We will use nltk's WordNetLemmatizer to lemmatize our dataset.

In [14]:
import nltk
#nltk.download()
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re, string

STOP_WORDS = stopwords.words('english')

def clean_data(data):
    cleaned_tokens = []

    for token, label in pos_tag(data):
        #remove links
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        # remove @
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        #lemmatize
        if label.startswith("NN"):
            pos = 'n'
        elif label.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
    
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        cleaned_token = token.lower()
        if cleaned_token not in string.punctuation and len(cleaned_token) > 2 and cleaned_token not in STOP_WORDS:
            cleaned_tokens.append(cleaned_token)
    return cleaned_tokens


# Baseline Method 1: Bayesian Classifier


In [None]:
def list_to_dict(cleaned_tokens):
    return dict([token, True] for token in cleaned_tokens)

cleaned_tokens_list = []

for tokens, label in data:
    cleaned_tokens_list.append((clean_data(tokens), label))

print('Cleaned Data, CPU Time:', time() - start_time)
start_time = time()
final_data = []
for tokens, label in cleaned_tokens_list:
    final_data.append((list_to_dict(tokens), label))

final_data[:5]

In [None]:
import random
from nltk import classify
from nltk import NaiveBayesClassifier

random.Random(140).shuffle(final_data)

trim_index = int(len(final_data) * 0.9)

train_data = final_data[:trim_index]
test_data = final_data[trim_index:]

start_time = time()

classifier = NaiveBayesClassifier.train(train_data)

# Output the model accuracy on the train and test data
print('Accuracy on train data:', classify.accuracy(classifier, train_data))
print('Accuracy on test data:', classify.accuracy(classifier, test_data))

# Output the words that provide the most information about the sentiment of a tweet.
# These are words that are heavily present in one sentiment group and very rarely present in the other group.
print(classifier.show_most_informative_features(25))

print('\nCPU Time:', time() - start_time)



In [None]:
custom_tweet = "bad"

custom_tokens = clean_data(tk.tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

# SVM using TF-IDF library

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf = True)

X = df['text']
y = df['label']
a = []
b = []
for i in range(1000):
    a.append(X[i])
for i in range(1000):
    a.append(X[len(X) - 1 - i])
for i in range(1000):
    b.append(y[i])
for i in range(1000):
    b.append(y[len(y) - 1 - i])

X_train, X_test, y_train, y_test = train_test_split(a, b, test_size = 0.20)

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)

print(svclassifier.score(X_test, y_test))



0.705


# SVM using ngrams

# Our Implementation of SVM with Kernel Pegasos

In [40]:
import numpy as np
from collections import defaultdict, Counter
from functools import wraps
from tqdm import tqdm

def cache_decorator():
    """
    Cache decorator. Stores elements to avoid repeated computations.
    For more details see: https://stackoverflow.com/questions/36684319/decorator-for-a-class-method-that-caches-return-value-after-first-access
    """
    def wrapper(function):
        """
        Return element if in cache. Otherwise compute and store.
        """
        cache = {}

        @wraps(function)
        def element(*args):
            if args in cache:
                result = cache[args]
            else:
                result = function(*args)
                cache[args] = result
            return result

        def clear():
            """
            Clear cache.
            """
            cache.clear()

        # Clear the cache
        element.clear = clear
        return element
    return wrapper

class Kernel(object):
    def evaluate(self, s, t):
        raise NotImplementedError()

    def compute_kernel_matrix(self, *, X, X_prime=None):
        X_prime = X if not X_prime else X_prime
        kernel_matrix = np.zeros((len(X), len(X_prime)), dtype=np.float32)

        for row in range(len(X)):
            for col in range(len(X_prime)):
                kernel_matrix[row][col] = self.evaluate(X[row],X_prime[col])
        return kernel_matrix


class NgramKernel(Kernel):
    def __init__(self, *, ngram_length):
        self.ngram_length = ngram_length


    def generate_ngrams(self, doc):
        ngrams = set()
        for i in range(len(doc) - self.ngram_length + 1):
            ngrams.add(doc[i : i + self.ngram_length])
        return ngrams
    
    @cache_decorator()
    def evaluate(self, s, t):
        s_ngrams = self.generate_ngrams(s)
        t_ngrams = self.generate_ngrams(t)
        x = len(s_ngrams.intersection(t_ngrams))
        y = len(s_ngrams.union(t_ngrams))
        if y == 0:
            return 1
        else:
            return x / y


class TFIDFKernel(Kernel):
    def __init__(self, *, X, X_prime=None):
        self.tfidf = self.compute_tfidf(X, X_prime)
        

    def compute_tf(self, doc):
        words = doc.split()
        tf = Counter(words)
        for key in tf.keys():
            tf[key] = tf[key]/len(words)
        return tf

    def compute_df(self, X, vocab):
        df = defaultdict(int)
        for word in vocab:
            num_docs = 0
            for doc in X:
                if word in doc:
                    num_docs += 1
            df[word] = num_docs
        return df


    def compute_tfidf(self, X, X_prime):
        if X_prime:
            X = X + X_prime

        tf_idf = defaultdict(float)
        words = []
        N = len(X)

        for doc in X:
            words.extend(doc.split())

        vocab = set(words)
        df = self.compute_df(X, vocab)

        for doc in X:
            tf = self.compute_tf(doc)
            for word in tf.keys():
                tf_idf[(doc, word)] = tf[word] * np.log(len(X) / (df[word] + 1))

        return tf_idf

    @cache_decorator()
    def evaluate(self, s, t):
        k = 0.0
        
        words = s.split()
        s_unique_words = set(words)
        t_words = t.split()
        tf = self.compute_tf(t)

        for word in s_unique_words:
            if word in t_words:
                freq = tf[word]
                k += freq * self.tfidf[(s, word)]

        return k

In [41]:
import numpy as np

class Model(object):
    def __init__(self, lmbda):
        self.lmbda = lmbda

    def fit(self, *, X, y, kernel_matrix):
        raise NotImplementedError()

    def predict(self, X, kernel_matrix):
        raise NotImplementedError()


class KernelPegasos(Model):

    def __init__(self, *, nexamples, lmbda):
        super().__init__(lmbda=lmbda)
        self.b = np.zeros(nexamples, dtype=int)
        self.t = 1
        self.support_vectors = None
        self.labels_corresp_to_svs = None
        self.sv_indices = None

    def fit(self, *, X, y, kernel_matrix):
        self.support_vectors = []
        self.labels_corresp_to_svs = []
        self.sv_indices = []
        for j in range(len(X)):
            self.t += 1
            s = 0
            for i in range(len(X)):
                if y[j] == 0:
                    y[j] = -1
                s += self.b[i] * y[i] * kernel_matrix[i, j]
            if y[j] / (self.lmbda * (self.t - 1)) * s < 1:
                self.b[j] += 1

        for i in range(len(self.b)):
            if self.b[i] > 0:
                self.support_vectors.append(X[i])
                self.labels_corresp_to_svs.append(y[i])
                self.sv_indices.append(self.b[i])

        
    def predict(self, *, X, kernel_matrix):
        result = np.zeros(len(X), dtype=int)
        for j in range(len(X)):
            s = 0
            for i in range(len(self.sv_indices)):
                alpha = 1 / (self.lmbda * self.t) * self.sv_indices[i]
                s += alpha * self.labels_corresp_to_svs[i] * kernel_matrix[i][j]
            if s > 0:
                result[j] = 1
        return result
    
    def predict(self, *, X, kernel_matrix):
        result = np.zeros(len(X), dtype=int)
        for j in range(len(X)):
            s = 0
            for i in range(len(self.sv_indices)):
                alpha = 1 / (self.lmbda * self.t) * self.sv_indices[i]
                s += alpha * self.labels_corresp_to_svs[i] * kernel_matrix[i][j]
            if s > 0:
                result[j] = 1
        return result


In [42]:
def train(X, y, kernel_type, ngram_length, train_epochs):
    model = KernelPegasos(nexamples=len(X), lmbda=1e-3)
    if kernel_type == 'ngram':
        kernel = NgramKernel(ngram_length=ngram_length)
    elif kernel_type == 'tfidf':
        kernel = TFIDFKernel(X=X)
    kernel_matrix = kernel.compute_kernel_matrix(X=X)

    for epoch in range(train_epochs):
        model.fit(X=X, y=y, kernel_matrix=kernel_matrix)

    return model

def test(model, X, kernel_type, ngram_length, train_epochs):
    if kernel_type == 'ngram':
        kernel = NgramKernel(ngram_length=ngram_length)
    elif kernel_type == 'tfidf':
        kernel = TFIDFKernel(X=model.support_vectors, X_prime=X)
    kernel_matrix = kernel.compute_kernel_matrix(X=model.support_vectors, X_prime=X)

    preds = model.predict(X=X, kernel_matrix=kernel_matrix)
    
    return preds

In [43]:
# run our implementation of Kernel Pegasos SVM
X = df['text']
y = df['label']
a = []
b = []
for i in range(1000):
    a.append(X[i])
for i in range(1000):
    a.append(X[len(X) - 1 - i])
for i in range(1000):
    b.append(y[i])
for i in range(1000):
    b.append(y[len(y) - 1 - i])

X_train, X_test, y_train, y_test = train_test_split(a, b, test_size = 0.20)

model = train(X_train, y_train, "ngram", 3, 5)
result = test(model, X_test, "ngram", 3, 5)
same = 0
for i in range(len(result)):
    if result[i] == y_test[i]:
        same += 1

print(same / len(y_test))

0.715
