### Import data from text files

In [1]:
import os
import glob
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import emoji
from collections import Counter, defaultdict

In [2]:
import os
import glob

def load_data(base_path):
    data = []
    labels = []
    os.chdir(base_path)
    
    for sentiment in ['positive', 'negative']:
        os.chdir(sentiment)
        files = glob.glob('*.txt')
        for file_name in files:
            with open(file_name, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                data.append(content)
                labels.append(1 if sentiment == 'positive' else 0)
        
        os.chdir('..')

    os.chdir('..')
    return data, labels

base_path = '.'
train_data, train_labels = load_data(base_path + '/train')
test_data, test_labels = load_data(base_path + '/test')


In [3]:
print(train_data[:5])
print(test_data[:5])

['@SouthwestAir I would appreciate that.  Thank you.', '@USAirways thank you very much.', "@JetBlue I'm all set. About to fly. Not bad for a first date with a giant metal bird machine. She even brought snacks.", '@SouthwestAir I got a flight at 11:55am on Thursday but looking for something tomorrow anything available?', "@AmericanAir you're my early frontrunner for best airline! #oscars2016"]
['@united maybe on my return trip 👍', "@AmericanAir no kidding! Gonna take some beating on the apron... And there are some good lookin' planes out there!", '@AmericanAir thanks', '@AmericanAir many trips coming up!  I will see you soon 😃', '@JetBlue Thank you guys! Brilliant customer service']


### Clean data

#### Lowercase capitals at the beginning of words

In [4]:
def lowercase_caps(text):

    def lower_first_letter(match):
        return match.group(1) + match.group(2).lower() + match.group(3)

    pattern = r'(\A|\.\s+|\?\s+|!\s+)([A-Z])(\w*)'
    result = re.sub(pattern, lower_first_letter, text)
    
    return result

In [5]:
train_lower = [lowercase_caps(tweet) for tweet in train_data]
test_lower = [lowercase_caps(tweet) for tweet in test_data]

#### Tokenize, build vocabulary

In [6]:
def demojize_text(text):
    return emoji.demojize(text)

train_lower_demoj = [demojize_text(tweet) for tweet in train_lower]
test_lower_demoj = [demojize_text(tweet) for tweet in test_lower]

#### Create 2 versions of V: with stemming and without stemming

In [7]:
def tokenize(text):
    return word_tokenize(text)

def stem_tokens(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]

def build_vocabulary(texts, use_stemming=False):
    """ Build vocabulary with optional stemming """
    stemmer = PorterStemmer()
    vocabulary = Counter()

    for text in texts:
        tokens = tokenize(text)
        if use_stemming:
            tokens = stem_tokens(tokens, stemmer)
        vocabulary.update(tokens)

    return list(vocabulary.keys())

In [8]:
v_stemming = build_vocabulary(train_lower_demoj, use_stemming=False)
v_no_stemming = build_vocabulary(train_lower_demoj, use_stemming=True)

print("V without Stemming:", v_stemming[:100])
print("V with Stemming:", v_no_stemming[:100])

V without Stemming: ['@', 'SouthwestAir', 'I', 'would', 'appreciate', 'that', '.', 'thank', 'you', 'USAirways', 'very', 'much', 'JetBlue', "'m", 'all', 'set', 'about', 'to', 'fly', 'not', 'bad', 'for', 'a', 'first', 'date', 'with', 'giant', 'metal', 'bird', 'machine', 'she', 'even', 'brought', 'snacks', 'got', 'flight', 'at', '11:55am', 'on', 'Thursday', 'but', 'looking', 'something', 'tomorrow', 'anything', 'available', '?', 'AmericanAir', "'re", 'my', 'early', 'frontrunner', 'best', 'airline', '!', '#', 'oscars2016', 'RedCarpet', 'Southwest', 'Companion', 'Pass', 'be', 'great', 'major', 'issues', 'getting', 'out', 'of', 'Boston', 'your', 'crew', 'has', 'been', 'exceptional', 'let', "'s", 'see', 'how', 'things', 'roll', 'in', 'Philly', 'thanks', 'i', 'prompt', 'response', 'united', 'such', 'relaxing', 'space', 'drink', 'before', '(', 'United', 'Global', 'First', 'Lounge', ')', 'https', ':']
V with Stemming: ['@', 'southwestair', 'i', 'would', 'appreci', 'that', '.', 'thank', 'you', 'u

In [9]:
import pandas as pd

In [10]:
# dictionary of lists 
dictionary = {'text': train_data, 'label': train_labels} 
   
df = pd.DataFrame(dictionary)

df.head()

Unnamed: 0,text,label
0,@SouthwestAir I would appreciate that. Thank ...,1
1,@USAirways thank you very much.,1
2,@JetBlue I'm all set. About to fly. Not bad fo...,1
3,@SouthwestAir I got a flight at 11:55am on Thu...,1
4,@AmericanAir you're my early frontrunner for b...,1


In [11]:
positive_words_stemmed = []
positive_words_nostem = []

#list of all words where label = 1
for index, row in df[df['label'] == 1].iterrows():
    lower_tweet = lowercase_caps(row['text'])
    demoj_tweet = demojize_text(lower_tweet)
    tokenized_tweet = word_tokenize(demoj_tweet)
    
    positive_words_stemmed.extend(build_vocabulary(tokenized_tweet, use_stemming=True))
    positive_words_nostem.extend(build_vocabulary(tokenized_tweet, use_stemming=False))

In [12]:
negative_words_stemmed = []
negative_words_nostem = []

#list of all words where label= 0
for index, row in df[df['label'] == 0].iterrows():
    lower_tweet = lowercase_caps(row['text'])
    demoj_tweet = demojize_text(lower_tweet)
    tokenized_tweet = word_tokenize(demoj_tweet)
    
    negative_words_stemmed.extend(build_vocabulary(tokenized_tweet, use_stemming=True))
    negative_words_nostem.extend(build_vocabulary(tokenized_tweet, use_stemming=False))

In [13]:
#using the stemmed words for now
def create_combined_dict(pos_words, neg_words):
    positive_words= set(pos_words)
    remove_list =[]
    negative_words= set(neg_words)

    #Removal of Links in word sets
    for word in positive_words:
        if "//t.co" in word:
            remove_list.append(word)

    for word in remove_list:
        positive_words.remove(word)
    remove_list =[]
    for word in negative_words:
        if "//t.co" in word:
            remove_list.append(word)

    for word in remove_list:
        negative_words.remove(word)

    #create the dictionary
    positive_words_dict = {word: 1 for word in positive_words}
    negative_words_dict = {word: 0 for word in negative_words}

    #combine both dictionaries
    combined_dict = {**positive_words_dict, **negative_words_dict}
    return combined_dict

combined_dict_stemmed = create_combined_dict(positive_words_stemmed, negative_words_stemmed)
combined_dict_nostem = create_combined_dict(positive_words_nostem, negative_words_nostem)
# print(combined_dict)
#print(positive_words_nostem)

In [14]:
#Count Vectorizer
#Supply data set of just tweets, pass in dataframe
def CountVectorizer(data, use_stemming=False):

    #Incase of stem usage
    stemmer = PorterStemmer()
    # Create a dictionary to store counts
    counts = {1: defaultdict(int), 0: defaultdict(int)}

    # Process the text
    for index, row in data.iterrows():
        lower_tweet = lowercase_caps(row['text'])
        demoj_tweet = demojize_text(lower_tweet)
        tokenized_tweet = word_tokenize(demoj_tweet)
        if use_stemming:
            tokenized_tweet = stem_tokens(tokenized_tweet, stemmer)
    
        # Update counts for the respective label
        for word in tokenized_tweet:
            #Ignores links
            if "//t.co" not in word:
                counts[row['label']][word] += 1
    return counts

MultiCountsStem = CountVectorizer(df, use_stemming=True)
MultiCountsNoStem = CountVectorizer(df, use_stemming=False)
# Display the counts
#print("Positive counts:", dict(Multicountsstem[1]))
#print("Positive counts:", dict(Multicountsnostem[1]))

### Create Binary Vectorizor

In [15]:
def BinaryVectorizer(data, labels, use_stemming=False):
    #Incase of stem usage
    stemmer = PorterStemmer()
    # initialize dictionaries (pos and neg tweets)
    counts = {
        1: defaultdict(int),
        0: defaultdict(int) 
    }
    
        # clean and tokenize each tweet
    for text, label in zip(data, labels):

        text = lowercase_caps(text)
        text = demojize_text(text)
        tokens = word_tokenize(text)
        if use_stemming:
            tokens = stem_tokens(tokens, stemmer)

        # use a set to track unique words
        unique_words = set(tokens)
        for word in unique_words:
            #Ignores links
            if "//t.co" not in word:
                counts[label][word] += 1

    return counts

BinaryCountsStem = BinaryVectorizer(train_data, train_labels, use_stemming=True)
BinaryCountsNoStem = BinaryVectorizer(train_data, train_labels, use_stemming=False)
#print(BinaryCountsStem)

In [16]:
#used to calculate the priors
num_positive_tweets= len(df[df['label'] == 1])
num_negative_tweets= len(df[df['label'] == 0])
total_train= len(train_data)

print(f"Number of Positive Tweets: {num_positive_tweets}")
print(f"Number of Negative Tweets: {num_negative_tweets}")
print(f"Total Number of Tweets: {total_train}")

Number of Positive Tweets: 1181
Number of Negative Tweets: 3000
Total Number of Tweets: 4181


In [17]:
def get_likelihoods(vector, num_negative_words, num_positive_words, total_vocab):
    #iteerate over each row in the dataframe
    likelihoods = {
    0: defaultdict(int),  # Likelihoods for word to show up in negative tweets
    1: defaultdict(int)   # Likelihoods for word to show up in positive tweets
}
    #get probabilities for words in corpus being in a negative tweet
    for word in dict(vector[0]):
        likelihoods[0][word] = vector[0][word]/(num_negative_words + total_vocab)
    for word in dict(vector[1]):
        likelihoods[1][word] = vector[1][word]/(num_positive_words + total_vocab)
    return likelihoods

In [18]:
#Creating Variables for Likelihood usage
#ms = Count/Multi + Stem, mn = Count/Multi + No Stem
#bs = Binary + Stem, bn = Binary + No Stem
num_negative_words_ms= len(dict(MultiCountsStem[0]))
num_positive_words_ms= len(dict(MultiCountsStem[1]))
num_negative_words_mn= len(dict(MultiCountsNoStem[0]))
num_positive_words_mn= len(dict(MultiCountsNoStem[1]))
num_negative_words_bs= len(dict(BinaryCountsStem[0]))
num_positive_words_bs= len(dict(BinaryCountsStem[1]))
num_negative_words_bn= len(dict(BinaryCountsNoStem[0]))
num_positive_words_bn= len(dict(BinaryCountsNoStem[1]))
total_vocab_stemmed= len(combined_dict_stemmed)
total_vocab_nostem= len(combined_dict_nostem)

likelihoodMS = get_likelihoods(MultiCountsStem, num_negative_words_ms, num_positive_words_ms, total_vocab_stemmed)
likelihoodMN = get_likelihoods(MultiCountsNoStem, num_negative_words_mn, num_positive_words_mn, total_vocab_nostem)
likelihoodBS = get_likelihoods(BinaryCountsStem, num_negative_words_bs, num_positive_words_bs, total_vocab_stemmed)
likelihoodBN = get_likelihoods(BinaryCountsNoStem, num_negative_words_bn, num_positive_words_bn, total_vocab_nostem)

In [31]:
#print positive likelihoods
print("Likelihoods for positive class:", dict(likelihoodMS[1]))

Likelihoods for positive class: {'@': 0.15187867813490266, 'southwestair': 0.03168854685377999, 'i': 0.043232231779085556, 'would': 0.0033952014486192846, 'appreci': 0.005319148936170213, 'that': 0.010638297872340425, '.': 0.11238116794929832, 'thank': 0.0631507469443187, 'you': 0.05160706201901313, 'usairway': 0.014599366229062924, 'veri': 0.003961068356722499, 'much': 0.007582616568583069, 'jetblu': 0.03361249434133092, "'m": 0.0029425079221367134, 'all': 0.007016749660479855, 'set': 0.0004526935264825713, 'about': 0.0028293345405160705, 'to': 0.05477591670439113, 'fli': 0.0069035762788592124, 'not': 0.005319148936170213, 'bad': 0.001244907197827071, 'for': 0.0377999094612947, 'a': 0.028519692168401993, 'first': 0.002716161158895428, 'date': 0.000679040289723857, 'with': 0.010638297872340425, 'giant': 0.00022634676324128565, 'metal': 0.00011317338162064282, 'bird': 0.0003395201448619285, 'machin': 0.00011317338162064282, 'she': 0.0022634676324128564, 'even': 0.002037120869171571, 'br

In [28]:
#Print negative likelihoods
#print("Likelihoods for positive class:", dict(likelihoodMS[0]))
print (total_vocab_nostem)

8440
