### Import data from text files

In [1]:
import os
import glob
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import emoji
from collections import Counter, defaultdict

In [2]:
import os
import glob

def load_data(base_path):
    """Load labelled tweets from the sentiment sub-folders of ``base_path``.

    Every ``*.txt`` file in ``<base_path>/positive`` and
    ``<base_path>/negative`` is read as UTF-8 and stripped of surrounding
    whitespace.

    Parameters
    ----------
    base_path : str
        Directory that contains the ``positive`` and ``negative`` folders.
        May be relative or absolute, with any number of path components.

    Returns
    -------
    tuple[list[str], list[int]]
        The tweet texts and a parallel list of labels (1 for positive,
        0 for negative). All positive tweets come before the negative ones.

    Raises
    ------
    FileNotFoundError
        If one of the two sentiment folders does not exist.

    Notes
    -----
    Paths are joined explicitly instead of walking the tree with
    ``os.chdir``. The process working directory is therefore never modified,
    not even when reading a file fails half-way through.
    """
    data = []
    labels = []

    for sentiment in ['positive', 'negative']:
        folder = os.path.join(base_path, sentiment)
        if not os.path.isdir(folder):
            # Fail loudly rather than silently returning an empty class.
            raise FileNotFoundError(f"Sentiment folder not found: {folder}")

        label = 1 if sentiment == 'positive' else 0
        # Escape the folder part so characters such as '[' in the path are
        # not interpreted as glob wildcards.
        pattern = os.path.join(glob.escape(folder), '*.txt')
        for file_name in glob.glob(pattern):
            with open(file_name, 'r', encoding='utf-8') as file:
                content = file.read().strip()
            data.append(content)
            labels.append(label)

    return data, labels

# Folder holding the `train/` and `test/` splits, relative to the notebook.
base_path = '.'
train_data, train_labels = load_data(f'{base_path}/train')
test_data, test_labels = load_data(f'{base_path}/test')


In [3]:
# Sanity check: show the first five tweets of the train and test splits.
for split in (train_data, test_data):
    print(split[:5])

['@SouthwestAir I would appreciate that.  Thank you.', '@USAirways thank you very much.', "@JetBlue I'm all set. About to fly. Not bad for a first date with a giant metal bird machine. She even brought snacks.", '@SouthwestAir I got a flight at 11:55am on Thursday but looking for something tomorrow anything available?', "@AmericanAir you're my early frontrunner for best airline! #oscars2016"]
['@united maybe on my return trip 👍', "@AmericanAir no kidding! Gonna take some beating on the apron... And there are some good lookin' planes out there!", '@AmericanAir thanks', '@AmericanAir many trips coming up!  I will see you soon 😃', '@JetBlue Thank you guys! Brilliant customer service']


### Clean data

#### Lowercase capitals at the beginning of sentences

In [4]:
def lowercase_caps(text):
    """Lowercase the capital letter that opens each sentence of ``text``.

    A capital letter (A-Z) counts as sentence-initial when it is the very
    first character of the string, or when it directly follows ``.``, ``?``
    or ``!`` plus at least one whitespace character. Capitals anywhere else
    are left untouched. Returns the rewritten string.
    """
    sentence_start = re.compile(r'(\A|\.\s+|\?\s+|!\s+)([A-Z])(\w*)')

    def _decapitalize(found):
        # Keep the sentence boundary and the rest of the word; only the
        # initial letter changes case.
        boundary, initial, rest = found.groups()
        return f'{boundary}{initial.lower()}{rest}'

    return sentence_start.sub(_decapitalize, text)

In [5]:
# Apply sentence-initial lowercasing to every tweet of both splits.
train_lower = list(map(lowercase_caps, train_data))
test_lower = list(map(lowercase_caps, test_data))

#### Convert emojis to text, then tokenize and build vocabulary

In [6]:
def demojize_text(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized

# Run the emoji conversion over the lowercased tweets of both splits.
train_lower_demoj = list(map(demojize_text, train_lower))
test_lower_demoj = list(map(demojize_text, test_lower))

#### Create 2 versions of V: with stemming and without stemming

In [7]:
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

def build_vocabulary(texts, use_stemming=False):
    """Collect the distinct tokens found in ``texts``, optionally stemmed.

    Each element of ``texts`` is tokenized with ``tokenize``. When
    ``use_stemming`` is true, the tokens are run through a ``PorterStemmer``
    via ``stem_tokens`` before being recorded.

    Returns a list of the unique tokens in the order they were first seen.
    """
    porter = PorterStemmer()
    # A dict preserves insertion order, so it acts as an ordered set here.
    first_seen = {}

    for document in texts:
        words = tokenize(document)
        if use_stemming:
            words = stem_tokens(words, porter)
        for word in words:
            first_seen.setdefault(word, None)

    return list(first_seen)

In [8]:
# The two names used to be swapped: `v_stemming` held the un-stemmed
# vocabulary and `v_no_stemming` the stemmed one. Each name now matches its
# content; the printed output is unchanged.
v_no_stemming = build_vocabulary(train_lower_demoj, use_stemming=False)
v_stemming = build_vocabulary(train_lower_demoj, use_stemming=True)

print("V without Stemming:", v_no_stemming[:100])
print("V with Stemming:", v_stemming[:10])

V without Stemming: ['@', 'SouthwestAir', 'I', 'would', 'appreciate', 'that', '.', 'thank', 'you', 'USAirways', 'very', 'much', 'JetBlue', "'m", 'all', 'set', 'about', 'to', 'fly', 'not', 'bad', 'for', 'a', 'first', 'date', 'with', 'giant', 'metal', 'bird', 'machine', 'she', 'even', 'brought', 'snacks', 'got', 'flight', 'at', '11:55am', 'on', 'Thursday', 'but', 'looking', 'something', 'tomorrow', 'anything', 'available', '?', 'AmericanAir', "'re", 'my', 'early', 'frontrunner', 'best', 'airline', '!', '#', 'oscars2016', 'RedCarpet', 'Southwest', 'Companion', 'Pass', 'be', 'great', 'major', 'issues', 'getting', 'out', 'of', 'Boston', 'your', 'crew', 'has', 'been', 'exceptional', 'let', "'s", 'see', 'how', 'things', 'roll', 'in', 'Philly', 'thanks', 'i', 'prompt', 'response', 'united', 'such', 'relaxing', 'space', 'drink', 'before', '(', 'United', 'Global', 'First', 'Lounge', ')', 'https', ':']
V with Stemming: ['@', 'southwestair', 'i', 'would', 'appreci', 'that', '.', 'thank', 'you', 'u

In [9]:
import pandas as pd

In [10]:
# Pair every training tweet with its sentiment label: one column of texts,
# one column of labels.
dictionary = dict(text=train_data, label=train_labels)

df = pd.DataFrame(dictionary)

df.head()

Unnamed: 0,text,label
0,@SouthwestAir I would appreciate that. Thank ...,1
1,@USAirways thank you very much.,1
2,@JetBlue I'm all set. About to fly. Not bad fo...,1
3,@SouthwestAir I got a flight at 11:55am on Thu...,1
4,@AmericanAir you're my early frontrunner for b...,1


In [11]:
positive_words_stemmed = []
positive_words_nostem = []

# Collect the words of every positive tweet (label == 1), with and without
# stemming.
for text in df.loc[df['label'] == 1, 'text']:
    demoj_tweet = demojize_text(lowercase_caps(text))

    # build_vocabulary expects raw texts and tokenizes them itself, so the
    # cleaned tweet is passed as a one-element list. Passing an already
    # tokenized list made every token go through word_tokenize a second time,
    # which is wasteful and can split a stand-alone token differently from
    # the single-pass tokenization used for the word counts.
    positive_words_stemmed.extend(build_vocabulary([demoj_tweet], use_stemming=True))
    positive_words_nostem.extend(build_vocabulary([demoj_tweet], use_stemming=False))

In [12]:
negative_words_stemmed = []
negative_words_nostem = []

# Collect the words of every negative tweet (label == 0), with and without
# stemming.
for text in df.loc[df['label'] == 0, 'text']:
    demoj_tweet = demojize_text(lowercase_caps(text))

    # build_vocabulary expects raw texts and tokenizes them itself, so the
    # cleaned tweet is passed as a one-element list. Passing an already
    # tokenized list made every token go through word_tokenize a second time,
    # which is wasteful and can split a stand-alone token differently from
    # the single-pass tokenization used for the word counts.
    negative_words_stemmed.extend(build_vocabulary([demoj_tweet], use_stemming=True))
    negative_words_nostem.extend(build_vocabulary([demoj_tweet], use_stemming=False))

In [13]:
# Using the stemmed words for now.
positive_words = set(positive_words_stemmed)
negative_words = set(negative_words_stemmed)

# Remove links from both word sets: any token containing "//t.co".
remove_list = [word for word in positive_words if "//t.co" in word]
positive_words.difference_update(remove_list)

remove_list = [word for word in negative_words if "//t.co" in word]
negative_words.difference_update(remove_list)

# Map each word to the label of the class it was seen in.
positive_words_dict = dict.fromkeys(positive_words, 1)
negative_words_dict = dict.fromkeys(negative_words, 0)

# Combine both dictionaries; a word present in both classes ends up with the
# negative label (0) because the negative entries are applied last.
combined_dict = dict(positive_words_dict)
combined_dict.update(negative_words_dict)

# print(combined_dict)
#print(positive_words_nostem)

In [14]:
# Count vectorizer: given a data set of labelled tweets, build one bag of
# word counts per class.
def CountVectorizer(data):
    """Count how often each token occurs in positive and in negative tweets.

    ``data`` is iterated with ``iterrows``; each row must provide a ``text``
    and a ``label`` entry, and the label must be 1 or 0. Every text is
    lowercased at sentence starts, demojized and tokenized with
    ``word_tokenize`` before counting.

    Returns a dict ``{1: defaultdict(int), 0: defaultdict(int)}`` mapping
    each token to its number of occurrences in that class.
    """
    counts = {cls: defaultdict(int) for cls in (1, 0)}

    for _, row in data.iterrows():
        cleaned = demojize_text(lowercase_caps(row['text']))

        # Every occurrence counts, so repeated words in a tweet add up.
        for token in word_tokenize(cleaned):
            counts[row['label']][token] += 1

    return counts

Multicounts = CountVectorizer(df)
# Display the counts of both classes, positive first.
for heading, sentiment_label in (("Positive counts:", 1), ("Negative counts:", 0)):
    print(heading, dict(Multicounts[sentiment_label]))

Positive counts: {'@': 1342, 'SouthwestAir': 272, 'I': 264, 'would': 29, 'appreciate': 32, 'that': 85, '.': 993, 'thank': 183, 'you': 433, 'USAirways': 125, 'very': 33, 'much': 64, 'JetBlue': 283, "'m": 26, 'all': 58, 'set': 4, 'about': 25, 'to': 477, 'fly': 28, 'not': 44, 'bad': 10, 'for': 330, 'a': 250, 'first': 21, 'date': 6, 'with': 94, 'giant': 1, 'metal': 1, 'bird': 3, 'machine': 1, 'she': 20, 'even': 12, 'brought': 2, 'snacks': 5, 'got': 49, 'flight': 181, 'at': 89, '11:55am': 1, 'on': 183, 'Thursday': 1, 'but': 48, 'looking': 14, 'something': 8, 'tomorrow': 16, 'anything': 7, 'available': 2, '?': 60, 'AmericanAir': 193, "'re": 20, 'my': 171, 'early': 7, 'frontrunner': 1, 'best': 35, 'airline': 36, '!': 1008, '#': 355, 'oscars2016': 1, 'RedCarpet': 2, 'Southwest': 15, 'Companion': 2, 'Pass': 2, 'be': 69, 'great': 111, 'major': 3, 'issues': 5, 'getting': 15, 'out': 49, 'of': 126, 'Boston': 6, 'your': 114, 'crew': 36, 'has': 22, 'been': 23, 'exceptional': 4, 'let': 8, "'s": 96, 's

### Create Binary Vectorizer

In [15]:
def BinaryVectorizer(data, labels):
    """Count, per class, in how many tweets each token appears at least once.

    ``data`` and ``labels`` are walked in parallel; each label must be 1 or 0.
    Every text is lowercased at sentence starts, demojized and tokenized with
    ``word_tokenize``. A token is counted once per tweet no matter how often
    it repeats inside that tweet.

    Returns a dict ``{1: defaultdict(int), 0: defaultdict(int)}`` mapping
    each token to the number of tweets of that class containing it.
    """
    counts = {cls: defaultdict(int) for cls in (1, 0)}

    for text, label in zip(data, labels):
        cleaned = demojize_text(lowercase_caps(text))

        # The set collapses duplicates so each tweet contributes at most 1.
        for word in set(word_tokenize(cleaned)):
            counts[label][word] += 1

    return counts

# Per-class document frequencies for the training tweets.
binary_counts = BinaryVectorizer(data=train_data, labels=train_labels)
print(binary_counts)



In [16]:
# Class sizes, used to calculate the priors.
num_positive_tweets = int((df['label'] == 1).sum())
num_negative_tweets = int((df['label'] == 0).sum())
total_train = len(train_data)

print(f"Number of Positive Tweets: {num_positive_tweets}")
print(f"Number of Negative Tweets: {num_negative_tweets}")
print(f"Total Number of Tweets: {total_train}")

Number of Positive Tweets: 1181
Number of Negative Tweets: 3000
Total Number of Tweets: 4181


In [17]:
# Denominator terms for add-one (Laplace) smoothing:
#   P(w | c) = (count(w, c) + 1) / (total tokens in c + |V|)
# The per-class term must therefore be the total number of token occurrences
# in that class, not the number of distinct words (which is what
# len(Multicounts[c]) gives).
num_negative_words = sum(Multicounts[0].values())
num_positive_words = sum(Multicounts[1].values())

# |V| has to be measured over the same tokens the counts are keyed by, so it
# is derived from Multicounts rather than from the stemmed, link-filtered
# word sets.
total_vocab = len(set(Multicounts[0]) | set(Multicounts[1]))

In [18]:
# Per-class word likelihood tables, filled in by get_likelihoods:
#   0 -> likelihoods for negative tweets
#   1 -> likelihoods for positive tweets
likelihoods = {cls: defaultdict(int) for cls in (0, 1)}

In [19]:
def get_likelihoods(df, counts, num_negative_words, num_positive_words, total_vocab):
    """Fill the module-level ``likelihoods`` table with add-one smoothed values.

    Every tweet in ``df`` (column ``text``) is lowercased at sentence starts,
    demojized and tokenized. For each token the function stores

        (counts[c].get(token, 0) + 1) / (num_<class>_words + total_vocab)

    under ``likelihoods[0]`` (negative) and ``likelihoods[1]`` (positive).
    A token seen several times is simply overwritten with the same value.

    The function mutates ``likelihoods`` in place and returns ``None``.
    """
    negative_denominator = num_negative_words + total_vocab
    positive_denominator = num_positive_words + total_vocab

    # Iterate over each row in the dataframe.
    for _, row in df.iterrows():
        cleaned = demojize_text(lowercase_caps(row['text']))

        for token in word_tokenize(cleaned):
            smoothed_negative = (counts[0].get(token, 0) + 1) / negative_denominator
            smoothed_positive = (counts[1].get(token, 0) + 1) / positive_denominator

            likelihoods[0][token] = smoothed_negative
            likelihoods[1][token] = smoothed_positive

In [20]:
# Populate the `likelihoods` table from the per-class word counts.
get_likelihoods(
    df,
    Multicounts,
    num_negative_words=num_negative_words,
    num_positive_words=num_positive_words,
    total_vocab=total_vocab,
)

In [21]:
# Show the likelihoods computed for the positive class (label 1).
positive_likelihoods = dict(likelihoods[1])
print("Likelihoods for positive class:", positive_likelihoods)

