### Import data from text files

In [26]:
import os
import glob
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import emoji
from collections import Counter, defaultdict

In [2]:
import os
import glob

def load_data(base_path):
    data = []
    labels = []
    os.chdir(base_path)
    
    for sentiment in ['positive', 'negative']:
        os.chdir(sentiment)
        files = glob.glob('*.txt')
        for file_name in files:
            with open(file_name, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                data.append(content)
                labels.append(1 if sentiment == 'positive' else 0)
        
        os.chdir('..')

    os.chdir('..')
    return data, labels

base_path = '.'
train_data, train_labels = load_data(base_path + '/train')
test_data, test_labels = load_data(base_path + '/test')


In [3]:
print(train_data[:5])
print(test_data[:5])

['@SouthwestAir I would appreciate that.  Thank you.', '@USAirways thank you very much.', "@JetBlue I'm all set. About to fly. Not bad for a first date with a giant metal bird machine. She even brought snacks.", '@SouthwestAir I got a flight at 11:55am on Thursday but looking for something tomorrow anything available?', "@AmericanAir you're my early frontrunner for best airline! #oscars2016"]
['@united maybe on my return trip 👍', "@AmericanAir no kidding! Gonna take some beating on the apron... And there are some good lookin' planes out there!", '@AmericanAir thanks', '@AmericanAir many trips coming up!  I will see you soon 😃', '@JetBlue Thank you guys! Brilliant customer service']


### Clean data

#### Lowercase capitals at the beginning of words

In [4]:
def lowercase_caps(text):

    def lower_first_letter(match):
        return match.group(1) + match.group(2).lower() + match.group(3)

    pattern = r'(\A|\.\s+|\?\s+|!\s+)([A-Z])(\w*)'
    result = re.sub(pattern, lower_first_letter, text)
    
    return result

In [5]:
train_lower = [lowercase_caps(tweet) for tweet in train_data]
test_lower = [lowercase_caps(tweet) for tweet in test_data]

#### Tokenize, build vocabulary

In [6]:
def demojize_text(text):
    return emoji.demojize(text)

train_lower_demoj = [demojize_text(tweet) for tweet in train_lower]
test_lower_demoj = [demojize_text(tweet) for tweet in test_lower]

#### Create 2 versions of V: with stemming and without stemming

In [7]:
def tokenize(text):
    return word_tokenize(text)

def stem_tokens(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]

def build_vocabulary(texts, use_stemming=False):
    """ Build vocabulary with optional stemming """
    stemmer = PorterStemmer()
    vocabulary = Counter()

    for text in texts:
        tokens = tokenize(text)
        if use_stemming:
            tokens = stem_tokens(tokens, stemmer)
        vocabulary.update(tokens)

    return list(vocabulary.keys())

In [8]:
v_stemming = build_vocabulary(train_lower_demoj, use_stemming=False)
v_no_stemming = build_vocabulary(train_lower_demoj, use_stemming=True)

print("V without Stemming:", v_stemming[:100])
print("V with Stemming:", v_no_stemming[:10])

V without Stemming: ['@', 'SouthwestAir', 'I', 'would', 'appreciate', 'that', '.', 'thank', 'you', 'USAirways', 'very', 'much', 'JetBlue', "'m", 'all', 'set', 'about', 'to', 'fly', 'not', 'bad', 'for', 'a', 'first', 'date', 'with', 'giant', 'metal', 'bird', 'machine', 'she', 'even', 'brought', 'snacks', 'got', 'flight', 'at', '11:55am', 'on', 'Thursday', 'but', 'looking', 'something', 'tomorrow', 'anything', 'available', '?', 'AmericanAir', "'re", 'my', 'early', 'frontrunner', 'best', 'airline', '!', '#', 'oscars2016', 'RedCarpet', 'Southwest', 'Companion', 'Pass', 'be', 'great', 'major', 'issues', 'getting', 'out', 'of', 'Boston', 'your', 'crew', 'has', 'been', 'exceptional', 'let', "'s", 'see', 'how', 'things', 'roll', 'in', 'Philly', 'thanks', 'i', 'prompt', 'response', 'united', 'such', 'relaxing', 'space', 'drink', 'before', '(', 'United', 'Global', 'First', 'Lounge', ')', 'https', ':']
V with Stemming: ['@', 'southwestair', 'i', 'would', 'appreci', 'that', '.', 'thank', 'you', 'u

In [9]:
import pandas as pd

In [10]:
# dictionary of lists 
dictionary = {'text': train_data, 'label': train_labels} 
   
df = pd.DataFrame(dictionary)

df.head()

Unnamed: 0,text,label
0,@SouthwestAir I would appreciate that. Thank ...,1
1,@USAirways thank you very much.,1
2,@JetBlue I'm all set. About to fly. Not bad fo...,1
3,@SouthwestAir I got a flight at 11:55am on Thu...,1
4,@AmericanAir you're my early frontrunner for b...,1


In [11]:
positive_words_stemmed = []
positive_words_nostem = []

#list of all words where label = 1
for index, row in df[df['label'] == 1].iterrows():
    lower_tweet = lowercase_caps(row['text'])
    demoj_tweet = demojize_text(lower_tweet)
    tokenized_tweet = word_tokenize(demoj_tweet)
    
    positive_words_stemmed.extend(build_vocabulary(tokenized_tweet, use_stemming=True))
    positive_words_nostem.extend(build_vocabulary(tokenized_tweet, use_stemming=False))

In [12]:
negative_words_stemmed = []
negative_words_nostem = []

#list of all words where label= 0
for index, row in df[df['label'] == 0].iterrows():
    lower_tweet = lowercase_caps(row['text'])
    demoj_tweet = demojize_text(lower_tweet)
    tokenized_tweet = word_tokenize(demoj_tweet)
    
    negative_words_stemmed.extend(build_vocabulary(tokenized_tweet, use_stemming=True))
    negative_words_nostem.extend(build_vocabulary(tokenized_tweet, use_stemming=False))

In [23]:
#using the stemmed words for now
positive_words= set(positive_words_stemmed)
remove_list =[]
negative_words= set(negative_words_stemmed)

#Removal of Links in word sets
for word in positive_words:
    if "//t.co" in word:
        remove_list.append(word)

for word in remove_list:
    positive_words.remove(word)
remove_list =[]
for word in negative_words:
    if "//t.co" in word:
        remove_list.append(word)

for word in remove_list:
    negative_words.remove(word)

#create the dictionary
positive_words_dict = {word: 1 for word in positive_words}
negative_words_dict = {word: 0 for word in negative_words}

#combine both dictionaries
combined_dict = {**positive_words_dict, **negative_words_dict}

print(combined_dict)
#print(positive_words_dict)

{'flightlat': 0, 'domest': 0, 'ppl': 0, 'whatev': 0, '32': 0, 'he': 0, 'two_heart': 1, 'awhil': 0, 'friendlyski': 1, 'back-end': 1, 'told': 0, 'amiltx3': 1, 'similar': 1, 'promis': 0, 'brancato': 1, 'away': 0, '833': 1, 'md80': 0, 'complaint': 0, 'cayman_island': 1, 'believ': 0, 'start': 0, 'center': 0, 'allyoucanjetpass': 1, 'afford': 0, 'respond': 0, 'serv': 0, 'meggersrock': 1, 'readi': 0, 'current': 0, 'automat': 0, '0xjare': 1, 'rock': 0, 'a1': 1, 'expedit': 1, 'soulandinspir': 1, 'card': 0, 'advantag': 0, 'jilt': 1, 'feelbett': 1, 'q3': 1, 'booz': 1, 'aa': 0, 'flavor': 1, 'ad': 0, 'nyc-jfk': 1, 'vp': 0, 'herman': 1, 'flatter': 0, 'altonbrownl': 1, 'valentin': 0, 'dai_presid': 1, 'bay': 0, 'glitch': 0, 'yvonn': 1, 'rep': 0, 'southwestair': 0, 'system': 0, 'social': 0, '51': 0, 'tampa': 0, 'emb145': 1, 'yesso': 1, 'act': 0, '20': 0, 'upgrad': 0, 'into': 0, 'befor': 0, 'him': 0, 'flybett': 1, 'ella-ma': 1, 'spell': 1, 'gettin': 0, 'man': 0, 'ps': 0, 'team': 0, 'liveri': 1, 'wikipear

In [42]:
#Count Vectorizer
#Supply data set of just tweets
def CountVectorizer(data):
    for tweet in data:
        count = dict(Counter(tweet.split(' ')))
        print(count)
CountVectorizer(train_lower_demoj)
            
    

{'@SouthwestAir': 1, 'I': 1, 'would': 1, 'appreciate': 1, 'that.': 1, '': 1, 'thank': 1, 'you.': 1}
{'@USAirways': 1, 'thank': 1, 'you': 1, 'very': 1, 'much.': 1}
{'@JetBlue': 1, "I'm": 1, 'all': 1, 'set.': 1, 'about': 1, 'to': 1, 'fly.': 1, 'not': 1, 'bad': 1, 'for': 1, 'a': 2, 'first': 1, 'date': 1, 'with': 1, 'giant': 1, 'metal': 1, 'bird': 1, 'machine.': 1, 'she': 1, 'even': 1, 'brought': 1, 'snacks.': 1}
{'@SouthwestAir': 1, 'I': 1, 'got': 1, 'a': 1, 'flight': 1, 'at': 1, '11:55am': 1, 'on': 1, 'Thursday': 1, 'but': 1, 'looking': 1, 'for': 1, 'something': 1, 'tomorrow': 1, 'anything': 1, 'available?': 1}
{'@AmericanAir': 1, "you're": 1, 'my': 1, 'early': 1, 'frontrunner': 1, 'for': 1, 'best': 1, 'airline!': 1, '#oscars2016': 1}
{'@SouthwestAir': 1, '#RedCarpet': 1, 'Southwest': 1, 'Companion': 1, 'Pass': 1, 'would': 1, 'be': 1, 'great!': 1}
{'@USAirways': 1, '@AmericanAir': 1, 'major': 1, 'issues': 1, 'getting': 1, 'out': 2, 'of': 1, 'Boston': 1, 'but': 1, 'your': 1, 'crew': 1, 'h