# Analysis of tweet text

In [1]:
import pymongo
import re

In [2]:
# Get the tweets stored in MongoDB.
# MongoClient() is called without arguments, so pymongo's default connection
# settings are used.
client = pymongo.MongoClient()
# find() with no filter: a cursor over every document in the 'tweets'
# collection of the 'twitter' database.
tweets = client['twitter']['tweets'].find()
# Take the first document the cursor yields.
# NOTE(review): no sort is applied here, so this is only the "latest" tweet if
# the collection's default order happens to put it first — confirm.
latest_tweet = tweets[0]

In [3]:
# Break the text content into tokens for further processing.
# split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and never yields empty strings, whereas split(' ') produces ''
# tokens for repeated spaces and leaves newline-joined words glued together.
words = latest_tweet['full_text'].split()

In [4]:
# Display the resulting list of tokens ('words').
words

["Here's",
 'how',
 'one',
 "man's",
 'bad',
 'math',
 'helped',
 'ruin',
 'decades',
 'of',
 'English',
 'Soccer:',
 'https://t.co/hKaRbO11en']

In [5]:
# Find any URLs in the list of words.
# The URL-matching regex is taken from http://www.noah.org/wiki/RegEx_Python
def url_locator(word_list):
    """Return every URL found in the tokens of ``word_list``.

    Each token is scanned for ``http://`` / ``https://`` URLs and all matches
    are collected in order, so a token that embeds more than one URL (for
    example two links separated by a newline) contributes each of them.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        A new list of the matched URL strings; empty when nothing matches.
    """
    # Raw string: the pattern contains backslash-escaped parentheses, which
    # are invalid escape sequences in a normal string literal and trigger a
    # warning on recent Python versions. Compiled once, outside the loop.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    urls = []
    for word in word_list:
        # The pattern has no capturing groups, so findall() returns the full
        # text of each match.
        urls.extend(url_pattern.findall(word))
    return urls
# Extract the URLs found in the tokens and display them.
urls = url_locator(words)
urls

['https://t.co/hKaRbO11en']

In [6]:
def remove_urls_from_word_list(word_list, url_list):
    """Remove each URL in ``url_list`` from ``word_list``, in place.

    For every URL, the first token equal to it is deleted. If no token is
    exactly that URL but one contains it (e.g. ``'see:https://t.co/x'``), the
    URL is cut out of the first such token and the remaining text is kept.
    URLs that do not occur at all are ignored instead of raising
    ``ValueError``.

    Args:
        word_list: List of string tokens; modified in place.
        url_list: Iterable of URL strings to remove.

    Returns:
        The same ``word_list`` object, after modification.
    """
    for url in url_list:
        if url in word_list:
            # Exact token match: drop the first occurrence.
            word_list.remove(url)
            continue
        for idx, word in enumerate(word_list):
            if url in word:
                # URL embedded in a longer token: strip just the URL. The
                # remainder is never empty because the token differs from
                # the URL.
                word_list[idx] = word.replace(url, '', 1)
                break
    return word_list

# Drop the URLs found above from the token list. The helper modifies the list
# it is given and returns that same list, so `words` is updated either way.
words = remove_urls_from_word_list(words, urls)
words

["Here's",
 'how',
 'one',
 "man's",
 'bad',
 'math',
 'helped',
 'ruin',
 'decades',
 'of',
 'English',
 'Soccer:']

In [7]:
# Collect @mentions (the helper returns them; it does not remove them from the list).
def remove_user_handles(word_list):
    """Return the tokens of ``word_list`` that start with ``'@'``.

    Despite the name, nothing is removed: ``word_list`` is left untouched and
    the matching tokens are returned in a new list, in their original order.
    """
    return [token for token in word_list if token.startswith('@')]

# Collect the '@'-prefixed tokens from `words` and display them; `words`
# itself is not changed by this call.
users = remove_user_handles(words)
users

[]

In [8]:
# def remove_extra_characters(word_list):
def strip_extra_characters(word_list):
    """Return a copy of ``word_list`` with non-alphanumeric characters removed.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` is deleted from each
    token. Tokens are kept in order, and a token made up only of such
    characters becomes an empty string rather than being dropped.
    """
    non_alnum = re.compile('[^a-zA-Z0-9]')
    return [non_alnum.sub('', token) for token in word_list]
# Build a cleaned copy of the tokens (only a-z, A-Z and 0-9 are kept) and
# display it.
clean_words = strip_extra_characters(words)
clean_words

['Heres',
 'how',
 'one',
 'mans',
 'bad',
 'math',
 'helped',
 'ruin',
 'decades',
 'of',
 'English',
 'Soccer']

In [9]:
# List the distinct original tokens that do not appear among the cleaned
# tokens; the set difference does not preserve order.
list(set(words) - set(clean_words))

["Here's", "man's", 'Soccer:']