# Analysis of tweet text

In [1]:
import pymongo
import re

In [2]:
# Get the tweets stored in MongoDB.
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used.
# NOTE(review): presumably a local mongod instance - confirm.
client = pymongo.MongoClient()
# find() is called without a filter on the 'tweets' collection of the
# 'twitter' database; the result is kept as `tweets` and indexed below.
tweets = client['twitter']['tweets'].find()
# Display the document at index 4 as a sample. No sort is specified here, so
# which tweet sits at index 4 presumably depends on the server's return order.
tweets[4]

{'_id': ObjectId('5ae674b81c398f21b4830a9f'),
 'created_at': 'Mon Apr 30 01:15:01 +0000 2018',
 'id': 990761405816229888,
 'id_str': '990761405816229888',
 'full_text': 'RT @rrrins: What if we could detect the initial stages of breast cancer with something as simple as a breath test? https://t.co/S2VISdoC4K',
 'truncated': False,
 'display_text_range': [0, 138],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'rrrins',
    'name': 'Rina Raphael',
    'id': 20960764,
    'id_str': '20960764',
    'indices': [3, 10]}],
  'urls': [{'url': 'https://t.co/S2VISdoC4K',
    'expanded_url': 'https://www.fastcompany.com/40565043/breath-and-urine-tests-might-be-the-future-of-breast-cancer-screening',
    'display_url': 'fastcompany.com/40565043/breat…',
    'indices': [115, 138]}]},
 'source': '<a href="http://bufferapp.com" rel="nofollow">Buffer</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to

In [3]:
# Break the text content apart into whitespace-delimited tokens.
# split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and never yields empty strings, unlike split(' '), which produces
# '' for repeated spaces and leaves newline-joined words glued together.
words = tweets[4]['full_text'].split()

In [4]:
# Display the list of tokens produced from the tweet text. Punctuation is
# still attached to the tokens at this point (e.g. 'test?' and '@rrrins:').
words

['RT',
 '@rrrins:',
 'What',
 'if',
 'we',
 'could',
 'detect',
 'the',
 'initial',
 'stages',
 'of',
 'breast',
 'cancer',
 'with',
 'something',
 'as',
 'simple',
 'as',
 'a',
 'breath',
 'test?',
 'https://t.co/S2VISdoC4K']

In [5]:
# Find any urls in the list of words.
# Located better regex for url pattern matching - http://www.noah.org/wiki/RegEx_Python
def url_locator(word_list):
    """Return the first http/https URL found in each word of ``word_list``.

    Args:
        word_list: Iterable of strings (e.g. the tokens of a tweet).

    Returns:
        A list with one entry per word that contains a URL, in the order the
        words appear. Only the first URL inside a given word is kept.
    """
    # Raw string literal: the previous non-raw literal relied on invalid
    # string escape sequences for the parentheses, which newer Python
    # versions warn about. The pattern itself is unchanged.
    # Compiled once here rather than being rebuilt for every word.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    matches = (url_pattern.search(word) for word in word_list)
    # search() yields the leftmost match, i.e. the same value findall()[0] gave.
    return [match.group(0) for match in matches if match]
# Run the locator over the sample tweet's tokens and display what it found.
urls = url_locator(words)
urls

['https://t.co/S2VISdoC4K']

In [6]:
def remove_urls_from_word_list(word_list, url_list):
    """Return a copy of ``word_list`` with the given URLs removed.

    For each entry in ``url_list`` the first remaining word equal to it is
    dropped, so a URL listed twice removes two occurrences. A URL that does
    not appear as a whole word is skipped instead of raising ``ValueError``.

    Args:
        word_list: List of word strings. It is not modified, so the calling
            cell can be re-run safely.
        url_list: URLs to remove; each is compared against whole words.

    Returns:
        A new list containing the words that were not removed.
    """
    remaining = list(word_list)  # copy: leave the caller's list untouched
    for url in url_list:
        try:
            remaining.remove(url)
        except ValueError:
            # The URL is not present as a standalone token, e.g. it was found
            # inside a longer token such as '(http://x.com)'. Nothing to drop.
            continue
    return remaining

# Drop the located URLs from the token list (rebinding `words`) and display
# what remains.
words = remove_urls_from_word_list(words, urls)
words

['RT',
 '@rrrins:',
 'What',
 'if',
 'we',
 'could',
 'detect',
 'the',
 'initial',
 'stages',
 'of',
 'breast',
 'cancer',
 'with',
 'something',
 'as',
 'simple',
 'as',
 'a',
 'breath',
 'test?']

In [7]:
# Collect the @mentions (the function below returns the handles it finds; it does not delete them from the list)
def remove_user_handles(word_list):
    """Collect the words that start with '@'.

    Despite the name, nothing is removed: ``word_list`` is left untouched and
    the matching words are returned as a new list, in their original order.
    """
    return [token for token in word_list if token.startswith('@')]

# Gather the tokens that begin with '@' and display them; trailing
# punctuation is still attached at this stage (e.g. '@rrrins:').
users = remove_user_handles(words)
users        

['@rrrins:']

In [8]:
# def remove_extra_characters(word_list):
def strip_extra_characters(word_list):
    """Return the words with every character outside a-z, A-Z and 0-9 deleted.

    One output string is produced per input word, in the same order; a word
    consisting only of stripped characters becomes an empty string.
    """
    non_alphanumeric = re.compile('[^a-zA-Z0-9]')
    return [non_alphanumeric.sub('', token) for token in word_list]
# Strip the non-alphanumeric characters from every token and display the
# cleaned list; `words` itself is kept for the comparison that follows.
clean_words = strip_extra_characters(words)
clean_words

['RT',
 'rrrins',
 'What',
 'if',
 'we',
 'could',
 'detect',
 'the',
 'initial',
 'stages',
 'of',
 'breast',
 'cancer',
 'with',
 'something',
 'as',
 'simple',
 'as',
 'a',
 'breath',
 'test']

In [9]:
# Tokens that were altered by the character stripping (present in `words`
# but not in `clean_words`). sorted() still returns a list, but in a stable
# order: iterating a bare set of strings can give a different order in each
# kernel session because string hashing is randomized per process.
sorted(set(words) - set(clean_words))

['test?', '@rrrins:']