# Analysis of tweet text

In [1]:
import pymongo
import re

In [2]:
# Open the 'tweets' collection of the 'twitter' database.
# MongoClient() is called without arguments, so the driver's default connection settings apply.
client = pymongo.MongoClient()
tweet_collection = client['twitter']['tweets']
# Cursor over every stored tweet document (no filter is passed to find()).
tweets = tweet_collection.find()
# Peek at one sample document (index 6) to inspect the tweet schema.
tweets[6]

{'_id': ObjectId('5ae618241c398f14a3c0d5a8'),
 'created_at': 'Sun Apr 29 18:22:41 +0000 2018',
 'id': 990657640584941575,
 'id_str': '990657640584941575',
 'text': 'It’s hard to choose just 1 lake of our 10,000+, but @mprnews has narrowed it down in their Minnesota’s Most-Loved L… https://t.co/74H5danWhH',
 'truncated': True,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'MPRnews',
    'name': 'MPR News',
    'id': 15965292,
    'id_str': '15965292',
    'indices': [52, 60]}],
  'urls': [{'url': 'https://t.co/74H5danWhH',
    'expanded_url': 'https://twitter.com/i/web/status/990657640584941575',
    'display_url': 'twitter.com/i/web/status/9…',
    'indices': [117, 140]}]},
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 3

In [3]:
# Break the text content of the sample tweet into tokens.
# split() with no separator splits on any run of whitespace, so tokens separated by
# newlines or tabs are split apart and repeated spaces do not produce empty strings.
words = tweets[6]['text'].split()

In [4]:
# Show the tokens ('words') produced by splitting the sample tweet's text.
words

['It’s',
 'hard',
 'to',
 'choose',
 'just',
 '1',
 'lake',
 'of',
 'our',
 '10,000+,',
 'but',
 '@mprnews',
 'has',
 'narrowed',
 'it',
 'down',
 'in',
 'their',
 'Minnesota’s',
 'Most-Loved',
 'L…',
 'https://t.co/74H5danWhH']

In [5]:
# Find any urls in the list of words.
# Located better regex for url pattern matching - http://www.noah.org/wiki/RegEx_Python
# Written as a raw string so that `\(` and `\)` reach the regex engine verbatim instead
# of being parsed as (invalid) string escapes; compiled once rather than per word.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def url_locator(word_list):
    """Return the first http(s) URL found in each word of ``word_list``.

    Args:
        word_list: Iterable of strings (e.g. the tokens of a tweet).

    Returns:
        list[str]: One entry per word that contains a URL, in input order.
        Only the first URL inside a word is reported. The entry is the matched
        URL text, which can be shorter than the word itself (for example when
        the URL is wrapped in double quotes).
    """
    urls = []
    for word in word_list:
        # search() yields the same first match that findall(...)[0] would.
        match = _URL_PATTERN.search(word)
        if match:
            urls.append(match.group(0))
    return urls
# Collect the URLs present in the sample tweet's tokens and display them.
urls = url_locator(words)
urls

['https://t.co/74H5danWhH']

In [6]:
def remove_urls_from_word_list(word_list, url_list):
    """Delete URL tokens from ``word_list`` in place and return the same list.

    For every entry of ``url_list`` one token is removed: the first token equal
    to the URL, or, if there is none, the first token that contains the URL
    (e.g. a URL wrapped in quotes or followed by punctuation). A URL that
    matches no token is skipped instead of raising ``ValueError``.

    Args:
        word_list: Mutable list of string tokens; modified in place.
        url_list: Iterable of URL strings to remove, one removal per entry.

    Returns:
        list[str]: ``word_list`` itself, after the removals.
    """
    for url in url_list:
        # Exact token match first, so inputs that worked before behave the same.
        idx = next((i for i, word in enumerate(word_list) if word == url), None)
        if idx is None and url:
            # Fallback: the URL is embedded in a longer token. The `url` guard
            # keeps an empty string from matching (and deleting) every token.
            idx = next((i for i, word in enumerate(word_list) if url in word), None)
        if idx is not None:
            del word_list[idx]
    return word_list


# Drop the located URLs from the token list, then display what remains.
# The function edits the list it is given and returns that same list.
words = remove_urls_from_word_list(words, urls)
words

['It’s',
 'hard',
 'to',
 'choose',
 'just',
 '1',
 'lake',
 'of',
 'our',
 '10,000+,',
 'but',
 '@mprnews',
 'has',
 'narrowed',
 'it',
 'down',
 'in',
 'their',
 'Minnesota’s',
 'Most-Loved',
 'L…']

In [7]:
# Collect @mentions
def remove_user_handles(word_list):
    """Return the tokens of ``word_list`` that start with '@', in input order.

    Despite the function's name, nothing is removed: ``word_list`` is only
    read, and the matching tokens are returned in a new list.
    """
    return [token for token in word_list if token.startswith('@')]


# Gather the tokens that begin with '@' and display them.
users = remove_user_handles(words)
users        

['@mprnews']

In [8]:
def strip_extra_characters(word_list):
    """Return a new list with every character outside a-z, A-Z, 0-9 removed from each word.

    The result has one entry per input word, in the same order. A word made up
    entirely of other characters becomes an empty string rather than being
    dropped, and non-ASCII letters are stripped as well.
    """
    non_alphanumeric = re.compile('[^a-zA-Z0-9]')
    return [non_alphanumeric.sub('', token) for token in word_list]
# Build the cleaned token list (letters and digits only) and display it.
clean_words = strip_extra_characters(words)
clean_words

['Its',
 'hard',
 'to',
 'choose',
 'just',
 '1',
 'lake',
 'of',
 'our',
 '10000',
 'but',
 'mprnews',
 'has',
 'narrowed',
 'it',
 'down',
 'in',
 'their',
 'Minnesotas',
 'MostLoved',
 'L']

In [9]:
# Original tokens that do not appear verbatim among the cleaned tokens.
# sorted() gives a reproducible display order; iteration order of a set of
# strings can differ from one kernel session to the next.
sorted(set(words) - set(clean_words))

['Minnesota’s', 'L…', '@mprnews', 'It’s', 'Most-Loved', '10,000+,']