# Analysis of tweet text

In [1]:
import pymongo
import re
import pprint

In [2]:
# Get the tweets stored in MongoDB.
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used (presumably a local mongod — confirm).
client = pymongo.MongoClient()
# find() with no filter returns a cursor over every document in the
# 'tweets' collection of the 'twitter' database.
tweets = client['twitter']['tweets'].find()
# Take the document at position 15 of the cursor as the worked example.
# NOTE(review): no sort is applied here, so whether this really is the
# "latest" tweet depends on the collection's natural order — confirm.
latest_tweet = tweets[15]

In [3]:
def remove_from_word_list(original_list, list_subset):
    """Remove one occurrence of each item in ``list_subset`` from ``original_list``.

    For every item in ``list_subset`` the first equal element of
    ``original_list`` is deleted. The list is modified in place and the same
    list object is returned, so callers may either use the return value or
    keep using the list they passed in.

    Items that are not present in ``original_list`` are skipped rather than
    raising ``ValueError``. This keeps the call safe when it is repeated on an
    already-filtered list, or when an extracted token does not match a word
    exactly.

    Args:
        original_list: list of words to filter (mutated in place).
        list_subset: iterable of items to remove, one occurrence each.

    Returns:
        ``original_list`` itself, after the removals.
    """
    for item in list_subset:
        try:
            # list.remove deletes the first element equal to ``item``.
            original_list.remove(item)
        except ValueError:
            # ``item`` is not in the list: nothing to remove for it.
            continue
    return original_list

In [4]:
def print_important_content(tweet):
    """Print a short, human-readable summary of a tweet document.

    Three things are written to stdout, in this order:
      1. the user's name and the creation timestamp, joined by ' - ';
      2. the tweet's full text wrapped in double quotes;
      3. the pretty-printed ``entities`` mapping.

    Args:
        tweet: mapping with the keys 'user' (containing 'name'),
            'created_at', 'full_text' and 'entities'.
    """
    header = ' - '.join((tweet['user']['name'], tweet['created_at']))
    print(header)

    quoted_text = ''.join(('"', tweet['full_text'], '"'))
    print(quoted_text)

    pprint.pprint(tweet['entities'])
    
# Show the author, timestamp, text and entities of the sample tweet.
print_important_content(latest_tweet)

Todd Waterbury - Sun Apr 29 23:55:50 +0000 2018
"Thanks @forbes for this piece on the power of  #inclusivity + #creativity and proud that @target is part of this amazing organization https://t.co/xvIGtWTpsB"
{'hashtags': [{'indices': [47, 59], 'text': 'inclusivity'},
              {'indices': [62, 73], 'text': 'creativity'}],
 'symbols': [],
 'urls': [{'display_url': 'forbes.com/sites/westernb…',
           'expanded_url': 'http://www.forbes.com/sites/westernbonime/2018/04/29/creative-growth-where-great-art-has-become-the-coolest-fashion-show-in-town/#770906a7182d',
           'indices': [134, 157],
           'url': 'https://t.co/xvIGtWTpsB'}],
 'user_mentions': [{'id': 91478624,
                    'id_str': '91478624',
                    'indices': [7, 14],
                    'name': 'Forbes',
                    'screen_name': 'Forbes'},
                   {'id': 89084561,
                    'id_str': '89084561',
                    'indices': [89, 96],
                    'name

In [5]:
def hashtag(tweet):
    """Return the hashtag texts found under ``tweet['hashtags']``, each prefixed with '#'.

    Despite the parameter name, this notebook calls the function with a
    tweet's ``entities`` mapping, which is where the 'hashtags' list lives.

    Args:
        tweet: mapping with a 'hashtags' key holding a list of dicts, each
            with a 'text' entry.

    Returns:
        List of strings such as ``['#inclusivity', '#creativity']``, in the
        order the hashtags appear in the input.
    """
    return ['#' + entry['text'] for entry in tweet['hashtags']]

# Build the '#'-prefixed hashtag strings from the sample tweet's entities
# and display them.
hashtags = hashtag(latest_tweet['entities'])
hashtags

['#inclusivity', '#creativity']

In [6]:
# Display the full stored document for reference.
latest_tweet

{'_id': ObjectId('5ae674b81c398f21b4830aaa'),
 'created_at': 'Sun Apr 29 23:55:50 +0000 2018',
 'id': 990741478279471104,
 'id_str': '990741478279471104',
 'full_text': 'Thanks @forbes for this piece on the power of  #inclusivity + #creativity and proud that @target is part of this amazing organization https://t.co/xvIGtWTpsB',
 'truncated': False,
 'display_text_range': [0, 157],
 'entities': {'hashtags': [{'text': 'inclusivity', 'indices': [47, 59]},
   {'text': 'creativity', 'indices': [62, 73]}],
  'symbols': [],
  'user_mentions': [{'screen_name': 'Forbes',
    'name': 'Forbes',
    'id': 91478624,
    'id_str': '91478624',
    'indices': [7, 14]},
   {'screen_name': 'Target',
    'name': 'Target',
    'id': 89084561,
    'id_str': '89084561',
    'indices': [89, 96]}],
  'urls': [{'url': 'https://t.co/xvIGtWTpsB',
    'expanded_url': 'http://www.forbes.com/sites/westernbonime/2018/04/29/creative-growth-where-great-art-has-become-the-coolest-fashion-show-in-town/#770906a7182d',
  

In [7]:
# Break the text content apart for better formatting.
# Splitting on a single space (rather than on any run of whitespace) means
# consecutive spaces in the text produce empty-string entries in the list.
words = latest_tweet['full_text'].split(' ')

In [8]:
# Display the list of space-separated tokens produced by the split.
words

['Thanks',
 '@forbes',
 'for',
 'this',
 'piece',
 'on',
 'the',
 'power',
 'of',
 '',
 '#inclusivity',
 '+',
 '#creativity',
 'and',
 'proud',
 'that',
 '@target',
 'is',
 'part',
 'of',
 'this',
 'amazing',
 'organization',
 'https://t.co/xvIGtWTpsB']

In [9]:
# Drop the hashtag tokens from the word list and display what remains.
# remove_from_word_list edits the list it is given in place and returns it.
words = remove_from_word_list(words, hashtags)
words

['Thanks',
 '@forbes',
 'for',
 'this',
 'piece',
 'on',
 'the',
 'power',
 'of',
 '',
 '+',
 'and',
 'proud',
 'that',
 '@target',
 'is',
 'part',
 'of',
 'this',
 'amazing',
 'organization',
 'https://t.co/xvIGtWTpsB']

In [10]:
# Find any urls in the list of words.
# Located better regex for url pattern matching - http://www.noah.org/wiki/RegEx_Python
def url_locator(word_list):
    """Return the first http(s) URL found in each word of ``word_list``.

    Each word is searched independently; words without a URL contribute
    nothing, and at most one URL (the first match) is taken per word.

    Args:
        word_list: iterable of strings to search.

    Returns:
        List of the matched URL substrings, in the order of the words they
        were found in. A match can be shorter than the word that contains it.
    """
    # Raw string so that \( and \) reach the regex engine unchanged; in a
    # plain string literal they are invalid escape sequences, which newer
    # Python versions warn about.
    # NOTE: `$-_` inside the character class is a range (from '$' to '_');
    # that range is what allows '/', ':', '?', '=' etc. to match, so it is
    # intentionally left as-is.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    urls = []
    for word in word_list:
        match = url_pattern.search(word)
        if match:
            urls.append(match.group(0))
    return urls
# Collect the URLs found in the remaining words and display them.
urls = url_locator(words)
urls

['https://t.co/xvIGtWTpsB']

In [11]:
# Drop the URL tokens from the word list and display what remains.
words = remove_from_word_list(words, urls)
words

['Thanks',
 '@forbes',
 'for',
 'this',
 'piece',
 'on',
 'the',
 'power',
 'of',
 '',
 '+',
 'and',
 'proud',
 'that',
 '@target',
 'is',
 'part',
 'of',
 'this',
 'amazing',
 'organization']

In [12]:
# Detect @mentions (removal happens separately, via remove_from_word_list).
def detect_user_handles(word_list):
    """Return the entries of ``word_list`` that start with '@'.

    Args:
        word_list: iterable of strings.

    Returns:
        List of the matching entries, unchanged and in their original order.
    """
    return [token for token in word_list if token.startswith('@')]

# Collect the @-prefixed tokens from the remaining words and display them.
users = detect_user_handles(words)
users

['@forbes', '@target']

In [13]:
# Drop the user-handle tokens from the word list and display what remains.
words = remove_from_word_list(words, users)
words

['Thanks',
 'for',
 'this',
 'piece',
 'on',
 'the',
 'power',
 'of',
 '',
 '+',
 'and',
 'proud',
 'that',
 'is',
 'part',
 'of',
 'this',
 'amazing',
 'organization']

In [14]:
def strip_nonalphanumeric_characters(word_list):
    """Strip every character other than a-z, A-Z and 0-9 from each word.

    Words that become empty after stripping (including words that were
    already empty) are left out of the result.

    Args:
        word_list: iterable of strings.

    Returns:
        New list of the stripped, non-empty words in their original order.
    """
    non_alnum = re.compile('[^a-zA-Z0-9]')
    stripped = (non_alnum.sub('', token) for token in word_list)
    return [token for token in stripped if token != '']
# Strip non-alphanumeric characters from the remaining words and display
# the cleaned list.
clean_words = strip_nonalphanumeric_characters(words)
clean_words

['Thanks',
 'for',
 'this',
 'piece',
 'on',
 'the',
 'power',
 'of',
 'and',
 'proud',
 'that',
 'is',
 'part',
 'of',
 'this',
 'amazing',
 'organization']

In [15]:
# List the distinct tokens in `words` that have no identical entry in
# `clean_words`, i.e. tokens that the cleaning step altered or dropped.
list(set(words) - set(clean_words))

['', '+']