# Analysis of tweet text

In [1]:
import pymongo
import re
import pprint
import logging
from lark import Lark
import pandas

In [2]:
# Logging functionality
# The root logger reports everything from DEBUG upwards, one record per line,
# laid out as "<timestamp> - <level name> - <message>".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG)
logging.debug('Start of program')

2018-05-13 11:15:51,544 - DEBUG - Start of program


In [3]:
# Get the tweets stored in MongoDB
# MongoClient() is called without arguments, so the driver's default connection
# settings are used.
client = pymongo.MongoClient()
# Unfiltered find() over the 'tweets' collection of the 'twitter' database.
tweets = client['twitter']['tweets'].find()
# NOTE(review): despite its name, `latest_tweet` is simply the document at
# position 75 of the query result. No sort is applied, so which tweet this is
# depends on the order in which the collection returns documents.
# TODO confirm whether the most recent tweet was intended.
latest_tweet = tweets[75]

In [4]:
def remove_from_word_list(original_list, list_subset):
    """Remove one occurrence of each item of ``list_subset`` from ``original_list``.

    The list is modified in place and the same list object is returned, so
    callers may either use the return value or keep using their own reference.

    Args:
        original_list: list of tokens to prune (mutated).
        list_subset: iterable of tokens to remove. For every entry, only the
            first matching element of ``original_list`` is deleted.

    Returns:
        ``original_list`` itself, after the removals.

    Items that are not present are skipped instead of raising ``ValueError``.
    This keeps the function safe to call again on an already-pruned list, and
    tolerates candidates (e.g. a URL extracted by regex, or a hashtag rebuilt
    from metadata) that do not match any token character-for-character.
    """
    for each in list_subset:
        try:
            # list.remove deletes the first equal element, like index() + del.
            original_list.remove(each)
        except ValueError:
            # Nothing to remove for this entry; leave the list untouched.
            continue
    return original_list

In [5]:
def print_important_content(tweet):
    """Print a short human-readable summary of a tweet document.

    Printed in order, each followed by a blank line: the author name and
    creation timestamp, the quoted full text, the expanded form of the first
    URL entity (or 'No URL'), the hashtag entities (or 'No Hashtags') and the
    symbol entities (or 'No Symbols'). The whole ``entities`` mapping is then
    pretty-printed.

    Args:
        tweet: mapping with 'user' (containing 'name'), 'created_at',
            'full_text' and 'entities' (containing 'urls', 'hashtags' and
            'symbols').
    """
    author = tweet['user']['name']
    posted_at = tweet['created_at']
    print('By {} on {}\n'.format(author, posted_at))
    print('"{}"\n'.format(tweet['full_text']))

    entities = tweet['entities']

    # Only the first URL entity is reported, even if the tweet carries several.
    url_entries = entities['urls']
    first_url = url_entries[0]['expanded_url'] if url_entries else 'No URL'
    print('{}\n'.format(first_url))

    # An empty entity list is falsy, so `or` substitutes the placeholder text.
    print('{}\n'.format(entities['hashtags'] or 'No Hashtags'))
    print('{}\n'.format(entities['symbols'] or 'No Symbols'))

    pprint.pprint(entities)
    
# Show the summary for the tweet selected from MongoDB earlier in the notebook.
print_important_content(latest_tweet)

By FiveThirtyEight on Mon Apr 30 17:45:02 +0000 2018

"The Houston Rockets have a 53% chance of winning the title, while everybody else combines for a 47% chance of taking the championship. https://t.co/wdZr2zn2Mz"

https://53eig.ht/2HDKF75

No Hashtags

No Symbols

{'hashtags': [],
 'symbols': [],
 'urls': [{'display_url': '53eig.ht/2HDKF75',
           'expanded_url': 'https://53eig.ht/2HDKF75',
           'indices': [135, 158],
           'url': 'https://t.co/wdZr2zn2Mz'}],
 'user_mentions': []}


In [6]:
def hashtag(tweet):
    """Return the hashtags of a tweet as '#'-prefixed strings.

    Args:
        tweet: mapping that has a 'hashtags' key holding a list of entity
            dicts, each with a 'text' field. Despite the parameter name, this
            is the structure of a tweet's ``entities`` sub-document.

    Returns:
        A list such as ``['#python', '#data']``, empty when there are no
        hashtags.
    """
    return ['#' + entity['text'] for entity in tweet['hashtags']]

# hashtag() reads the 'hashtags' key directly, so it is given the tweet's
# 'entities' sub-document rather than the whole tweet.
hashtags = hashtag(latest_tweet['entities'])
hashtags

[]

In [7]:
# Display the full stored document for reference.
latest_tweet

{'_id': ObjectId('5ae766611c398f0d01baa71f'),
 'created_at': 'Mon Apr 30 17:45:02 +0000 2018',
 'id': 991010554281185282,
 'id_str': '991010554281185282',
 'full_text': 'The Houston Rockets have a 53% chance of winning the title, while everybody else combines for a 47% chance of taking the championship. https://t.co/wdZr2zn2Mz',
 'truncated': False,
 'display_text_range': [0, 158],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/wdZr2zn2Mz',
    'expanded_url': 'https://53eig.ht/2HDKF75',
    'display_url': '53eig.ht/2HDKF75',
    'indices': [135, 158]}]},
 'source': '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 2303751216,
  'id_str': '2303751216',
  'name': 'FiveThirtyEight',
  'screen_name': 'FiveThirtyEight',

In [8]:
# Break the text content apart for better formatting.
# split() with no argument splits on any run of whitespace, so newlines, tabs
# and repeated spaces inside a tweet cannot produce empty tokens or glue a URL
# or @mention onto the preceding word.
words = latest_tweet['full_text'].split()

In [9]:
# Display the resulting list of tokens ('words').
words

['The',
 'Houston',
 'Rockets',
 'have',
 'a',
 '53%',
 'chance',
 'of',
 'winning',
 'the',
 'title,',
 'while',
 'everybody',
 'else',
 'combines',
 'for',
 'a',
 '47%',
 'chance',
 'of',
 'taking',
 'the',
 'championship.',
 'https://t.co/wdZr2zn2Mz']

In [10]:
# Drop the hashtag tokens from the word list.
# remove_from_word_list edits `words` in place and returns that same list.
words = remove_from_word_list(words, hashtags)
words

['The',
 'Houston',
 'Rockets',
 'have',
 'a',
 '53%',
 'chance',
 'of',
 'winning',
 'the',
 'title,',
 'while',
 'everybody',
 'else',
 'combines',
 'for',
 'a',
 '47%',
 'chance',
 'of',
 'taking',
 'the',
 'championship.',
 'https://t.co/wdZr2zn2Mz']

In [11]:
# Find any urls in the list of words.
# Located better regex for url pattern matching - http://www.noah.org/wiki/RegEx_Python
# Compiled once and written as a raw string, so that `\(` and `\)` reach the
# regex engine unchanged instead of being invalid string escape sequences.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def url_locator(word_list):
    """Collect the first http(s) URL found in each word of ``word_list``.

    Args:
        word_list: iterable of string tokens.

    Returns:
        A list with one entry per word that contains a URL, in input order.
        Each entry is the matched URL text itself, which can be shorter than
        the word it was found in (e.g. 'via:https://t.co/x' yields
        'https://t.co/x').
    """
    urls = []
    for word in word_list:
        # The pattern has no capturing groups, so group(0) of the first match
        # is the same value re.findall(...)[0] would give.
        match = _URL_PATTERN.search(word)
        if match:
            urls.append(match.group(0))
    return urls
# Extract the URLs present in the current word list and display them.
urls = url_locator(words)
urls

['https://t.co/wdZr2zn2Mz']

In [12]:
# Drop the URL tokens found above from the word list (edited in place).
words = remove_from_word_list(words, urls)
words

['The',
 'Houston',
 'Rockets',
 'have',
 'a',
 '53%',
 'chance',
 'of',
 'winning',
 'the',
 'title,',
 'while',
 'everybody',
 'else',
 'combines',
 'for',
 'a',
 '47%',
 'chance',
 'of',
 'taking',
 'the',
 'championship.']

In [13]:
# Detect @mentions so they can be removed from the word list
def detect_user_handles(word_list):
    """Return the tokens of ``word_list`` that begin with '@'.

    Args:
        word_list: iterable of string tokens.

    Returns:
        A new list holding every token that starts with '@', in input order.
        Tokens are returned unchanged, including any trailing punctuation.
    """
    return [token for token in word_list if token.startswith('@')]

# Collect the @-prefixed tokens of the current word list and display them.
users = detect_user_handles(words)
users

[]

In [14]:
# Drop the @mention tokens found above from the word list (edited in place).
words = remove_from_word_list(words, users)
words

['The',
 'Houston',
 'Rockets',
 'have',
 'a',
 '53%',
 'chance',
 'of',
 'winning',
 'the',
 'title,',
 'while',
 'everybody',
 'else',
 'combines',
 'for',
 'a',
 '47%',
 'chance',
 'of',
 'taking',
 'the',
 'championship.']

In [15]:
# def remove_extra_characters(word_list):
def strip_nonalphanumeric_characters(word_list):
    """Strip every character other than ASCII letters and digits from each word.

    Args:
        word_list: iterable of string tokens.

    Returns:
        A new list of the cleaned tokens in input order. Tokens that end up
        empty after stripping (e.g. '...') are left out.
    """
    stripped = (re.sub('[^a-zA-Z0-9]', '', token) for token in word_list)
    return [token for token in stripped if token]
# Build the punctuation-free word list; `words` itself is left unchanged
# because the function returns a new list.
clean_words = strip_nonalphanumeric_characters(words)
clean_words

['The',
 'Houston',
 'Rockets',
 'have',
 'a',
 '53',
 'chance',
 'of',
 'winning',
 'the',
 'title',
 'while',
 'everybody',
 'else',
 'combines',
 'for',
 'a',
 '47',
 'chance',
 'of',
 'taking',
 'the',
 'championship']

In [16]:
# Tokens from `words` that do not appear verbatim in `clean_words`, i.e. the
# ones altered or dropped by the stripping step. Built from sets, so duplicates
# collapse and the order of the resulting list is not guaranteed.
list(set(words) - set(clean_words))

['championship.', 'title,', '53%', '47%']

In [17]:
# Lower-case every token, then order by length, with alphabetical order
# breaking ties between tokens of equal length.
clean_words = sorted(
    (word.lower() for word in clean_words),
    key=lambda word: (len(word), word),
)
clean_words

['a',
 'a',
 '47',
 '53',
 'of',
 'of',
 'for',
 'the',
 'the',
 'the',
 'else',
 'have',
 'title',
 'while',
 'chance',
 'chance',
 'taking',
 'houston',
 'rockets',
 'winning',
 'combines',
 'everybody',
 'championship']

In [18]:
# Put the normalised tokens into a one-column frame and tally how often each
# distinct token occurs.
word_column = 'Words'
df = pandas.DataFrame(data=clean_words, columns=[word_column])
df[word_column].value_counts()

the             3
a               2
chance          2
of              2
combines        1
taking          1
for             1
rockets         1
title           1
while           1
houston         1
53              1
everybody       1
winning         1
championship    1
47              1
have            1
else            1
Name: Words, dtype: int64

In [19]:
# Closing marker in the debug log, paired with the 'Start of program' record.
logging.debug('End of program')

2018-05-13 11:15:51,682 - DEBUG - End of program
