# Analysis of tweet text

In [1]:
import pymongo
import re
import pprint
import logging
from lark import Lark
import pandas
import time

In [2]:
# Logging functionality
# Configure the root logger at DEBUG level with a "timestamp - level - message" line format,
# then emit a marker record so the start of the run is visible in the output.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('Start of program')

2018-05-27 16:40:48,131 - DEBUG - Start of program


In [3]:
# Get the tweets stored in MongoDB
# MongoClient() is called without arguments, so the driver's default connection settings are used.
client = pymongo.MongoClient()
# find() with no filter yields a cursor over every document in the 'tweets' collection of the 'twitter' database.
tweets = client['twitter']['tweets'].find()
# NOTE(review): 200 is a hard-coded position in an unsorted cursor, so which document this selects
# presumably depends on storage order — confirm that this really is the "latest" tweet.
latest_tweet = tweets[200]

In [4]:
def remove_from_word_list(original_list, list_subset):
    """Remove one occurrence of each item in ``list_subset`` from ``original_list``.

    The list is modified in place and the same list object is returned, so
    callers may either use the return value or keep using their own reference.

    Args:
        original_list: List to remove items from (mutated).
        list_subset: Iterable of items to remove. For each item, only the first
            matching element of ``original_list`` is removed.

    Returns:
        ``original_list`` itself, after the removals.

    Items that are not present are skipped (and reported at DEBUG level)
    instead of raising ``ValueError``. This keeps the cell re-runnable and
    tolerates candidates that do not match a token exactly.
    """
    for item in list_subset:
        try:
            # list.remove deletes the first equal element, like index() + del.
            original_list.remove(item)
        except ValueError:
            logging.debug('%r not found in word list; skipping', item)
    return original_list

In [5]:
def print_important_content(tweet):
    """Print a readable summary of a single tweet document.

    Prints, in order: a header line (text length, author name, timestamp), the
    quoted full text, the expanded URLs, the hashtags, the symbols, the
    @mentions, and finally a pretty-printed dump of the whole ``entities``
    sub-document.

    Args:
        tweet: Mapping with the keys read below: ``created_at``, ``full_text``,
            ``user['name']`` and ``entities`` (which must hold ``urls``,
            ``hashtags``, ``symbols`` and ``user_mentions``).

    Raises:
        KeyError: If any of the keys above is missing.
        ValueError: If ``created_at`` does not match the expected format. The
            pattern contains a literal ``+0000`` offset.
    """
    created = time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    ts = time.strftime('%H:%M:%S %Y-%m-%d', created)
    print('{} characters by {} at {}\n'.format(len(tweet['full_text']), tweet['user']['name'], ts))
    print('"{}"\n'.format(tweet['full_text']))

    entities = tweet['entities']

    if not entities['urls']:
        expanded_url = 'No URL'
    else:
        # Take the expanded URL of every entry, not the first entry repeated.
        expanded_url = [url['expanded_url'] for url in entities['urls']]
    print(f'{expanded_url}\n')

    if not entities['hashtags']:
        hashtag_list = 'No Hashtags'
    else:
        # Show every hashtag; with a single hashtag the output is still '#tag'.
        hashtag_list = ' '.join('#' + tag['text'] for tag in entities['hashtags'])
    print(f'{hashtag_list}\n')

    if not entities['symbols']:
        symbols = 'No Symbols'
    else:
        symbols = entities['symbols']
    print(f'{symbols}\n')

    if not entities['user_mentions']:
        mentions = 'No @Mentions'
    else:
        mentions = ['@' + user['screen_name'] for user in entities['user_mentions']]
    print(f'{mentions}\n')
    pprint.pprint(entities)
    
print_important_content(latest_tweet)

140 characters by Kai Ryssdal at 14:39:20 2018-04-30

"RT @kaylatausche: When the @WhiteHouse initially announced the exemptions for allies, it did so at 10:53pm before a midnight deadline. Woul…"

No URL

No Hashtags

No Symbols

['@kaylatausche', '@WhiteHouse']

{'hashtags': [],
 'symbols': [],
 'urls': [],
 'user_mentions': [{'id': 30043148,
                    'id_str': '30043148',
                    'indices': [3, 16],
                    'name': 'Kayla Tausche',
                    'screen_name': 'kaylatausche'},
                   {'id': 822215673812119553,
                    'id_str': '822215673812119553',
                    'indices': [27, 38],
                    'name': 'The White House',
                    'screen_name': 'WhiteHouse'}]}


In [6]:
def hashtag(tweet):
    """Return the '#'-prefixed text of every hashtag entry.

    Args:
        tweet: Mapping that is indexed directly with ``'hashtags'``; each entry
            under that key must provide a ``'text'`` value.

    Returns:
        A new list of strings such as ``'#python'``, in the original order.
        Empty when there are no hashtag entries.
    """
    return ['#' + entry['text'] for entry in tweet['hashtags']]

# Build the '#'-prefixed hashtag list from the entities sub-document of the selected tweet.
hashtags = hashtag(latest_tweet['entities'])
hashtags

[]

In [7]:
latest_tweet

{'_id': ObjectId('5ae766611c398f0d01baa79c'),
 'created_at': 'Mon Apr 30 14:39:20 +0000 2018',
 'id': 990963817667416064,
 'id_str': '990963817667416064',
 'full_text': 'RT @kaylatausche: When the @WhiteHouse initially announced the exemptions for allies, it did so at 10:53pm before a midnight deadline. Woul…',
 'truncated': False,
 'display_text_range': [0, 140],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'kaylatausche',
    'name': 'Kayla Tausche',
    'id': 30043148,
    'id_str': '30043148',
    'indices': [3, 16]},
   {'screen_name': 'WhiteHouse',
    'name': 'The White House',
    'id': 822215673812119553,
    'id_str': '822215673812119553',
    'indices': [27, 38]}],
  'urls': []},
 'source': '<a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user':

In [8]:
# Break the text content apart for better formatting.
# split() without an argument splits on any run of whitespace, so newlines and tabs inside a
# tweet also separate words and repeated spaces do not produce empty strings.
words = latest_tweet['full_text'].split()

In [9]:
# Display the broken down list of each 'word'.
words

['RT',
 '@kaylatausche:',
 'When',
 'the',
 '@WhiteHouse',
 'initially',
 'announced',
 'the',
 'exemptions',
 'for',
 'allies,',
 'it',
 'did',
 'so',
 'at',
 '10:53pm',
 'before',
 'a',
 'midnight',
 'deadline.',
 'Woul…']

In [10]:
# Remove the collected hashtag tokens from the word list (the helper edits the list it is given and returns it).
words = remove_from_word_list(words, hashtags)
words

['RT',
 '@kaylatausche:',
 'When',
 'the',
 '@WhiteHouse',
 'initially',
 'announced',
 'the',
 'exemptions',
 'for',
 'allies,',
 'it',
 'did',
 'so',
 'at',
 '10:53pm',
 'before',
 'a',
 'midnight',
 'deadline.',
 'Woul…']

In [11]:
# Find any urls in the list of words.
# Located better regex for url pattern matching - http://www.noah.org/wiki/RegEx_Python
def url_locator(word_list):
    """Return the first http/https URL match found in each word.

    Args:
        word_list: Iterable of strings to scan.

    Returns:
        A list with one entry per word that contains a match: the matched URL
        text, which may be shorter than the word itself. Words without a match
        contribute nothing.
    """
    # Raw string: '\(' and '\)' are invalid escape sequences in a normal string
    # literal and trigger a warning on current Python versions.
    # Inside '[$-_@.&+]', '$-_' is a character range (0x24-0x5F). That range is
    # what lets '/', ':', '?' and '=' match, so it must not be "simplified".
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    urls = []
    for word in word_list:
        # The pattern has no capturing groups, so the first search hit equals findall()[0].
        match = pattern.search(word)
        if match:
            urls.append(match.group(0))
    return urls
# Collect the URL matches found in the current word list.
urls = url_locator(words)
urls

[]

In [12]:
# Remove the located URLs from the word list (the helper edits the list it is given and returns it).
words = remove_from_word_list(words, urls)
words

['RT',
 '@kaylatausche:',
 'When',
 'the',
 '@WhiteHouse',
 'initially',
 'announced',
 'the',
 'exemptions',
 'for',
 'allies,',
 'it',
 'did',
 'so',
 'at',
 '10:53pm',
 'before',
 'a',
 'midnight',
 'deadline.',
 'Woul…']

In [13]:
# remove @mentions
def detect_user_handles(word_list):
    """Return the words that begin with '@', in their original order.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list holding each word whose first character is '@'. Words are
        returned unchanged, so trailing punctuation (e.g. ':') is kept.
    """
    return [token for token in word_list if token.startswith('@')]

# Collect every token that begins with '@' from the current word list.
users = detect_user_handles(words)
users

['@kaylatausche:', '@WhiteHouse']

In [14]:
# Remove the detected @handles from the word list (the helper edits the list it is given and returns it).
words = remove_from_word_list(words, users)
words

['RT',
 'When',
 'the',
 'initially',
 'announced',
 'the',
 'exemptions',
 'for',
 'allies,',
 'it',
 'did',
 'so',
 'at',
 '10:53pm',
 'before',
 'a',
 'midnight',
 'deadline.',
 'Woul…']

In [15]:
def strip_nonalphanumeric_characters(word_list, ascii_only=True):
    """Split words on '-' and strip non-alphanumeric characters from each piece.

    Args:
        word_list: Iterable of strings.
        ascii_only: When True (the default, matching the original behaviour)
            only ``a-z``, ``A-Z`` and ``0-9`` are kept, so accented or
            non-Latin letters are removed as well. When False, Unicode letters
            and digits are kept and only non-word characters and underscores
            are removed.

    Returns:
        A new list of the cleaned pieces, in order. Pieces that become empty
        after stripping are dropped.
    """
    # r'[\W_]' on a str pattern is Unicode-aware; '_' is listed explicitly
    # because \w would otherwise keep it.
    pattern = re.compile('[^a-zA-Z0-9]' if ascii_only else r'[\W_]')
    clean_list = []
    for word in word_list:
        for fragment in word.split('-'):
            stripped = pattern.sub('', fragment)
            if stripped:
                clean_list.append(stripped)
    return clean_list
# Clean the remaining words; the result is stored under a new name so `words` stays available for comparison.
clean_words = strip_nonalphanumeric_characters(words)
clean_words

['RT',
 'When',
 'the',
 'initially',
 'announced',
 'the',
 'exemptions',
 'for',
 'allies',
 'it',
 'did',
 'so',
 'at',
 '1053pm',
 'before',
 'a',
 'midnight',
 'deadline',
 'Woul']

In [16]:
# Tokens present before cleaning but absent afterwards, i.e. the words the cleaning step altered or dropped.
# The list comes from a set difference, so its order is not meaningful.
list(set(words) - set(clean_words))

['allies,', 'Woul…', '10:53pm', 'deadline.']

In [17]:
# Lower-case every cleaned word, then order by length with alphabetical order breaking ties.
clean_words = sorted((word.lower() for word in clean_words), key=lambda word: (len(word), word))
clean_words

['a',
 'at',
 'it',
 'rt',
 'so',
 'did',
 'for',
 'the',
 'the',
 'when',
 'woul',
 '1053pm',
 'allies',
 'before',
 'deadline',
 'midnight',
 'announced',
 'initially',
 'exemptions']

In [18]:
# Put the cleaned words into a one-column DataFrame and count how often each distinct word occurs.
df = pandas.DataFrame(clean_words, columns=['Words'])
df['Words'].value_counts()

the           2
did           1
exemptions    1
when          1
it            1
announced     1
deadline      1
at            1
1053pm        1
midnight      1
allies        1
initially     1
rt            1
before        1
woul          1
for           1
so            1
a             1
Name: Words, dtype: int64

In [19]:
logging.debug('End of program')

2018-05-27 16:40:48,302 - DEBUG - End of program
