# Analysis of tweet text

In [1]:
import pymongo
import re
import pprint
import logging
from lark import Lark
import pandas
import time

In [2]:
# Logging functionality.
# Configure the root logger to emit every record from DEBUG upward, formatted
# as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Marker record so the log shows when the notebook run began.
logging.debug('Start of program')

2018-05-18 20:04:06,314 - DEBUG - Start of program


In [3]:
# Get the tweets stored in MongoDB.
# MongoClient() is called without arguments, so the driver's default
# connection settings are used.
client = pymongo.MongoClient()
# Cursor over every document in the 'tweets' collection of the 'twitter'
# database (no filter and no explicit sort).
tweets = client['twitter']['tweets'].find()
# Use the document at position 180 of the cursor as the sample tweet.
# NOTE(review): no sort is applied above, so despite the name this is not
# necessarily the most recent tweet -- confirm the intended selection.
latest_tweet = tweets[180]

In [4]:
def remove_from_word_list(original_list, list_subset):
    """Remove one occurrence of each item in ``list_subset`` from ``original_list``.

    For every item in ``list_subset`` the first matching element of
    ``original_list`` is deleted. Items that are not present are skipped
    instead of raising ``ValueError``, so the call is safe when a token does
    not appear verbatim in the list (e.g. it carries attached punctuation) or
    when the calling cell is executed more than once.

    Args:
        original_list: List to remove items from. It is modified in place.
        list_subset: Iterable of items to remove.

    Returns:
        The same list object that was passed as ``original_list``.
    """
    for item in list_subset:
        try:
            # list.remove deletes the first element equal to `item`.
            original_list.remove(item)
        except ValueError:
            # Item is not in the list: nothing to remove.
            continue
    return original_list

In [5]:
def print_important_content(tweet):
    """Print a human-readable summary of a tweet document.

    Printed, in order: text length / author name / creation time, the quoted
    full text, the expanded URLs, the hashtags, the symbols, and finally a
    pretty-printed dump of the whole ``entities`` mapping.

    Args:
        tweet: Mapping with the keys read below: 'created_at' (a string in the
            form 'Mon Apr 30 15:16:58 +0000 2018'), 'full_text',
            'user' -> 'name', and 'entities' -> 'urls' / 'hashtags' /
            'symbols'.

    Returns:
        None. All output goes to stdout.
    """
    ts = time.strftime('%H:%M:%S %Y-%m-%d', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
    print('{} characters by {} at {}\n'.format(len(tweet['full_text']), tweet['user']['name'], ts))
    print('"{}"\n'.format(tweet['full_text']))

    entities = tweet['entities']

    # Read 'expanded_url' from each entry (not always from the first one), so
    # tweets with several links list every distinct URL.
    expanded_urls = [url['expanded_url'] for url in entities['urls']]
    print('{}\n'.format(expanded_urls if expanded_urls else 'No URL'))

    # Show every hashtag, each with its '#' prefix; the placeholder is printed
    # without a '#' so it cannot be mistaken for a real tag.
    hashtag_texts = ['#' + tag['text'] for tag in entities['hashtags']]
    print('{}\n'.format(' '.join(hashtag_texts) if hashtag_texts else 'No Hashtags'))

    symbols = entities['symbols'] if entities['symbols'] else 'No Symbols'
    print('{}\n'.format(symbols))
    pprint.pprint(entities)
    
# Print the summary of the sample tweet selected earlier.
print_important_content(latest_tweet)

106 characters by Fast Company at 15:16:58 2018-04-30

"Here’s how Facebook can regain trust at its #F8 conference https://t.co/qz9HfUfR9p https://t.co/xpUhNTU1tw"

['http://f-st.co/ouy08Gq']

#F8

No Symbols

{'hashtags': [{'indices': [44, 47], 'text': 'F8'}],
 'media': [{'display_url': 'pic.twitter.com/xpUhNTU1tw',
            'expanded_url': 'https://twitter.com/FastCompany/status/990973291471998977/photo/1',
            'id': 990973288045215745,
            'id_str': '990973288045215745',
            'indices': [83, 106],
            'media_url': 'http://pbs.twimg.com/media/DcCk9H_WAAEzKHv.jpg',
            'media_url_https': 'https://pbs.twimg.com/media/DcCk9H_WAAEzKHv.jpg',
            'sizes': {'large': {'h': 790, 'resize': 'fit', 'w': 1412},
                      'medium': {'h': 671, 'resize': 'fit', 'w': 1200},
                      'small': {'h': 380, 'resize': 'fit', 'w': 680},
                      'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
            'type': 'photo

In [6]:
def hashtag(tweet):
    """Return the hashtags of a tweet as '#'-prefixed strings.

    Args:
        tweet: Mapping with a 'hashtags' key holding a sequence of mappings,
            each with a 'text' entry. Note that the caller in this notebook
            passes the tweet's ``entities`` sub-document, not the whole tweet.

    Returns:
        A new list with '#' prepended to every hashtag text, in input order.
    """
    return ['#' + entry['text'] for entry in tweet['hashtags']]

# Collect the '#'-prefixed hashtags from the sample tweet's entities and
# display them.
hashtags = hashtag(latest_tweet['entities'])
hashtags

['#F8']

In [7]:
# Display the whole tweet document to inspect its structure.
latest_tweet

{'_id': ObjectId('5ae766611c398f0d01baa788'),
 'created_at': 'Mon Apr 30 15:16:58 +0000 2018',
 'id': 990973291471998977,
 'id_str': '990973291471998977',
 'full_text': 'Here’s how Facebook can regain trust at its #F8 conference https://t.co/qz9HfUfR9p https://t.co/xpUhNTU1tw',
 'truncated': False,
 'display_text_range': [0, 82],
 'entities': {'hashtags': [{'text': 'F8', 'indices': [44, 47]}],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/qz9HfUfR9p',
    'expanded_url': 'http://f-st.co/ouy08Gq',
    'display_url': 'f-st.co/ouy08Gq',
    'indices': [59, 82]}],
  'media': [{'id': 990973288045215745,
    'id_str': '990973288045215745',
    'indices': [83, 106],
    'media_url': 'http://pbs.twimg.com/media/DcCk9H_WAAEzKHv.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DcCk9H_WAAEzKHv.jpg',
    'url': 'https://t.co/xpUhNTU1tw',
    'display_url': 'pic.twitter.com/xpUhNTU1tw',
    'expanded_url': 'https://twitter.com/FastCompany/status/99097329147199897

In [8]:
# Break the text content into whitespace-separated tokens.
# split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and never yields empty strings, whereas split(' ') leaves
# newlines attached to tokens and produces '' for repeated spaces.
words = latest_tweet['full_text'].split()

In [9]:
# Display the list of tokens obtained from the tweet text.
words

['Here’s',
 'how',
 'Facebook',
 'can',
 'regain',
 'trust',
 'at',
 'its',
 '#F8',
 'conference',
 'https://t.co/qz9HfUfR9p',
 'https://t.co/xpUhNTU1tw']

In [10]:
# Drop the hashtag tokens from the word list (the result is assigned back to
# `words`), then display what is left.
words = remove_from_word_list(words, hashtags)
words

['Here’s',
 'how',
 'Facebook',
 'can',
 'regain',
 'trust',
 'at',
 'its',
 'conference',
 'https://t.co/qz9HfUfR9p',
 'https://t.co/xpUhNTU1tw']

In [11]:
# Find any urls in the list of words.
# Located better regex for url pattern matching - http://www.noah.org/wiki/RegEx_Python
def url_locator(word_list):
    """Return the first URL found in each word of ``word_list``.

    Args:
        word_list: Iterable of strings to scan.

    Returns:
        A list with, for every word that contains an http/https URL, the
        first matching substring. Words without a match contribute nothing.
    """
    # Raw string literal: \( and \) are regex escapes, not Python string
    # escapes, and a non-raw literal triggers an invalid-escape warning.
    # Compiled once here instead of once per word.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    urls = []
    for word in word_list:
        match = url_pattern.search(word)
        if match:
            # The pattern has no capturing groups, so group(0) is the whole URL.
            urls.append(match.group(0))
    return urls
# Extract the URLs from the current word list and display them.
urls = url_locator(words)
urls

['https://t.co/qz9HfUfR9p', 'https://t.co/xpUhNTU1tw']

In [12]:
# Drop the URL tokens from the word list (the result is assigned back to
# `words`), then display what is left.
words = remove_from_word_list(words, urls)
words

['Here’s',
 'how',
 'Facebook',
 'can',
 'regain',
 'trust',
 'at',
 'its',
 'conference']

In [13]:
# remove @mentions
def detect_user_handles(word_list):
    """Return the words that look like @mentions.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list with every word that starts with '@', in input order.
    """
    return [token for token in word_list if token.startswith('@')]

# Collect the tokens that start with '@' and display them.
users = detect_user_handles(words)
users

[]

In [14]:
# Drop the @mention tokens from the word list (the result is assigned back to
# `words`), then display what is left.
words = remove_from_word_list(words, users)
words

['Here’s',
 'how',
 'Facebook',
 'can',
 'regain',
 'trust',
 'at',
 'its',
 'conference']

In [15]:
# def remove_extra_characters(word_list):
def strip_nonalphanumeric_characters(word_list):
    """Split words on '-' and strip every non-alphanumeric character.

    Each word is first split on hyphens; every fragment then has all
    characters that are not letters or digits removed. Fragments that end up
    empty are dropped. Letters and digits are recognised Unicode-wide, so
    accented and non-Latin words are kept intact rather than truncated.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list of cleaned, non-empty fragments in input order.
    """
    # \W matches anything that is not a Unicode word character; '_' counts as
    # a word character, so it is listed explicitly to be removed as well.
    non_alphanumeric = re.compile(r'[\W_]+')
    clean_list = []
    for word in word_list:
        for fragment in word.split('-'):
            cleaned = non_alphanumeric.sub('', fragment)
            if cleaned:
                clean_list.append(cleaned)
    return clean_list
# Clean the remaining tokens into a new list (`words` itself is left as is)
# and display the result.
clean_words = strip_nonalphanumeric_characters(words)
clean_words

['Heres',
 'how',
 'Facebook',
 'can',
 'regain',
 'trust',
 'at',
 'its',
 'conference']

In [16]:
# Show the original tokens that do not appear unchanged in the cleaned list,
# i.e. the tokens that cleaning altered or dropped (set difference, so order
# and duplicates are not preserved).
list(set(words) - set(clean_words))

['Here’s']

In [17]:
# Lower-case every word, then order by length with ties broken alphabetically.
clean_words = sorted(
    (word.lower() for word in clean_words),
    key=lambda word: (len(word), word),
)
clean_words

['at',
 'can',
 'how',
 'its',
 'heres',
 'trust',
 'regain',
 'facebook',
 'conference']

In [18]:
# Put the cleaned words into a single-column DataFrame and count how often
# each distinct word occurs.
df = pandas.DataFrame(clean_words, columns=['Words'])
df['Words'].value_counts()

its           1
can           1
regain        1
heres         1
trust         1
at            1
conference    1
how           1
facebook      1
Name: Words, dtype: int64

In [19]:
# Marker record so the log shows when the notebook run finished.
logging.debug('End of program')

2018-05-18 20:04:06,449 - DEBUG - End of program
