## Basic Tweet Cleaning for NLP

In [35]:
import re

import pandas as pd
import numpy as np
import spacy 

In [36]:
# Load the tweet export, indexed by its `created_at` column, and rename the
# `text` column to `raw_text`. The rename is chained instead of using
# `inplace=True`, so `raw` is always rebuilt from the file in one expression.
raw = (
    pd.read_csv('../data/tweets/tweets_24_10_2019.csv', index_col='created_at')
    .rename(columns={'text': 'raw_text'})
)

In [37]:
# Preview the first rows of the loaded frame.
raw.head()

Unnamed: 0_level_0,source,raw_text,retweet_count,favorite_count,is_retweet,id_str
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10-24-2019 03:59:32,Twitter for iPhone,RT @bopinion: Of Wisconsin’s 72 counties 23 sw...,4802,0,True,1187217033164967939
10-24-2019 03:54:48,Twitter for iPhone,RT @realDonaldTrump: https://t.co/ESnxdzR0wM h...,6264,0,True,1187215842435575809
10-24-2019 02:01:35,Twitter for iPhone,https://t.co/ESnxdzR0wM https://t.co/pcXjrYuQ9e,6296,20463,False,1187187349622575104
10-23-2019 21:19:33,Twitter for iPhone,PROMISES MADE PROMISES KEPT! #SHALEINSIGHT2019...,16184,61381,False,1187116372997922821
10-23-2019 21:12:06,Twitter for iPhone,It was wonderful to be back in Pittsburgh Penn...,13979,49186,False,1187114497384898560


In [38]:
# create three new columns: names mentioned (@...), hashtags used (#...), links ('http...')

def find_hashtags(tweet):
    """Return every hashtag in ``tweet``, in order of appearance.

    A hashtag is a ``#`` followed by at least one word character, so a bare
    ``#`` (as in "# 1") is not reported as an empty tag.

    Parameters
    ----------
    tweet : str
        Tweet text to scan.

    Returns
    -------
    list of str
        The hashtags, each including its leading ``#``; empty if none match.
    """
    return re.findall(r'#\w+', tweet)

def find_mentions(tweet):
    """Return every @-mention in ``tweet``, in order of appearance.

    A mention is an ``@`` followed by at least one word character, so a bare
    ``@`` (as in "see you @ 5pm") is not reported as an empty mention.

    Parameters
    ----------
    tweet : str
        Tweet text to scan.

    Returns
    -------
    list of str
        The mentions, each including its leading ``@``; empty if none match.
    """
    return re.findall(r'@\w+', tweet)

def find_links(tweet):
    """Return every link-like token in ``tweet``, in order of appearance.

    Anything that starts with ``http`` and runs up to the next whitespace is
    treated as a link.

    Parameters
    ----------
    tweet : str
        Tweet text to scan.

    Returns
    -------
    list of str
        The matched tokens; empty if none match.
    """
    return re.findall(r'http\S*', tweet)
    


In [39]:
# Show the fifth tweet. `.iloc` selects by position explicitly: the frame is
# indexed by `created_at` strings, so a bare integer key only works through
# pandas' deprecated positional fallback.
print(raw.raw_text.iloc[4])

It was wonderful to be back in Pittsburgh Pennsylvania with the incredible Patriots who fuel our factories light up our homes power our industries and fill our hearts with true American Pride! #SHALEINSIGHT2019 https://t.co/hWmN7zNud3


In [40]:
# Try the three extractors on the fifth tweet. `.iloc` selects by position
# explicitly, since the index holds `created_at` strings rather than integers.
print(find_hashtags(raw.raw_text.iloc[4]))
print(find_mentions(raw.raw_text.iloc[4]))
print(find_links(raw.raw_text.iloc[4]))

['#SHALEINSIGHT2019']
[]
['https://t.co/hWmN7zNud3']


In [41]:
# Run each extractor over the raw tweet text and keep its list of matches
# in a column of its own, then preview the result.
raw['hashtags'] = raw['raw_text'].apply(find_hashtags)
raw['mentions'] = raw['raw_text'].apply(find_mentions)
raw['links'] = raw['raw_text'].apply(find_links)
raw.head()

Unnamed: 0_level_0,source,raw_text,retweet_count,favorite_count,is_retweet,id_str,hashtags,mentions,links
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10-24-2019 03:59:32,Twitter for iPhone,RT @bopinion: Of Wisconsin’s 72 counties 23 sw...,4802,0,True,1187217033164967939,[],[@bopinion],"[https://t.co/4E0xFLZwlF, https://t.co/f…]"
10-24-2019 03:54:48,Twitter for iPhone,RT @realDonaldTrump: https://t.co/ESnxdzR0wM h...,6264,0,True,1187215842435575809,[],[@realDonaldTrump],"[https://t.co/ESnxdzR0wM, https://t.co/pcXjrYu..."
10-24-2019 02:01:35,Twitter for iPhone,https://t.co/ESnxdzR0wM https://t.co/pcXjrYuQ9e,6296,20463,False,1187187349622575104,[],[],"[https://t.co/ESnxdzR0wM, https://t.co/pcXjrYu..."
10-23-2019 21:19:33,Twitter for iPhone,PROMISES MADE PROMISES KEPT! #SHALEINSIGHT2019...,16184,61381,False,1187116372997922821,[#SHALEINSIGHT2019],[],[https://t.co/kCkw3K8k5o]
10-23-2019 21:12:06,Twitter for iPhone,It was wonderful to be back in Pittsburgh Penn...,13979,49186,False,1187114497384898560,[#SHALEINSIGHT2019],[],[https://t.co/hWmN7zNud3]


In [42]:
# now let's clean the actual tweet so it's just the message

# removes RT header ('RT @IvankaTrump: '), links, and hashtags
# leave instances of '@HillaryClinton?' or '@MikePence!' since these
# are directed *at* people. Will tag as entities later through nlp
    
def clean_tweet(df):
    """Strip retweet markers, hashtags, links and ``@name:`` headers from a tweet.

    Parameters
    ----------
    df : row-like
        One row of the tweets frame (as passed by ``DataFrame.apply(axis=1)``).
        Must expose ``raw_text`` plus the ``hashtags``, ``mentions`` and
        ``links`` lists extracted from it.

    Returns
    -------
    str or float
        The tweet text with the unwanted pieces removed. Surrounding
        whitespace is left untouched. ``np.nan`` is returned when
        ``raw_text`` is not a string.
    """
    raw_tweet = df.raw_text
    # Explicit guard instead of a bare `except:` so that real errors surface.
    if not isinstance(raw_tweet, str):
        return np.nan

    # Mentions are only removed when followed by ':' (the retweet header);
    # '@name?' / '@name!' stay in the text.
    literals = (
        ['...']
        + list(df.hashtags)
        + [mention + ':' for mention in df.mentions]
        + list(df.links)
    )
    # Longest first, so a short tag such as '#MAGA' cannot chop the front off
    # a longer one such as '#MAGA2020'. The sort is stable, so equal-length
    # entries keep their original order.
    for stop in sorted(literals, key=len, reverse=True):
        raw_tweet = raw_tweet.replace(stop, '')

    # Remove 'RT' only as a standalone token; a plain substring replace would
    # also mangle words such as 'SUPPORT' or 'EFFORT'.
    return re.sub(r'\bRT\b', '', raw_tweet)
        
    

In [43]:
# Call clean_tweet on every row (axis=1) and store the result as `clean_text`.
raw['clean_text'] = raw.apply(clean_tweet, axis=1)

In [44]:
# Preview the frame again, now including the `clean_text` column.
raw.head()

Unnamed: 0_level_0,source,raw_text,retweet_count,favorite_count,is_retweet,id_str,hashtags,mentions,links,clean_text
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10-24-2019 03:59:32,Twitter for iPhone,RT @bopinion: Of Wisconsin’s 72 counties 23 sw...,4802,0,True,1187217033164967939,[],[@bopinion],"[https://t.co/4E0xFLZwlF, https://t.co/f…]",Of Wisconsin’s 72 counties 23 switched from ...
10-24-2019 03:54:48,Twitter for iPhone,RT @realDonaldTrump: https://t.co/ESnxdzR0wM h...,6264,0,True,1187215842435575809,[],[@realDonaldTrump],"[https://t.co/ESnxdzR0wM, https://t.co/pcXjrYu...",
10-24-2019 02:01:35,Twitter for iPhone,https://t.co/ESnxdzR0wM https://t.co/pcXjrYuQ9e,6296,20463,False,1187187349622575104,[],[],"[https://t.co/ESnxdzR0wM, https://t.co/pcXjrYu...",
10-23-2019 21:19:33,Twitter for iPhone,PROMISES MADE PROMISES KEPT! #SHALEINSIGHT2019...,16184,61381,False,1187116372997922821,[#SHALEINSIGHT2019],[],[https://t.co/kCkw3K8k5o],PROMISES MADE PROMISES KEPT!
10-23-2019 21:12:06,Twitter for iPhone,It was wonderful to be back in Pittsburgh Penn...,13979,49186,False,1187114497384898560,[#SHALEINSIGHT2019],[],[https://t.co/hWmN7zNud3],It was wonderful to be back in Pittsburgh Penn...


In [45]:
# Compare the fifth tweet before and after cleaning. `.iloc` selects by
# position explicitly, since the index holds `created_at` strings.
print(raw.raw_text.iloc[4], '\n')
print(raw.clean_text.iloc[4])

It was wonderful to be back in Pittsburgh Pennsylvania with the incredible Patriots who fuel our factories light up our homes power our industries and fill our hearts with true American Pride! #SHALEINSIGHT2019 https://t.co/hWmN7zNud3 

It was wonderful to be back in Pittsburgh Pennsylvania with the incredible Patriots who fuel our factories light up our homes power our industries and fill our hearts with true American Pride!  


In [46]:
# Load spaCy's `en_core_web_sm` pipeline once and keep it as `nlp`.
nlp = spacy.load('en_core_web_sm')

In [47]:
# run the spaCy pipeline on every cleaned tweet and keep the resulting
# 'doc' object alongside the text
raw['doc'] = raw['clean_text'].apply(nlp)

In [48]:
# store each cleaned tweet's `Doc.vector` in a `vector` column
# NOTE(review): the pipeline loaded here is the small `en_core_web_sm` model;
# confirm against the spaCy model docs whether `.vector` is backed by real
# word vectors for this model before using these for similarity.
raw['vector'] = raw.doc.apply(lambda x: x.vector)

def find_ents(doc):
    """Return the text of every entity in ``doc.ents``, in order.

    Parameters
    ----------
    doc : object exposing ``ents``
        Each item in ``doc.ents`` must have a ``text`` attribute.

    Returns
    -------
    list
        One ``text`` value per entity; empty if ``doc.ents`` is empty.
    """
    entity_texts = []
    for entity in doc.ents:
        entity_texts.append(entity.text)
    return entity_texts
# Collect the entity texts of every doc into an `entities` column.
raw['entities'] = raw.doc.apply(find_ents)

In [49]:
# Preview only the three spaCy-derived columns.
raw[['doc','vector','entities']].head()

Unnamed: 0_level_0,doc,vector,entities
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10-24-2019 03:59:32,"( , Of, Wisconsin, ’s, 72, counties, 23, swit...","[-0.3747789, 1.725721, -0.59561896, -1.2791008...","[Wisconsin, 72, 23, Obama, 2012, Trump, 2016]"
10-24-2019 03:54:48,( ),"[-2.7594738, 1.6421937, -0.110908926, 1.366160...",[]
10-24-2019 02:01:35,( ),"[-1.1562011, 1.0382627, -1.3893983, -0.6023454...",[]
10-23-2019 21:19:33,"(PROMISES, MADE, PROMISES, KEPT, !, )","[-1.8844684, 0.60330266, -2.3846438, -1.404496...",[]
10-23-2019 21:12:06,"(It, was, wonderful, to, be, back, in, Pittsbu...","[-0.575121, -0.23645799, -1.6973678, -1.135657...","[Pittsburgh, Pennsylvania, American]"


In [50]:
# Print the first cleaned tweet split on whitespace, one word per line.
# `.iloc[0]` selects by position explicitly: the index holds `created_at`
# strings, so a bare `[0]` relies on pandas' deprecated positional fallback.
for word in str(raw.doc.iloc[0]).split():
    print(word)

Of
Wisconsin’s
72
counties
23
switched
from
voting
for
Obama
in
2012
to
Trump
in
2016


In [51]:
# Number of whitespace-separated words in the first cleaned tweet
# (`.iloc[0]` = first row by position).
len(str(raw.doc.iloc[0]).split())

16

In [52]:
len(raw.doc.iloc[0])
# three longer than the whitespace split: spaCy separates the apostrophe 's'
# as a particle and also keeps a SPACE token at each end of the text...
# see below

19

In [53]:
# Print the coarse part-of-speech tag of every token in the first doc
# (`.iloc[0]` = first row by position).
for token in raw.doc.iloc[0]:
    print(token.pos_)

SPACE
ADP
PROPN
PART
NUM
NOUN
NUM
VERB
ADP
VERB
ADP
PROPN
ADP
NUM
ADP
PROPN
ADP
NUM
SPACE


In [54]:
# Look up what the 'PART' tag printed above stands for.
spacy.explain('PART')

'particle'

In [55]:
# Print the docstring of DataFrame.to_pickle for reference.
print(pd.DataFrame.to_pickle.__doc__)


        Pickle (serialize) object to file.

        Parameters
        ----------
        path : str
            File path where the pickled object will be stored.
        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},         default 'infer'
            A string representing the compression to use in the output file. By
            default, infers from the file extension in specified path.

            .. versionadded:: 0.20.0
        protocol : int
            Int which indicates which protocol should be used by the pickler,
            default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
            values for this parameter depend on the version of Python. For
            Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a
            valid value. For Python >= 3.4, 4 is a valid value. A negative
            value for the protocol parameter is equivalent to setting its value
            to HIGHEST_PROTOCOL.

            .. [1] https://docs.py

In [56]:
# pickle the dataframe so Docs are saved for later use
# NOTE(review): reading this file back presumably needs compatible pandas and
# spaCy versions, and pickles should only be loaded from trusted sources.
# Confirm before sharing the file.
raw.to_pickle('../data/tweets/clean_tweets_24_10_2019.pkl')