In [12]:
import pandas as pd
import nltk
from nltk import TweetTokenizer
import string
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

In [13]:
#name of the event; it is part of every input/output file name
event_name = '[TEDxNations]'
#load the raw tweets: tab-separated, utf-8 encoded, column names in the first row
event_data = pd.read_csv(
    'data/%s_data.txt' % event_name,
    sep='\t',
    header=0,
    encoding='utf-8',
)

In [14]:
#remove links
def strip_links(text):
    """Return `text` without links, re-joined with single spaces.

    Every whitespace-separated token is cut at the first occurrence of
    'http', so characters glued in front of a link are kept. Tokens that
    consisted only of a link become empty and are dropped.
    """
    kept = []
    for w in text.split():
        if 'http' in w:
            w = w[:w.find('http')]
        #an emptied token would otherwise leave a double or trailing space,
        #which makes otherwise identical tweets compare as different when
        #duplicates are detected on this column
        if w:
            kept.append(w)
    return ' '.join(kept)

event_data['text_nolink'] = event_data['text'].apply(strip_links)

In [15]:
#inspect every row whose link-free text occurs more than once (uncomment next line)
#event_data[event_data.duplicated(['text_nolink'], keep=False)].sort_values('text')
#keep only the first occurrence of each link-free text
event_data = event_data.drop_duplicates(subset=['text_nolink'])

In [16]:
#remove hashtags
#default: delete only the '#' symbol and keep the tag word itself
def drop_hash_symbol(text):
    """Return `text` with every '#' character deleted."""
    return text.replace('#', '')

event_data['text_clean'] = event_data['text_nolink'].map(drop_hash_symbol)
#alternative: delete the entire hashtag token (use INSTEAD of the line above)
#event_data['text_clean'] = event_data['text_nolink'].apply(lambda text: ' '.join([w for w in text.split() if not w.startswith('#')]))

In [17]:
#remove mentions
#default: delete only the '@' symbol and keep the user name itself
def drop_at_symbol(text):
    """Return `text` with every '@' character deleted."""
    return text.replace('@', '')

event_data['text_clean'] = event_data['text_clean'].map(drop_at_symbol)
#alternative: delete the entire mention token (use INSTEAD of the line above)
#event_data['text_clean'] = event_data['text_clean'].apply(lambda text: ' '.join([w for w in text.split() if not w.startswith('@')]))

In [18]:
#decode the HTML entity '&amp;' into a literal '&' padded with spaces
def decode_ampersand(text):
    """Return `text` with every '&amp;' replaced by ' & '."""
    return text.replace('&amp;', ' & ')

event_data['text_clean'] = event_data['text_clean'].map(decode_ampersand)

In [19]:
#tokenize, remove stopwords
#method preserves mentions and hashtags
tknzr = TweetTokenizer()
#alternatives
#nltk.word_tokenize()
#nltk.tokenize.wordpunct_tokenize()
#tknzr_regrex.tokenize()
#create stop word list
stop = nltk.corpus.stopwords.words('english')
punct = list(string.punctuation)
punct.extend(['...', '..', '…', '”', '“', 'the', '.@', 'RT'])
stop.extend(punct)
#tokens are lower-cased before they are looked up, so the lookup table has to
#be lower-cased as well; otherwise the upper-case entry 'RT' never matches and
#retweet markers slip through as 'rt'. A set also makes each lookup O(1).
stop_lookup = set(w.lower() for w in stop)

def clean_tokens(text):
    """Tokenize `text` and return the kept tokens, lower-cased and space-joined.

    A token is kept when it is longer than one character and its lower-cased
    form is not in `stop_lookup`. `text` is passed through str() first.
    """
    kept = []
    for token in tknzr.tokenize(str(text)):
        lowered = token.lower()
        #the length test is done on the token as produced by the tokenizer
        if len(token) > 1 and lowered not in stop_lookup:
            kept.append(lowered)
    return ' '.join(kept)

#apply tokenization, stopword removal
event_data['text_clean_tokens'] = event_data['text_clean'].apply(clean_tokens)

In [20]:
#stemming
#English Snowball stemmer, created with ignore_stopwords=True
stemmer = SnowballStemmer('english', ignore_stopwords=True)
#alternatives
#stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Stem each whitespace-separated token of `tokens` and re-join with spaces."""
    stems = []
    for s in tokens.split():
        stems.append(stemmer.stem(str(s)))
    return ' '.join(stems)

#apply stemming
event_data['text_clean_stems'] = event_data['text_clean_tokens'].map(stem_tokens)

In [21]:
#TEST EXAMPLE Cleaning
#print the first tweet as it looks after every stage of the cleaning
tweet = event_data.iloc[0]
stages = [
    ('raw: "%s"', 'text'),
    ('no_link: "%s"', 'text_nolink'),
    ('clean: "%s"', 'text_clean'),
    ('tokens: "%s"', 'text_clean_tokens'),
    ('stems: "%s"', 'text_clean_stems'),
]
for template, column in stages:
    print(template % tweet[column])

raw: "How does @ICRC work globally to address #sexualviolence in conflict? Learn more here: https://t.co/So5vbZAxPw  #TEDxNations via @PMeigeICRC"
no_link: "How does @ICRC work globally to address #sexualviolence in conflict? Learn more here:  #TEDxNations via @PMeigeICRC"
clean: "How does ICRC work globally to address sexualviolence in conflict? Learn more here:  TEDxNations via PMeigeICRC"
tokens: "icrc work globally address sexualviolence conflict learn tedxnations via pmeigeicrc"
stems: "icrc work global address sexualviol conflict learn tedxnat via pmeigeicrc"


In [22]:
#columns to keep: identifiers plus the text at every cleaning stage
clean_columns = [
    'id',
    'created_at',
    'text',
    'text_nolink',
    'text_clean',
    'text_clean_tokens',
    'text_clean_stems',
]
event_data_clean = event_data[clean_columns]
#save clean data: tab-separated, utf-8, with header row and without the index
clean_path = 'data/%s_data_clean.txt' % event_name
event_data_clean.to_csv(clean_path, sep='\t', header=True, index=False, encoding='utf-8')