In [177]:
# Import the libraries that we're going to use
from nltk.corpus import twitter_samples
from nltk import casual_tokenize, word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import string
import re
from nltk.tag import StanfordNERTagger
from tqdm import tqdm_notebook, tnrange
from gensim.models.ldamodel import LdaModel
from gensim import corpora

In [49]:
# Convert every sentence to lowercase
#
# @param strings
#     An array of sentences (plain strings, not word tokenized)
# @returns a new array with the lowercased form of each sentence, in input order

def lowercase(strings):
    lowered = []
    for sentence in strings:
        lowered.append(sentence.lower())
    return lowered

In [50]:
# Split Tweet-style sentences into word tokens with NLTK's casual (Tweet) tokenizer
# (probably not suitable for non-Tweet text)
#
# @param strings
#     An array of sentences
# @returns an array with one token array per input sentence (an array of arrays)

def tokenize_tweets(strings):
    token_lists = []
    for tweet in strings:
        token_lists.append(casual_tokenize(tweet))
    return token_lists

In [59]:
# Split regular (non-Tweet) sentences into word tokens with NLTK's word_tokenize
#
# @param strings
#     An array of sentences
# @returns an array with one token array per input sentence (an array of arrays)

def tokenize_regular(strings):
    token_lists = []
    for sentence in strings:
        token_lists.append(word_tokenize(sentence))
    return token_lists

In [61]:
# Break a block of regular (non-Tweet) text into sentences with NLTK's sent_tokenize
#
# @param text
#     A single string of text (not yet broken into sentences)
# @returns an array of sentences

def tokenize_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

In [89]:
# Strip punctuation characters (those listed in string.punctuation) from sentences
#
# @param strings
#     An array of sentences (plain strings)
# @returns an array of sentences with no punctuation; sentences that are empty or
#     whitespace-only after stripping are dropped, so the result may be shorter
#     than the input

def remove_punctuation(strings):
    # One deletion table for the whole call; translate() removes the characters per sentence
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = (sentence.translate(drop_punctuation) for sentence in strings)
    return [sentence for sentence in cleaned if sentence.strip()]

In [126]:
# Either replace every digit (0-9) in each word with a pound sign or delete the digits
#
# @param tokenized
#     An array of tokenized sentences
# @param replace
#     Whether to replace the digits with # (default = True) or delete them
# @returns an array of tokenized sentences with digits replaced or removed; when
#     digits are deleted, words that become empty are dropped; sentences with no
#     words left are dropped in both modes

def remove_digits(tokenized,replace=True):
    substitute = '#' if replace else ''
    cleaned = []
    for sentence in tokenized:
        words = [re.sub('[0123456789]', substitute, word) for word in sentence]
        if not replace:
            # deleting digits can leave empty words behind (e.g. '1234'); discard them
            words = [word for word in words if word]
        if words:
            cleaned.append(words)
    return cleaned

In [132]:
# Reduce every word to its stem with NLTK's Porter stemmer
#
# @param tokenized
#     An array of tokenized sentences
# @returns an array of tokenized sentences, same shape as the input, with each word stemmed

def stem_words(tokenized):
    # a single stemmer instance is shared by all sentences
    stem = PorterStemmer().stem
    return [list(map(stem, sentence)) for sentence in tokenized]

In [68]:
# Replace words that occur fewer than a certain number of times with 'UNK'
#
# @param tokenized
#     An array of tokenized sentences
# @param threshold
#     The minimum number of occurrences (counted across all sentences) a word
#     needs in order to be kept (default = 5)
# @returns an array of tokenized sentences, same shape as the input, with rare
#     words replaced with 'UNK'

def remove_rare_words(tokenized,threshold=5):
    # tally how often each word occurs across the whole set of sentences
    frequencies = Counter()
    for sentence in tokenized:
        frequencies.update(sentence)

    # rebuild each sentence, keeping frequent words and masking the rest as 'UNK'
    masked = []
    for sentence in tokenized:
        masked.append([word if frequencies[word] >= threshold else 'UNK' for word in sentence])
    return masked

In [52]:
# Replace links (tokens that begin with 'http://' or 'https://') with 'LINK'
#
# @param tokenized
#     An array of tokenized sentences
# @returns an array of tokenized sentences, same shape as the input, with links
#     replaced with 'LINK'

def remove_links(tokenized):
    link_prefixes = ('http://', 'https://')
    return [
        ['LINK' if word.startswith(link_prefixes) else word for word in sentence]
        for sentence in tokenized
    ]

In [154]:
# Tag named entities in text with the Stanford NER tagger
# Make sure that you've downloaded the Stanford NER files
# (https://nlp.stanford.edu/software/)
# With the default 3-class model this only extracts persons, organizations, and locations
#
# @param tokenized
#     An array of tokenized sentences
# @param model_path
#     Path to the Stanford NER classifier file (defaults to the original
#     author's local install; override it for your own machine)
# @param jar_path
#     Path to stanford-ner.jar (defaults to the original author's local install;
#     override it for your own machine)
# @returns an array of sentences where each sentence is a list of tuples of (word, entity label)

def extract_ner(tokenized,
                model_path='/Users/laura/software/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                jar_path='/Users/laura/software/stanford-ner/stanford-ner.jar'):
    st = StanfordNERTagger(model_path, jar_path)
    return st.tag_sents(tokenized) #Batch processing is important - speeds it up tremendously!

In [192]:
# Calculate topics from text using LDA
#
# @param tokenized
#     An array of tokenized sentences (each sentence is used as one document)
# @param num_topics
#     The number of topics to calculate (default = 10)
# @returns a list of topics and their weighted words, one entry per topic

def topic_modeling(tokenized,num_topics = 10):
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(sentence) for sentence in tokenized]
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    # print_topics() only reports up to 20 topics by default, which silently
    # truncates the result for larger models; ask for every topic explicitly
    return lda.print_topics(num_topics=num_topics)

In [41]:
# Ok, let's get some sample tweets! (you can replace this with your own data)
# The three NLTK twitter_samples files are concatenated into one list of tweet strings
strings = (
    twitter_samples.strings('tweets.20150430-223406.json')
    + twitter_samples.strings('positive_tweets.json')
    + twitter_samples.strings('negative_tweets.json')
)

In [55]:
# Prepare data: lowercase the tweets, tokenize them with the Tweet tokenizer,
# replace links with 'LINK', then replace rare words (fewer than the default
# 5 occurrences) with 'UNK'
lowercased = lowercase(strings)
tokenized = tokenize_tweets(lowercased)
tokenized = remove_links(tokenized)
tokenized = remove_rare_words(tokenized)

In [155]:
# Named Entity Recognition: tag every prepared tweet in one batch
# (each resulting sentence is a list of (word, entity label) tuples)
ner = extract_ner(tokenized)

In [179]:
# Run LDA topic modeling on the prepared tweets with the default of 10 topics
topics = topic_modeling(tokenized)

In [134]:
# Sample regular (non-Tweet) text used to try out the sentence, punctuation,
# digit and stemming helpers in the following cells
text = 'This is a 1234 bunch of sentences. !!! ??? alks;df Hello there233, world!'

In [135]:
# Split the sample text into sentences, then strip punctuation (sentences that
# end up empty are dropped). This rebinds `strings`, replacing the tweets loaded earlier.
strings = tokenize_sentences(text)
strings = remove_punctuation(strings)
# remove_digits is not called here: it expects tokenized sentences, so it is
# applied after word tokenization instead
#strings = remove_digits(strings)

In [136]:
# Word-tokenize each sentence, delete digits (dropping words that become empty),
# then stem every remaining word
strings = tokenize_regular(strings)
strings = remove_digits(strings,replace=False)
strings = stem_words(strings)

In [171]:
# Let's look at some entities: every run of consecutive tokens carrying the
# chosen label is joined into one multi-word entity and its occurrences counted
#
# The 3-class Stanford model labels tokens PERSON, ORGANIZATION or LOCATION
# (everything else is 'O'), so the label to collect must be one of those
entity = 'LOCATION'
entities = set()
counter = Counter()
for sentence in ner:
    entityStarted = False
    fullEntity = ''
    for (word,entityLabel) in sentence:
        if entityLabel == entity:
            if entityStarted:
                # still inside the same entity: extend it with the next word
                fullEntity += ' ' + word
            else:
                fullEntity = word
                entityStarted = True
        elif entityStarted:
            # a differently-labelled token ends the current entity
            entities.add(fullEntity)
            counter[fullEntity] += 1
            fullEntity = ''
            entityStarted = False
    # An entity that runs to the end of the sentence has no following token to
    # close it, so record it here before moving on to the next sentence
    if entityStarted:
        entities.add(fullEntity)
        counter[fullEntity] += 1

In [169]:
# Print each distinct entity together with how many times it was seen
# (note: the loop variable rebinds the module-level name `entity`)
for entity in entities:
    print(entity,counter[entity])

In [193]:
# Re-run topic modeling with only 2 topics. Despite its name, `lda` receives
# whatever topic_modeling returns (the printed topic list), not the model object.
lda = topic_modeling(tokenized,num_topics=2)

In [191]:
# Display the value bound to `lda`
# NOTE(review): the output recorded below is an LdaModel object, while
# topic_modeling as defined in this file returns lda.print_topics(); this cell
# (In [191]) presumably ran against an earlier definition -- re-run to confirm
lda

<gensim.models.ldamodel.LdaModel at 0x121a1ce10>