In [None]:
import pandas as pd
import random

from mongo_aggregation_verbs import *

from lib import create_mongo_client_to_database_collection

# Handle that every query in this notebook goes through.
# NOTE(review): presumably the `tweets` collection of the `twitter` database —
# the helper is defined in `lib` and is not visible here; confirm.
collection_reference = create_mongo_client_to_database_collection('twitter', 'tweets')

Background reading on topic modeling of Twitter data (LDA vs. LSA):

- https://alexisperrier.com/nlp/2015/09/16/segmentation_twitter_timelines_lda_vs_lsa.html
- https://alexisperrier.com/nlp/2015/09/04/topic-modeling-of-twitter-followers.html

In [None]:
# Stage that keeps only the tweets whose `entities.urls` array is empty.
# NOTE(review): MATCH and COUNT presumably hold the "$match" / "$count" stage
# names exported by mongo_aggregation_verbs — confirm.
match_empty_url_arrays = {MATCH: {"entities.urls": []}}

# Count the URL-free tweets; the total is reported under the key "text".
list(collection_reference.aggregate([match_empty_url_arrays, {COUNT: "text"}]))

In [None]:
# Hashtags identifying the job-posting and location-tagged tweets whose ids
# are collected here.
job_hashtags = ['job', 'jobs', 'hiring', 'careerarc']
location_hashtags = ['california', 'losangeles', 'la', 'santamonica', 'glendale', 'paloalto']

# Pipeline stages built in this cell.
# Expose each tweet's hashtag strings under `text` (the `_id` is carried along).
project_to_text_keep_id = {
    PROJECT: {"text": "$entities.hashtags.text"},
}
# Despite the name, this stage KEEPS rows whose hashtag is in the unwanted list
# ($in); the resulting ids are what later cells exclude.
match_not_in_bad = {
    MATCH: {"text": {"$in": job_hashtags + location_hashtags}},
}
# Reduce every surviving row to its `_id`.
project_to_id = {
    PROJECT: {"_id": 1},
}

# NOTE(review): match_non_empty_hashtag_arrays, unwind_text and project_to_lower
# are not defined in any visible cell — presumably they come from the star
# import or from a cell outside this view; confirm they exist on a fresh kernel.
bad_ids = list(
    collection_reference.aggregate([
        match_non_empty_hashtag_arrays,
        project_to_text_keep_id,
        unwind_text,
        project_to_lower,
        match_not_in_bad,
        project_to_id,
    ])
)

# Show the first few result documents and the total number of rows.
bad_ids[:10], len(bad_ids)

In [None]:
# Flatten the aggregation result documents into a plain list of `_id` values.
# This cell rebinds `bad_ids`, so a second run would otherwise try to subscript
# the already-extracted ids and raise TypeError. The isinstance guard passes
# already-flattened entries through unchanged, making the cell safe to re-run.
bad_ids = [
    entry['_id'] if isinstance(entry, dict) else entry
    for entry in bad_ids
]
bad_ids[:10]

In [None]:
# Query condition selecting values that are NOT among the ids collected in `bad_ids`.
not_in_bad_ids = { "$nin" : bad_ids }

In [None]:
# Query filter: the tweet's `_id` passes the `not_in_bad_ids` condition and its
# `entities.urls` array is empty.
not_in_bad_ids_and_no_url = {"_id": not_in_bad_ids, "entities.urls": []}

# Projection: return only the `text` field and suppress `_id`.
just_the_text = {"text": 1, "_id": 0}

In [None]:
# Fetch a single matching tweet to sanity-check the filter and projection
# before running the full query.
collection_reference.find_one(not_in_bad_ids_and_no_url, just_the_text)

In [None]:
# Run the full query with the same filter and projection as the preview above.
cur = collection_reference.find(not_in_bad_ids_and_no_url, just_the_text)

# Drain the cursor into memory, then wrap the documents in a DataFrame.
tweets = [tweet for tweet in cur]
tweet_text = pd.DataFrame(tweets)

In [None]:
# Number of tweets retrieved (row count of the frame).
tweet_text.shape[0]

In [None]:
# Preview the first five tweets.
tweet_text.head(5)

In [None]:
# Remove URLs (anything starting with "http" or "www.") from the tweet text,
# case-insensitively, so link fragments do not end up in the vocabulary.
# - raw string: `\S` is not a valid escape in a normal string literal;
# - `www\.`: the dot is escaped so only a literal "www." prefix is matched;
# - regex=True: pandas >= 2.0 defaults to literal matching, which would leave
#   every URL in place.
tweet_text.text = tweet_text.text.str.replace(
    r'http\S+|www\.\S+', '', case=False, regex=True
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build the TF-IDF representation of the tweets, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# fit_transform learns the vocabulary and vectorizes in a single pass.
# .toarray() gives a plain ndarray; .todense() returns the legacy np.matrix
# type, which recent scikit-learn estimators refuse as input.
# NOTE(review): this materializes a dense (tweets x terms) array, which may be
# large for a big corpus.
word_occurence = tfidf.fit_transform(tweet_text.text).toarray()

In [None]:
# Dimensions of the TF-IDF matrix: one row per tweet, one column per vocabulary term.
word_occurence.shape

In [None]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it but returns an ndarray, which random.sample rejects because it is
# not a sequence type — hence the list() conversion.
words = list(tfidf.get_feature_names_out())

# Pick up to 20 random terms to peek at; the cap avoids a ValueError when the
# vocabulary has fewer than 20 entries.
word_sample = random.sample(words, min(20, len(words)))

# Label the matrix columns with their terms and show the sampled columns.
word_occurence_m = pd.DataFrame(word_occurence, columns=words)
word_occurence_m[word_sample].head()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Fit a 10-topic LDA model on the TF-IDF matrix.
# - n_components replaces the removed `n_topics` keyword;
# - random_state pins the initialization so the topics (and their numbering,
#   which the cells below rely on) are reproducible across runs.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='batch',
    random_state=42,
)
lda.fit(word_occurence)

In [None]:
# Term-by-topic weight table: one row per vocabulary term, one column per
# topic number.
lda_df = pd.DataFrame(lda.components_.T, index=words)

In [None]:
def filter_topic(lda_df, index, threshold):
    """Return the entries of column ``index`` that exceed ``threshold``, largest first.

    Parameters
    ----------
    lda_df : pandas.DataFrame
        Frame to read from. In this notebook its rows are vocabulary terms
        and its columns are topic numbers.
    index : hashable
        Label of the column (topic) to inspect.
    threshold : number
        Strict lower bound; values equal to it are dropped.

    Returns
    -------
    pandas.Series
        The surviving values of that column, sorted in descending order and
        still labelled by their original row index.
    """
    topic_weights = lda_df[index]
    above_threshold = topic_weights > threshold
    return topic_weights[above_threshold].sort_values(ascending=False)

In [None]:
# Topic 0: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=0, threshold=2)

In [None]:
# Topic 1: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=1, threshold=2)

In [None]:
# Topic 2: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=2, threshold=2)

In [None]:
# Topic 3: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=3, threshold=2)

In [None]:
# Topic 4: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=4, threshold=2)

In [None]:
# Topic 5: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=5, threshold=2)

In [None]:
# Topic 6: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=6, threshold=2)

In [None]:
# Topic 7: terms weighted above 2, heaviest first.
filter_topic(lda_df, index=7, threshold=2)