In [3]:
import os
import sys
import networkx as nx
import community
import json
import re
import nltk
from topik import read_input, tokenize, vectorize, run_model, visualize
from topik.visualizers.termite_plot import termite
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

output_notebook()

In [4]:
#------------------
# Helper functions
#-----------------

def tweets_n_edges(tweet_file):
    """Load tweets from a line-delimited JSON file and derive mention edges.

    Each non-blank line is expected to begin with one JSON-encoded tweet
    (surrounding whitespace is ignored). For every user mention whose screen
    name differs from the author's, an ``(author, mentioned)`` pair is
    recorded.

    A line is skipped as a whole -- the tweet is not kept and none of its
    edges are recorded -- when it is not valid JSON or when the fields read
    here (``entities.user_mentions``, ``user.screen_name``, the mention's
    ``screen_name``) are missing or of an unexpected type.

    Parameters
    ----------
    tweet_file : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(tweets, edges)``: the list of decoded tweet dicts and the list of
        ``(author_screen_name, mentioned_screen_name)`` tuples.
    """
    decoder = json.JSONDecoder()
    tweets = []
    edges = []

    # The context manager guarantees the handle is closed, even on error.
    with open(tweet_file, "r") as handle:
        for line in handle:
            record = line.strip()
            if not record:
                continue
            try:
                # raw_decode tolerates trailing data after the JSON document.
                tweet = decoder.raw_decode(record)[0]
                # Collect this tweet's edges separately so that a failure
                # part-way through leaves no partial edges behind.
                tweet_edges = []
                for mention in tweet['entities']['user_mentions']:
                    author = tweet['user']['screen_name']
                    if author != mention['screen_name']:
                        tweet_edges.append((author, mention['screen_name']))
            except (ValueError, KeyError, TypeError):
                # Malformed JSON, missing fields, or unexpected structure.
                continue
            edges.extend(tweet_edges)
            tweets.append(tweet)

    return (tweets, edges)


In [5]:
def get_communities(tweets, edges):
    """Group tweet text by author within each detected community.

    An undirected graph is built from ``edges`` and partitioned with
    ``community.best_partition``. Each tweet whose author appears in the
    partition and whose ``lang`` is ``'en'`` or ``'und'`` is then added to
    its author's entry inside the author's community.

    Parameters
    ----------
    tweets : iterable of dict
        Tweets providing ``user.screen_name`` and ``text``; ``lang`` is read
        when present (a tweet without it is skipped).
    edges : iterable of tuple
        ``(screen_name, screen_name)`` pairs used to build the graph.

    Returns
    -------
    dict
        ``{community_id: {screen_name: {'raw_text': ..., 'n_tweets': int}}}``
        where ``raw_text`` is the space-joined text of the author's tweets.
        Screen names and text are passed through
        ``.encode("ascii", "ignore")``, which drops non-ASCII characters.
    """
    graph = nx.Graph()
    graph.add_edges_from(edges)
    parts = community.best_partition(graph)

    communities = {}

    for tweet in tweets:
        screen_name = tweet['user']['screen_name'].encode("ascii", "ignore")
        raw_text = tweet['text'].encode("ascii", "ignore")

        # Membership is tested on the dict itself: on Python 2 `.keys()`
        # builds a list, which made every lookup a linear scan.
        if screen_name not in parts:
            continue
        # Keep English / undetermined-language tweets only; a tweet with no
        # 'lang' field is skipped instead of raising KeyError.
        if tweet.get('lang') not in ('en', 'und'):
            continue

        members = communities.setdefault(parts[screen_name], {})
        entry = members.get(screen_name)
        if entry is None:
            members[screen_name] = {
                'raw_text': raw_text,
                'n_tweets': 1,
            }
        else:
            entry['n_tweets'] += 1
            entry['raw_text'] = ' '.join([entry['raw_text'], raw_text])

    return communities

In [6]:
def make_dir_struc(communities):
    """Write one cleaned text file per sufficiently active user, by community.

    Creates ``communities/<community id>/`` (relative to the current working
    directory) for every community. Inside it, a file named after the user is
    written for each user who has more than two tweets and whose cleaned text
    is longer than 100 words. Existing directories are reused, so the function
    can be run again; files from an earlier run are overwritten.

    Parameters
    ----------
    communities : dict
        ``{community_id: {screen_name: {'raw_text': ..., 'n_tweets': int}}}``.

    Returns
    -------
    None
    """
    root = "communities"
    _ensure_dir(root)

    for comm_id, members in communities.items():
        comm_dir = os.path.join(root, str(comm_id))
        _ensure_dir(comm_dir)

        for screen_name, info in members.items():
            if info['n_tweets'] <= 2:
                continue

            cleaned = _clean_tweet_text(info['raw_text'])
            # Drop anything non-ASCII. The cleaned text is already joined with
            # single spaces, so no newline replacement is needed.
            payload = cleaned.encode("ascii", "ignore")
            if len(payload.split()) > 100:
                # Binary mode: `payload` is an encoded byte string.
                with open(os.path.join(comm_dir, screen_name), "wb") as out:
                    out.write(payload)


def _ensure_dir(path):
    """Create ``path`` (and missing parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _clean_tweet_text(raw_text):
    """Strip links, hashtags and user mentions; collapse whitespace runs."""
    # Remove URLs, including ones embedded inside a larger token.
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', raw_text)
    # Then drop any remaining token that starts with "http", and every token
    # containing a hashtag or mention marker.
    tokens = [
        token for token in text.split()
        if token[:4] != "http" and "#" not in token and "@" not in token
    ]
    return ' '.join(tokens)

In [35]:
def topic_model(directory, stopwords, ntopics):
    """Run the topik pipeline over the documents found in ``directory``.

    The documents returned by ``read_input`` are reduced to
    ``(hash(text), text)`` pairs taken from each item's ``"text"`` field,
    then passed through ``tokenize`` (with ``stopwords``), ``vectorize`` and
    ``run_model`` (with ``ntopics``).

    Returns whatever ``run_model`` returns.
    """
    field = "text"
    documents = read_input(directory)
    # Lazily pair each document's text with a hash of it as an identifier.
    id_text_pairs = ((hash(doc[field]), doc[field]) for doc in documents)
    tokens = tokenize(id_text_pairs, stopwords=stopwords)
    vectors = vectorize(tokens)
    return run_model(vectors, ntopics=ntopics)

In [None]:
#------------------
# Do work: load the tweets, detect communities,
# and write the per-user text files to disk
#-----------------

# Parse the tweet dump into tweet dicts and (author, mentioned user) edges.
tweets,edges = tweets_n_edges("data_science_twitter1.txt")
# Partition the mention graph and group tweet text by community and author.
communities = get_communities(tweets,edges)
# Write the grouped text under ./communities/<community id>/<screen name>.
make_dir_struc(communities)

In [54]:
# Sanity check: number of tweets loaded and number of mention edges found.
# The parenthesised form prints the same thing on Python 2 and Python 3.
print(len(tweets))
print(len(edges))

159600
162070


In [53]:
# Build the directed mention graph (author -> mentioned user) and compute
# the eigenvector centrality of every account in it.
G = nx.DiGraph()
G.add_edges_from(edges)
ev_cent = nx.eigenvector_centrality(G, max_iter=10000)

# (screen_name, centrality) pairs.
ev_tuple = list(ev_cent.items())

# Sort ascending, then reverse, so the highest-centrality accounts come first
# (at most 20 are kept).
top_accounts = sorted(ev_tuple, key=lambda pair: pair[1])[::-1][:20]

# Number the ranks from 1. Unlike zipping against a fixed 20..1 range, the
# ranks stay correct when the graph has fewer than 20 nodes, and the result
# is a list on both Python 2 and Python 3.
list(enumerate(top_accounts, 1))

[(1, (u'GilPress', 0.38942565243403915)),
 (2, (u'KirkDBorne', 0.30906334335611996)),
 (3, (u'Forbes', 0.23035596746895132)),
 (4, (u'BernardMarr', 0.21142119479688257)),
 (5, (u'bobehayes', 0.2072355059058224)),
 (6, (u'kdnuggets', 0.15597621686762647)),
 (7, (u'Ronald_vanLoon', 0.15518713444196847)),
 (8, (u'LinkedIn', 0.12561861905035457)),
 (9, (u'DataScienceCtrl', 0.11756733241544594)),
 (10, (u'BoozAllen', 0.11138358070618962)),
 (11, (u'EvanSinar', 0.10228642259454886)),
 (12, (u'OReillyMedia', 0.10067698971253553)),
 (13, (u'mapr', 0.09343251516949147)),
 (14, (u'VoltDB', 0.0932179752390341)),
 (15, (u'kaggle', 0.0790831959961186)),
 (16, (u'forrester', 0.07873766813000185)),
 (17, (u'Datafloq', 0.0723137887537289)),
 (18, (u'bigdata', 0.0699210736037395)),
 (19, (u'InformationWeek', 0.06776873576815852)),
 (20, (u'LaurenNealPhD', 0.06770816501023502))]

<img src="week_3_14_16.png">

In [56]:
#-------------------------
#
#  The top 3 communities
#  by number of documents
#  are 17, 22, and 38.
#
#  The remaining communities have
#  less clearly defined general
#  topics, which I suspect is
#  due to having fewer documents.
#
#-------------------------

# 22 # machine learning, deep learning, AI - somewhat clear
# 17 # business, oracle, rdbms, sql, databases, mongodb - somewhat clear
# 38 # iot, mobile, security - somewhat clear


# Corpus-specific noise words: Twitter filler, time-zone abbreviations, and
# generic terms that would otherwise dominate every topic.
custom_stopwords = [
    'amp','get','got','hey','hmm','hoo','hop','iep','let','ooo','par',
    'pdt','pln','pst','wha','yep','yer','aest','didn','nzdt','via',
    'one','com','new','like','great','make','top','awesome','best',
    'good','wow','yes','say','yay','would','thanks','thank','going','ht',
    'new','use','should','could','best','really','see','want','nice', 'rt',
    'while','know','big','data','bigdatablogs',
]

# Combine them with NLTK's English stopword list into a single set.
stopwords = set(nltk.corpus.stopwords.words("english")).union(custom_stopwords)
ntopics = 50

# Start with community 22: fit the topic model and show its termite plot.
directory = "./communities/22/"
model = topic_model(directory, stopwords, ntopics)
show(termite(model))

<img src="community22.png">

In [50]:
# Community 17: refit the topic model with the same stopwords and ntopics,
# then show the termite plot for the new model.
directory = "./communities/17/" # go on to 17
model = topic_model(directory, stopwords, ntopics)
show(termite(model))

<img src="community17.png">

In [57]:
# Community 38: refit the topic model with the same stopwords and ntopics,
# then show the termite plot for the new model.
directory = "./communities/38/" # lastly, look at 38
model = topic_model(directory, stopwords, ntopics)
show(termite(model))

<img src="community38.png">