# Setup

In [19]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import glob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer



# Data

In [21]:
path =r'./cleaned_data/' # use your path
allFiles = glob.glob(path + "/*.csv")
df = pd.DataFrame()

list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None)
    list_.append(df)

df = pd.concat(list_)
df.reset_index(inplace=True, drop=True)

In [22]:
sid = SentimentIntensityAnalyzer()

In [23]:
sid.polarity_scores("awsaome football game")

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [24]:
df.head()

Unnamed: 0,timestamp,category_id,topic_id,topic_title,message_number,message_author,message_text,quoteheader
0,1514379921,57,2653937,16.5K will be the major resistance for BTC ?,1,AICoin_Official,"As you can see, price started a correction imm...",
1,1514380918,57,2653937,16.5K will be the major resistance for BTC ?,2,nokati,And&nbsp; who is this Kuang Ren with such a wi...,[]
2,1514381025,57,2653937,16.5K will be the major resistance for BTC ?,3,fabiorem,"Yes, theres a lot of resistance at this level,...",[]
3,1514382464,57,2653937,16.5K will be the major resistance for BTC ?,4,heringasem,With the current price fluctuations the price ...,[]
4,1514384271,57,2653937,16.5K will be the major resistance for BTC ?,5,CuDoCuDau,You should buy bitcoin,"['<div class=""quoteheader""><a href=""https://bi..."


In [29]:
df['pos'] = 0.
df['neg'] = 0.
df['neu'] = 0.
df['compound'] = 0.

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41328 entries, 0 to 41327
Data columns (total 12 columns):
timestamp         41328 non-null int64
category_id       41328 non-null int64
topic_id          41328 non-null int64
topic_title       41328 non-null object
message_number    41328 non-null int64
message_author    41327 non-null object
message_text      41111 non-null object
quoteheader       41324 non-null object
pos               41328 non-null float64
neg               41328 non-null float64
neu               41328 non-null float64
compound          41328 non-null float64
dtypes: float64(4), int64(4), object(4)
memory usage: 3.8+ MB


In [35]:
## LABELING POSTS AS NEGATIVE, POSITIVE OR NEUTRAL

for i, row in df.iterrows():
    pol_scores = sid.polarity_scores(str(row.message_text))
    df.set_value(i,'pos', float(pol_scores['pos']))
    df.set_value(i,'neg', float(pol_scores['neg']))
    df.set_value(i,'neu', float(pol_scores['neu']))
    df.set_value(i,'compound', float(pol_scores['compound']))

  """
  
  import sys
  


In [36]:
df.to_csv('pos_neg.csv')

In [41]:
df.tail()

Unnamed: 0,timestamp,category_id,topic_id,topic_title,message_number,message_author,message_text,quoteheader,pos,neg,neu,compound
41323,1411050454,57,787222,.,2,Hfertig,And back down to 8365 without a short squ...,"['<div class=""quoteheader""><a href=""https://bi...",,,,
41324,1411050918,57,787222,.,3,klee,So what is their next move? Up?,[],,,,
41325,1411051112,57,787222,.,4,Hfertig,Difficult to say. Selling pressure is sti...,"['<div class=""quoteheader""><a href=""https://bi...",,,,
41326,1411075852,57,787222,.,5,molecular,doesn't the same go for the $20 millions ...,"['<div class=""quoteheader""><a href=""https://bi...",,,,
41327,1411075887,57,787222,.,6,grappa_barricata,"If i may add something, a sizable part of the ...",[],,,,


In [145]:
df.tail()

Unnamed: 0,timestamp,category_id,topic_id,topic_title,message_number,message_author,message_text,quoteheader
41323,1411050454,57,787222,.,2,Hfertig,And back down to 8365 without a short squ...,"['<div class=""quoteheader""><a href=""https://bi..."
41324,1411050918,57,787222,.,3,klee,So what is their next move? Up?,[]
41325,1411051112,57,787222,.,4,Hfertig,Difficult to say. Selling pressure is sti...,"['<div class=""quoteheader""><a href=""https://bi..."
41326,1411075852,57,787222,.,5,molecular,doesn't the same go for the $20 millions ...,"['<div class=""quoteheader""><a href=""https://bi..."
41327,1411075887,57,787222,.,6,grappa_barricata,"If i may add something, a sizable part of the ...",[]


---

# Message Concatenating:

In [42]:
grouped_topics = df.groupby(df.topic_title).groups

In [147]:
dict_topic = {}
for tit in tqdm(grouped_topics.keys()):
    sentence = (df.message_text[grouped_topics[tit]].values)
    dict_topic.update({tit: ''.join(str(v) for v in sentence) })

100%|██████████| 1100/1100 [00:00<00:00, 1827.90it/s]


In [148]:
list_message = []
for i in dict_topic.keys():
    list_message.append(dict_topic[i])

-------------

# LDA

In [149]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  #match any word characters until it reaches a non-word character, like a space.

In [150]:
raw = list_message[0].lower()
tokens = tokenizer.tokenize(raw)  

# Stop words

In [151]:
from stop_words import get_stop_words

# create English stop words list
en_stop = get_stop_words('en')

In [152]:
# remove stop words from tokens
stopped_tokens = [i for i in tokens if not i in en_stop]
print(stopped_tokens)

['even', 'saw', 'much', 'impact', 'upcoming', 'fork', 'predicted', '6', '7k', 'easily', 'end', 'year', 'nbsp', 'much', 'news', 'btc', 'lot', 'adoption', 'now', 'fork', 'getting', 'closer', 'everyone', 'dumping', 'alt', 'coins', 'buy', 'btc', 'get', 'free', 'nbsp', 'btg', 'coins', 'nbsp', 'created', 'selling', 'alt', 'coin', 'buying', 'frenzy', 'btc', 'dumped', 'decent', 'amount', 'alt', 'coins', 'last', 'week', 'dropped', '25', '60', '7', 'days', 'across', 'board', 'nbsp', 'sure', 'makes', 'great', 'buying', 'opportunity', 'alt', 'coins', 'nbsp', 'get', 'alts', 'cheaper', 'added', 'value', 'btc', 'buy', 'nbsp', 'btc', '1470', 'since', 'last', 'week', 'nbsp', 'o', 'nbsp', 'nbsp', 'smashed', '5k', 'barrier', 'like', 'didn', 't', 'exist', 'dont', 'see', '6k', 'barrier', 'either', 'kind', 'frenzy', '7k', 'likely', 'wouldn', 't', 'surprised', 'see', 'touch', '8k', 'btc', 'gold', 'thar', 'hills', 'nbsp', 'nbsp', 'd', 'nbsp', '1470', '1', 'week', 'huge', 'massive', 'next', 'step', '7000']


# Our words

In [153]:
remove_words = ['will','s' , 'nbsp', 't', 'com', 'http', 'amp', '1xnk8bc', 'href', 'oto', 'www' ]
stopped_tokens = [i for i in stopped_tokens if not i in remove_words]
print(stopped_tokens)

['even', 'saw', 'much', 'impact', 'upcoming', 'fork', 'predicted', '6', '7k', 'easily', 'end', 'year', 'much', 'news', 'btc', 'lot', 'adoption', 'now', 'fork', 'getting', 'closer', 'everyone', 'dumping', 'alt', 'coins', 'buy', 'btc', 'get', 'free', 'btg', 'coins', 'created', 'selling', 'alt', 'coin', 'buying', 'frenzy', 'btc', 'dumped', 'decent', 'amount', 'alt', 'coins', 'last', 'week', 'dropped', '25', '60', '7', 'days', 'across', 'board', 'sure', 'makes', 'great', 'buying', 'opportunity', 'alt', 'coins', 'get', 'alts', 'cheaper', 'added', 'value', 'btc', 'buy', 'btc', '1470', 'since', 'last', 'week', 'o', 'smashed', '5k', 'barrier', 'like', 'didn', 'exist', 'dont', 'see', '6k', 'barrier', 'either', 'kind', 'frenzy', '7k', 'likely', 'wouldn', 'surprised', 'see', 'touch', '8k', 'btc', 'gold', 'thar', 'hills', 'd', '1470', '1', 'week', 'huge', 'massive', 'next', 'step', '7000']


# Stemming

In [154]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# Create lemmatizer
wnl = WordNetLemmatizer()

In [155]:
# stem token
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

print(stemmed_tokens)

['even', 'saw', 'much', 'impact', 'upcom', 'fork', 'predict', '6', '7k', 'easili', 'end', 'year', 'much', 'news', 'btc', 'lot', 'adopt', 'now', 'fork', 'get', 'closer', 'everyon', 'dump', 'alt', 'coin', 'buy', 'btc', 'get', 'free', 'btg', 'coin', 'creat', 'sell', 'alt', 'coin', 'buy', 'frenzi', 'btc', 'dump', 'decent', 'amount', 'alt', 'coin', 'last', 'week', 'drop', '25', '60', '7', 'day', 'across', 'board', 'sure', 'make', 'great', 'buy', 'opportun', 'alt', 'coin', 'get', 'alt', 'cheaper', 'ad', 'valu', 'btc', 'buy', 'btc', '1470', 'sinc', 'last', 'week', 'o', 'smash', '5k', 'barrier', 'like', 'didn', 'exist', 'dont', 'see', '6k', 'barrier', 'either', 'kind', 'frenzi', '7k', 'like', 'wouldn', 'surpris', 'see', 'touch', '8k', 'btc', 'gold', 'thar', 'hill', 'd', '1470', '1', 'week', 'huge', 'massiv', 'next', 'step', '7000']


# tokenizing, removing stop words, stemmming

In [156]:
from collections import Counter

In [11]:
texts = []

remove_words = ['will','s' ,'bitcoin', 'just', 'get', 'use', 'now', 'solidx' 'people','bitfinex', 'think', 'maybe', 'imageshack', 'pt', 'em', 'img', 'nbsp', 't', 'com', 'http', 'amp', '1xnk8bc', 'href', 'oto', 'www', 'isn', 'etc', 'etf', 'tr', 'td', 'img', 'ath', 'xt', 'xp', 'php', 'img', 'gt', 'pboc', 'th', 'mtgox', 'cny', 'huobi',
 'm', 'import', 'st', 'lt', 'zhou' 'ok', 'color', 'can', "adam", "bitcoin", 'import', 'http', 'li', 'b', 'style' , 'font', 're', 'le', 'gif','span','hr', 'd' , 'jpg', 'png',  'am5om', 'fud', 'mt', 'th' 'hfebupaeo', 'ftdata', 'zbb', 'imgur', 'bite', 'uztgwi', 'podomatic']

map_words = {
    'btc': 'bitcoin',
    'bcc': 'bitcoin',
    'gbtc': 'bitcoin',
    'bitcoinca': 'bitcoin',
    'better': 'good',
    'increase': 'rise',
    'miner': 'mine',
    'winner': 'win'
}

for msg in tqdm(range(len(list_message))):
    raw = list_message[msg].lower()
    tokens = tokenizer.tokenize(raw)
    
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stopped_tokens = [i for i in stopped_tokens if not i in remove_words]

    #   lemmatize tokents:  
    lemmatized_tokens = [wnl.lemmatize(i, 'a') for i in stopped_tokens]
    lemmatized_tokens = [wnl.lemmatize(i, 'n') for i in lemmatized_tokens]
    lemmatized_tokens = [wnl.lemmatize(i, 'v') for i in lemmatized_tokens]
    lemmatized_tokens = [wnl.lemmatize(i, 'r') for i in lemmatized_tokens]
    lemmatized_tokens = [wnl.lemmatize(i, 's') for i in lemmatized_tokens]
    
    #     stemmed_tokens = [p_stemmer.stem(i) for i in lemmatized_tokens]
    
    #   remove words:
    stemmed_tokens = [i for i in lemmatized_tokens if not i in remove_words]
    stemmed_tokens = [i for i in stemmed_tokens if not i in en_stop]
    d = pd.DataFrame({'z': stemmed_tokens})
    stemmed_tokens = d.replace(map_words)['z'].tolist()
    
    #   remove numeric values:
    alpha_only = [''.join(filter(str.isalpha, i))  for i in stemmed_tokens if len(''.join(filter(str.isalpha, i))) >1]
    texts.append(alpha_only)

NameError: name 'list_message' is not defined

In [26]:
for i in (range(len(texts))):
    print(Counter(texts[i]).most_common(7))
    print('------------'+str(i))

In [230]:
flat_list = [item for sublist in texts for item in sublist]

In [231]:
len(flat_list)

840580

In [232]:
Counter(flat_list).most_common()

[('price', 21745),
 ('go', 11111),
 ('see', 7671),
 ('buy', 7577),
 ('time', 7492),
 ('bitcoin', 7461),
 ('people', 7436),
 ('year', 7133),
 ('rise', 6339),
 ('happen', 6132),
 ('like', 5832),
 ('sell', 5273),
 ('good', 5221),
 ('market', 5079),
 ('make', 5003),
 ('don', 4807),
 ('even', 4660),
 ('one', 4510),
 ('coin', 4317),
 ('high', 4311),
 ('say', 4173),
 ('know', 4056),
 ('still', 3980),
 ('day', 3953),
 ('long', 3811),
 ('reach', 3700),
 ('much', 3678),
 ('money', 3607),
 ('value', 3381),
 ('back', 3380),
 ('right', 3314),
 ('take', 3300),
 ('come', 3230),
 ('really', 3195),
 ('hold', 3055),
 ('month', 3005),
 ('way', 2966),
 ('also', 2965),
 ('bitcoins', 2950),
 ('look', 2926),
 ('thing', 2905),
 ('big', 2844),
 ('next', 2822),
 ('need', 2779),
 ('new', 2718),
 ('end', 2644),
 ('low', 2619),
 ('lot', 2584),
 ('start', 2550),
 ('want', 2547),
 ('many', 2545),
 ('well', 2503),
 ('trade', 2496),
 ('sure', 2467),
 ('may', 2370),
 ('around', 2368),
 ('drop', 2294),
 ('exchange', 229

# LDA 

In [5]:
import gensim

In [3]:
import datetime

In [4]:
from gensim import corpora, models

a = datetime.datetime.now()

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

b = datetime.datetime.now()
c = b - a

NameError: name 'texts' is not defined

In [38]:
print(c)

0:00:19.882806


In [234]:
a = datetime.datetime.now()
damodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=15, id2word = dictionary, passes=20)
b = datetime.datetime.now()
c = b - a

In [235]:
print( c)


0:04:26.926923


In [15]:
from sklearn.externals import joblib
# joblib.dump(damodel, 'LDA_model.pkl') 

In [20]:
damodel = joblib.load('LDA_model.pkl') 

In [18]:
damodel.get_document_topics

[(173, 0.038102824),
 (175, 0.022032823),
 (281, 0.015734375),
 (42, 0.011334545),
 (118, 0.0071779178),
 (678, 0.0066948608),
 (508, 0.006389695),
 (36, 0.0063665253),
 (119, 0.0063250996),
 (535, 0.005789681)]

In [30]:
["m", "import",  "import", "http", "li", "b", "style" , "font", "span","hr", "d" , "jpg", "png" ]

['m',
 'import',
 'import',
 'http',
 'li',
 'b',
 'style',
 'font',
 'span',
 'hr',
 'd',
 'jpg',
 'png']

In [None]:
'imageshack', 'pt', 'em', 'img'

In [292]:
# get LDA topics:
for i in damodel.print_topics(num_topics=50, num_words=10):
    print(i)
    print('-----------------')

(0, '0.038*"china" + 0.022*"chinese" + 0.016*"exchange" + 0.011*"news" + 0.007*"ban" + 0.007*"time" + 0.006*"people" + 0.006*"like" + 0.006*"bank" + 0.006*"price"')
-----------------
(1, '0.016*"spend" + 0.015*"double" + 0.011*"blockchain" + 0.010*"transaction" + 0.008*"fork" + 0.007*"chain" + 0.006*"address" + 0.005*"see" + 0.005*"info" + 0.004*"merchant"')
-----------------
(2, '0.020*"price" + 0.015*"go" + 0.011*"market" + 0.009*"see" + 0.009*"low" + 0.008*"bitcoin" + 0.007*"buy" + 0.007*"back" + 0.007*"time" + 0.007*"day"')
-----------------
(3, '0.018*"buy" + 0.014*"bitcoin" + 0.013*"sell" + 0.012*"price" + 0.011*"go" + 0.009*"coin" + 0.009*"people" + 0.008*"like" + 0.008*"time" + 0.007*"make"')
-----------------
(4, '0.015*"bitcoinica" + 0.008*"market" + 0.006*"one" + 0.006*"short" + 0.006*"people" + 0.006*"make" + 0.005*"go" + 0.005*"position" + 0.005*"don" + 0.005*"like"')
-----------------
(5, '0.011*"price" + 0.009*"bitcoin" + 0.009*"people" + 0.008*"go" + 0.007*"year" + 0.00

In [39]:
from itertools import chain
# Assigns the topics to the documents in corpus
lda_corpus = damodel[corpus]

# Find the threshold, let's set the threshold to be 1/#clusters,
# To prove that the threshold is sane, we average the sum of all probabilities:
scores = list(chain(*[[score for topic_id,score in topic] for topic in [doc for doc in lda_corpus]]))
threshold = sum(scores)/len(scores)
print (threshold)
print()

# cluster1 = [j for i,j in zip(lda_corpus,texts) if i[0][1] > threshold]
# cluster2 = [j for i,j in zip(lda_corpus,texts) if i[1][1] > threshold]
# cluster3 = [j for i,j in zip(lda_corpus,texts) if i[2][1] > threshold]
# cluster4 = [j for i,j in zip(lda_corpus,texts) if i[3][1] > threshold]
# cluster5 = [j for i,j in zip(lda_corpus,texts) if i[4][1] > threshold]

# print (cluster1)
# print( cluster2)
# print( cluster3)
# print( cluster4)
# print( cluster5)

IndexError: index 119317 is out of bounds for axis 1 with size 119317

In [30]:
print(damodel.print_topics(num_topics=10, num_words=20))

[(0, '0.040*"bitcoin" + 0.012*"peopl" + 0.011*"can" + 0.009*"btc" + 0.008*"use" + 0.008*"like" + 0.007*"money" + 0.007*"just" + 0.007*"think" + 0.006*"get" + 0.006*"coin" + 0.006*"one" + 0.006*"currenc" + 0.006*"make" + 0.006*"don" + 0.005*"valu" + 0.005*"price" + 0.005*"market" + 0.005*"exchang" + 0.005*"go"'), (1, '0.042*"0" + 0.042*"span" + 0.039*"32" + 0.034*"1" + 0.029*"btc" + 0.024*"style" + 0.021*"font" + 0.019*"b" + 0.018*"import" + 0.018*"3em" + 0.018*"height" + 0.018*"size" + 0.018*"line" + 0.016*"36" + 0.012*"http" + 0.012*"li" + 0.011*"12pt" + 0.010*"whaleclub" + 0.009*"co" + 0.009*"hr"'), (2, '0.007*"1" + 0.007*"d" + 0.007*"0" + 0.007*"jpg" + 0.007*"png" + 0.006*"http" + 0.006*"org" + 0.005*"like" + 0.005*"chart" + 0.005*"m" + 0.005*"b" + 0.004*"2" + 0.004*"one" + 0.004*"post" + 0.004*"just" + 0.004*"imgur" + 0.004*"can" + 0.004*"look" + 0.004*"5" + 0.004*"use"'), (3, '0.056*"bitcoin" + 0.050*"price" + 0.015*"think" + 0.014*"year" + 0.014*"can" + 0.014*"now" + 0.013*"go" +

In [72]:
for x in ldamodel.print_topics(num_topics=5, num_words=15):
    print(x)

(0, '0.011*"will" + 0.010*"999" + 0.010*"price" + 0.009*"s" + 0.009*"market" + 0.009*"nbsp" + 0.008*"t" + 0.008*"http" + 0.007*"see" + 0.007*"go" + 0.006*"like" + 0.006*"com" + 0.006*"bitcoin" + 0.006*"now" + 0.006*"time"')
(1, '0.028*"price" + 0.024*"will" + 0.015*"nbsp" + 0.014*"halv" + 0.013*"go" + 0.011*"think" + 0.010*"http" + 0.009*"t" + 0.009*"now" + 0.009*"see" + 0.008*"bitcoin" + 0.008*"com" + 0.007*"s" + 0.007*"400" + 0.007*"500"')
(2, '0.018*"nbsp" + 0.014*"bitcoin" + 0.013*"t" + 0.012*"s" + 0.009*"will" + 0.009*"btc" + 0.008*"just" + 0.007*"buy" + 0.007*"price" + 0.007*"can" + 0.007*"peopl" + 0.007*"like" + 0.006*"go" + 0.006*"get" + 0.006*"market"')
(3, '0.072*"nbsp" + 0.015*"amp" + 0.015*"bitcoin" + 0.010*"s" + 0.010*"http" + 0.009*"t" + 0.008*"will" + 0.008*"1" + 0.007*"btc" + 0.006*"com" + 0.006*"0" + 0.005*"use" + 0.005*"can" + 0.005*"like" + 0.005*"9617"')
(4, '0.045*"bitcoin" + 0.035*"will" + 0.034*"price" + 0.012*"think" + 0.012*"can" + 0.012*"year" + 0.011*"s" + 0.

In [66]:
ldamodel.get_document_topics(corpus[1])

[(2, 0.99671769)]

-----------------

In [34]:
list_message[0]

'Even before I saw how much impact the upcoming fork was having I predicted 6-7K easily by end of year.&nbsp; Much more news about BTC and a lot more adoption.      Now with Fork getting closer everyone is dumping alt coins to buy BTC and get the "free"&nbsp; BTG coins.&nbsp; Has created a selling of alt coin and buying frenzy of BTC.               I dumped a decent amount into alt coins in the last week as they had dropped 25-60% in about 7 days across the board.&nbsp; Sure makes for a great buying opportunity of alt coins.&nbsp;      You get alts for cheaper and added value of your BTC to buy them with.&nbsp;           BTC is up $1470 since last week&nbsp; :o&nbsp; &nbsp;Smashed through 5K barrier like it didn\'t exist. I dont see 6K being a barrier either with this kind of frenzy. 7K is likely and I wouldn\'t be surprised at all to see it touch 8K. There is BTC Gold in them thar hills........&nbsp; &nbsp;;D                    &nbsp; 1470 in 1 week is huge, massive.     Next step is 

In [59]:
texts = []
for msg in range(len(list_message)):
    # Convert the document into lowercase
    raw = list_message[msg].lower()
    # Tokenize the document: tokens
    tokens = tokenizer.tokenize(raw)
    
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stopped_tokens = [i for i in stopped_tokens if not i in remove_words]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)
    
    

In [None]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)

# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])
corpus[0]

In [None]:
# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
    
# Create the defaultdict: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Create a sorted list from the defaultdict: sorted_word_count 
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)


In [7]:
df = pd.read_csv("pos_neg.csv")

In [8]:
df

Unnamed: 0.1,Unnamed: 0,timestamp,category_id,topic_id,topic_title,message_number,message_author,message_text,quoteheader,pos,neg,neu,compound
0,0,1514379921,57,2653937,16.5K will be the major resistance for BTC ?,1,AICoin_Official,"As you can see, price started a correction imm...",,0.082,0.041,0.876,0.9201
1,1,1514380918,57,2653937,16.5K will be the major resistance for BTC ?,2,nokati,And&nbsp; who is this Kuang Ren with such a wi...,[],0.112,0.033,0.854,0.7476
2,2,1514381025,57,2653937,16.5K will be the major resistance for BTC ?,3,fabiorem,"Yes, theres a lot of resistance at this level,...",[],0.110,0.000,0.890,0.2144
3,3,1514382464,57,2653937,16.5K will be the major resistance for BTC ?,4,heringasem,With the current price fluctuations the price ...,[],0.127,0.058,0.815,0.4939
4,4,1514384271,57,2653937,16.5K will be the major resistance for BTC ?,5,CuDoCuDau,You should buy bitcoin,"['<div class=""quoteheader""><a href=""https://bi...",0.000,0.000,1.000,0.0000
5,5,1514390315,57,2653937,16.5K will be the major resistance for BTC ?,6,BrewMaster,there are currently a couple of things going o...,[],0.169,0.000,0.831,0.8555
6,6,1514393063,57,2653937,16.5K will be the major resistance for BTC ?,7,CuDoCuDau,I think btc will increase $30000 in next year,"['<div class=""quoteheader""><a href=""https://bi...",0.247,0.000,0.753,0.3182
7,7,1514395791,57,2653937,16.5K will be the major resistance for BTC ?,8,Yaunfitda,"In my view, $16K is another barrier that ...","['<div class=""quoteheader""><a href=""https://bi...",0.128,0.113,0.759,0.5106
8,8,1514397927,57,2653937,16.5K will be the major resistance for BTC ?,9,mrcash02,Bitcoin has already hit $19.000 previously. No...,[],0.123,0.029,0.848,0.9031
9,9,1514405189,57,2653937,16.5K will be the major resistance for BTC ?,10,Gaaara,Nope I guess that can only happen after t...,"['<div class=""quoteheader""><a href=""https://bi...",0.090,0.020,0.889,0.5859


In [10]:
damodel.get_document_topics(corpus[1])

NameError: name 'corpus' is not defined