# Reading a CSV file via DictReader as a list of dicts

In [35]:
from csv import DictReader


def read_csv(file_name):
    """Load a UTF-8 encoded CSV file into a list of row dictionaries.

    The first row of the file supplies the column names; every following
    row becomes one dictionary mapping column name to cell text.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    list of dict
        One mapping per data row, in file order.
    """
    # newline='' leaves line-ending handling to the csv module, so line breaks
    # that sit inside quoted cells (e.g. multi-paragraph article bodies) come
    # back exactly as stored instead of being rewritten by the text layer.
    with open(file_name, 'r', encoding='utf-8', newline='') as table:
        return list(DictReader(table))

In [36]:
# Input CSV; a bare file name is resolved against the notebook's working directory.
file_name = 'news_bodies_1.csv'  # CSV file name or path
dataset = read_csv(file_name)  # list of row dicts, one per CSV data row

In [37]:
# The vectorizer takes a flat list of documents, so keep only each row's article text.
data_set = [row['articleBody'] for row in dataset]

# Topic Modeling using CountVectorizer and LDA

In [39]:
import lda 
from sklearn.feature_extraction.text import CountVectorizer


n_topics = 20  # number of topics passed to lda.LDA
n_iter = 500  # number of iterations passed to lda.LDA

# Bag-of-words counts over the article bodies: min_df=5 drops terms that occur in
# fewer than 5 documents, stop_words='english' removes the built-in English stop list.
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(data_set)

In [40]:
# Fit LDA on the document-term counts. The sampler is stochastic, so the seed is
# pinned (same value as the t-SNE step) to make the topics, and therefore the
# plot built from them, reproducible across kernel restarts.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
# One row per document, one column per topic (see how x_topics.shape is used below).
x_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 904
INFO:lda:vocab_size: 5230
INFO:lda:n_words: 155948
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -1930052
INFO:lda:<10> log likelihood: -1287926
INFO:lda:<20> log likelihood: -1240373
INFO:lda:<30> log likelihood: -1220963
INFO:lda:<40> log likelihood: -1209656
INFO:lda:<50> log likelihood: -1203124
INFO:lda:<60> log likelihood: -1197701
INFO:lda:<70> log likelihood: -1194247
INFO:lda:<80> log likelihood: -1192181
INFO:lda:<90> log likelihood: -1189274
INFO:lda:<100> log likelihood: -1187357
INFO:lda:<110> log likelihood: -1185771
INFO:lda:<120> log likelihood: -1183405
INFO:lda:<130> log likelihood: -1181352
INFO:lda:<140> log likelihood: -1179629
INFO:lda:<150> log likelihood: -1178240
INFO:lda:<160> log likelihood: -1177393
INFO:lda:<170> log likelihood: -1177466
INFO:lda:<180> log likelihood: -1176676
INFO:lda:<190> log likelihood: -1176106
INFO:lda:<200> log likelihood: -117

# Reducing the dimensions from 20 -> 2 using t-SNE

In [42]:
from sklearn.manifold import TSNE

# Embed the per-document topic vectors in two dimensions for plotting.
tsne_model = TSNE(
    n_components=2,
    verbose=1,
    random_state=0,
    angle=0.99,
    init='pca',
)
tsne_lda = tsne_model.fit_transform(x_topics)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 904 samples in 0.001s...
[t-SNE] Computed neighbors for 904 samples in 0.031s...
[t-SNE] Computed conditional probabilities for sample 904 / 904
[t-SNE] Mean sigma: 0.165059
[t-SNE] KL divergence after 250 iterations with early exaggeration: 48.927460
[t-SNE] KL divergence after 1000 iterations: 0.262631


In [44]:
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

# How many of a topic's highest-weight words are joined into its on-plot label.
n_top_words = 5 # the number of representing words for each cluster of documents

# 20 hex colours, one per topic index (n_topics is 20). Stored as a NumPy array so
# it can be indexed with a whole list of topic ids at once when colouring points.
colormap = np.array([
   "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
   "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
   "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
   "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [46]:
# Dominant topic of every document: the column index holding the largest weight in
# that document's row of x_topics. Kept as a plain list because later cells call
# list.index on it and slice it.
_lda_keys = x_topics.argmax(axis=1).tolist()

In [48]:
# Build one short label per topic from its n_top_words highest-weight vocabulary terms.
topic_summaries = []
topic_word = lda_model.topic_word_  # one row of word weights per topic

# get_feature_names() is absent from recent scikit-learn releases; prefer the
# replacement accessor when the installed version has it, else use the old one.
_feature_names = getattr(cvectorizer, 'get_feature_names_out', None) or cvectorizer.get_feature_names
vocab = list(_feature_names())

# Convert once, outside the loop, so the array is not rebuilt for every topic.
_vocab_arr = np.array(vocab)
for topic_dist in topic_word:
    # argsort is ascending; the reversed tail slice yields the top indices, largest first.
    top_idx = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(_vocab_arr[top_idx]))

In [60]:
# Scatter plot of the 2-D t-SNE embedding, one point per document, coloured by the
# document's dominant topic.
title = 'news sample topics'
num_example = len(x_topics)

# 'save' is the current name of the tool formerly spelled 'previewsave'; the old
# alias is rejected by later Bokeh releases.
# NOTE(review): plot_width / plot_height are kept as written; newer Bokeh versions
# spell these width / height — confirm against the installed version before upgrading.
plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                    title=title,
                    tools='pan,wheel_zoom,box_zoom,reset,hover,save',
                    x_axis_type=None, y_axis_type=None, min_border=1)


ys=tsne_lda[:, 1]
xs=tsne_lda[:, 0]
# Columns referenced by name from the glyph below and from the hover tooltip.
source = bp.ColumnDataSource(data = dict(
                x = xs,
                y = ys,
                color=colormap[_lda_keys][:num_example],  # topic id -> hex colour
                content = data_set[:num_example],         # article text shown on hover
                topic_key = _lda_keys[:num_example]       # dominant topic id shown on hover
                  ))
plot_lda.scatter(x ='x',
                 y ='y', 
                source=source,
                color ='color')

In [61]:
# Anchor each topic's keyword label at the t-SNE position of the FIRST document
# whose dominant topic it is. Topics that no document maps to keep NaN coordinates.
topic_coord = np.full((x_topics.shape[1], 2), np.nan)
# np.unique(..., return_index=True) yields every topic id that occurs together with
# the index of its first occurrence, replacing a list.index scan per document.
_topic_ids, _first_doc_idx = np.unique(np.asarray(_lda_keys, dtype=int), return_index=True)
topic_coord[_topic_ids] = tsne_lda[_first_doc_idx]

# Draw each topic's keyword summary at that topic's anchor coordinate.
for topic_idx in range(x_topics.shape[1]):
    label_x = topic_coord[topic_idx, 0]
    label_y = topic_coord[topic_idx, 1]
    plot_lda.text(label_x, label_y, [topic_summaries[topic_idx]])

# Configure the hover tool: show the article text and its dominant topic id,
# both read from the ColumnDataSource columns of the same names.
hover = plot_lda.select({'type': HoverTool})
hover.tooltips = {"content": "@content - topic: @topic_key"}

# Write the figure to an HTML file named after the plot title.
output_html = '{}.html'.format(title)
save(plot_lda, output_html)

'/home/nima/nima/stance_detection/fakenewschalllenge_baseline/fnc-1-baseline/news sample topics.html'