In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            # print(f'These are the top comments: {documents[doc_index]}')
            pass

df = pd.read_csv('expanded_comments.csv')

def preprocess(text):
    t = text.lower()
    # t = re.sub('_',r'',t)
    # t = re.sub('\d+',r'',t)
    t = re.sub(r'@[^ ]*',r'',t)
    t = re.sub(r'\W+',r' ',t)
    t = re.sub(r'(could|would|like|list|net|mailto|subject|http)', '', t)
    t = re.sub(r'\b\w{1,3}\b', '', t)
    stopwords_list = stopwords.words('english')
    txt = ' '.join([word for word in t.split() if word not in stopwords_list])
    return txt

lemmatizer = WordNetLemmatizer()


df['documents'] = [' '.join([lemmatizer.lemmatize(preprocess(email))])
                 .strip() for email in df['comment']]

documents = df['documents']

no_features = 1000


# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 5

# Run LDA
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=42).fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

no_top_words = 15
no_top_documents = 500
display_topics(lda_H, lda_W, tf_feature_names, df.comment, no_top_words, no_top_documents)


Topic 0:
people human technology think humans make know time world need want control going nuclear humanity
Topic 1:
corporations hold responsible earth regulation legislation ethical companies corporate profit long greed rein laws species
Topic 2:
people tech companies government hands kill stop american threat executives gets liable agree thing generated
Topic 3:
human systems world science self computer energy maybe mind argue improve fiction destroy year machine
Topic 4:
intelligence artificial good capitalism industry humans government target href _blank extinction china technology stupidity destruction


In [2]:
from wordcloud import WordCloud


In [3]:
topic_1 = 'people human technology think humans make know time world need want control going nuclear humanity'.split()
topic_2 = 'corporations hold responsible earth regulation legislation ethical companies corporate profit long greed rein laws species'.split()
topic_3 = 'people tech companies government hands kill stop american threat executives gets liable agree thing generated'.split()
topic_4 = 'human systems world science self computer energy maybe mind argue improve fiction destroy year machine'.split()
topic_5 = 'intelligence artificial good capitalism industry humans government target href _blank extinction china technology stupidity destruction'.split()

In [4]:
string = ','.join(list(topic_1))
wordcloud = WordCloud(background_color="lightblue", max_words=250, contour_width=3, contour_color='steelblue',colormap='Accent_r')
wordcloud.generate(string)
wordcloud.to_file('topic1.png')

<wordcloud.wordcloud.WordCloud at 0x1de1bd3e650>

In [5]:
string = ','.join(list(topic_2))
wordcloud = WordCloud(background_color="lightblue", max_words=250, contour_width=3, contour_color='steelblue',colormap='Accent_r')
wordcloud.generate(string)
wordcloud.to_file('topic2.png')

<wordcloud.wordcloud.WordCloud at 0x1de1c3b43d0>

In [6]:
string = ','.join(list(topic_3))
wordcloud = WordCloud(background_color="lightblue", max_words=250, contour_width=3, contour_color='steelblue',colormap='Accent_r')
wordcloud.generate(string)
wordcloud.to_file('topic3.png')

<wordcloud.wordcloud.WordCloud at 0x1de65784c10>

In [7]:
string = ','.join(list(topic_4))
wordcloud = WordCloud(background_color="lightblue", max_words=250, contour_width=3, contour_color='steelblue',colormap='Accent_r')
wordcloud.generate(string)
wordcloud.to_file('topic4.png')

<wordcloud.wordcloud.WordCloud at 0x1de1b813a50>

In [8]:
string = ','.join(list(topic_5))
wordcloud = WordCloud(background_color="lightblue", max_words=250, contour_width=3, contour_color='steelblue',colormap='Accent_r')
wordcloud.generate(string)
wordcloud.to_file('topic5.png')

<wordcloud.wordcloud.WordCloud at 0x1de1c4b6850>

In [4]:
topic = pd.DataFrame(lda_W.argmax(axis=1))

df_top = pd.concat([df,topic],axis=1)


df_top.columns = ['drop','comment_id','name','location','comment','time','likes','rep_count','replies','parent_id','pre_com','drop2','topic']

df_top.drop('drop',axis=1,inplace=True)
df_top.drop('drop2',axis=1,inplace=True)


In [5]:
df_top.parent_id.fillna(0,inplace=True)
df_top['parent_id'] = df_top.parent_id.astype(int)

In [6]:
# df_top.to_csv('final_comms.csv')

In [7]:
import plotly.graph_objects as go

import networkx as nx

In [9]:
df_top['topic'] = df_top.topic.apply(lambda x: 'Topic 1' if x == 0 else 'Topic 2' if x == 1 else 'Topic 3' if x == 2 else 'Topic 4' if x == 3 else 'Topic 5')

In [10]:
df_top

Unnamed: 0,comment_id,name,location,comment,time,likes,rep_count,replies,parent_id,pre_com,topic
0,125405146,Paul,Milwaukee,It is very possible that social media have alr...,2023-05-30 14:37:15,522,5,"[{'commentID': 125405281, 'status': 'approved'...",0,possible social media already cost us american...,Topic 3
1,125405281,Chris V,Detroit,@Paul I believe that accurate beliefs are abso...,2023-05-30 14:46:32,61,0,[],125405146,believe accurate beliefs absolutely necessary ...,Topic 1
2,125405707,Paul,Milwaukee,@Chris V Our cognitive systems are not designe...,2023-05-30 15:11:42,22,0,[],125405281,v cognitive systems designed natural selection...,Topic 1
3,125413598,Chris Burks,"Mount Vernon, WA",@Paul \nAbsolutely agree. I don’t think even l...,2023-05-30 20:37:47,12,0,[],125405146,absolutely agree think even liberal educated p...,Topic 1
4,125413969,L Kim,Seattle,@Paul Belief is a a trait that developed late ...,2023-05-30 20:53:27,1,0,[],125405146,belief trait developed late evolution necessary,Topic 2
...,...,...,...,...,...,...,...,...,...,...,...
1443,125431033,C,"N.,Y,",Purdue Pharma's Sackler family paid $6 billion...,2023-05-31 15:57:59,218,0,[],0,purdue pharma sackler family paid 6 billion do...,Topic 2
1444,125430839,RjW,RollingPrairie,"Unless prevented, the nihilistic qualities of ...",2023-05-31 15:48:57,39,0,[],0,unless prevented nihilistic qualities currentl...,Topic 1
1445,125428441,LB,U.S.,"Putting ""the"" government in charge of things i...",2023-05-31 13:27:24,100,0,[],0,putting government charge things hardly soluti...,Topic 5
1446,125426057,JL Turriff,Concordia MO,This is just another example of 'extinction by...,2023-05-31 06:16:50,157,0,[],0,another example extinction stupidity much glob...,Topic 5


In [11]:

G = nx.Graph()



# topic_col = {
#             0:'red',
#             1:'blue',
#             2:'green',
#             3:'yellow',
#             4:'orange'
# }

topic_col = {
            'Topic 1':'teal',
            'Topic 2':'gold',
            'Topic 3':'lightgreen',
            'Topic 4':'lightpink',
            'Topic 5':'purple'
}

for idx, row in df_top.iterrows():
    G.add_node(row['comment_id'],color=topic_col[row['topic']],hover=row['comment'],topic=row['topic'])


for idx, row in df_top.iterrows():
    for node in G.nodes:
        if row['parent_id'] == node:
            G.add_edge(node, row['comment_id'])

pos = nx.fruchterman_reingold_layout(G)



In [12]:


# Create a Plotly figure
fig = go.Figure()

# Add scatter trace for nodes with assigned positions
for node, position in pos.items():
    x, y = position  # Get the position of the node
    color = G.nodes[node]['color']  # Get the color attribute of the node
    hover = G.nodes[node]['hover']
    topic = G.nodes[node]['topic']
    fig.add_trace(go.Scatter(x=[x], y=[y], marker=dict(size=10,color=color), hovertext=hover,name=topic, hovertemplate=hover))

for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    fig.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1), name='Edge'))

# Update the layout of the figure
fig.update_layout(showlegend=False, title='Comments Graph')

fig.update_layout(xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                  yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))

# Show the figure
fig.show()

fig.write_html('graph_coms_topics.html')


In [None]:

edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos']
    x1, y1 = G.nodes[edge[1]]['pos']
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line_width=2))

KeyError: 'pos'

In [None]:
node_adjacencies = []
node_text = []
for node, adjacencies in enumerate(G.adjacency()):
    node_adjacencies.append(len(adjacencies[1]))
    node_text.append('# of connections: '+str(len(adjacencies[1])))

node_trace.marker.color = node_adjacencies
node_trace.text = node_text

NameError: name 'node_trace' is not defined

In [None]:
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Network graph made with Python',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="Python code: <a href='https://plotly.com/ipython-notebooks/network-graphs/'> https://plotly.com/ipython-notebooks/network-graphs/</a>",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed