In [30]:
# Packages for reading and wrangling data
import pandas as pd              # For reading data into DataFrames
import regex as re               # For string manipulation

# Packages for AI
import torch
import pandas as pd
from io import StringIO
from   sentence_transformers import SentenceTransformer # conda install sentence-transformer
from   sentence_transformers import util
import nltk
import regex as re

# Packages for clustering
import numpy as np
from sklearn.manifold import TSNE

# Packages for visualization
import plotly.express as px      # Flexible drawing package
from wordcloud import WordCloud  # Draws just wordclouds
from textblob import TextBlob    # For sentiment analysis

# Packages for network diagrams
import networkx as nx
import plotly.graph_objects as go



In [24]:
# Input CSV for the notebook; later cells derive their output filenames from
# this name.
dataset = 'using-ai-in-schools.csv'

# Read the file into the shared DataFrame and show its column names as the
# cell output.
df = pd.read_csv(filepath_or_buffer=dataset, encoding='utf-8')
df.columns

Index(['reply', 'id', 'parent_id', 'date', 'author', 'title', 'score',
       'replies', 'text'],
      dtype='object')

In [25]:

# Build one corpus string from every non-missing post body
text = ' '.join(df['text'].dropna())

# Word-cloud settings: square white canvas, capped font size and word count,
# collocations switched off.
cloud_options = dict(
    width=1024,
    height=1024,
    background_color='white',
    max_font_size=300,
    max_words=200,
    collocations=False,
)
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image with Plotly, hiding both axes and the title
fig = px.imshow(wordcloud)
fig.update_layout(
    title_text='',
    width=1024,
    height=1024,
    xaxis_visible=False,
    yaxis_visible=False,
)
fig.show()

In [26]:

# Score every post with TextBlob's polarity. str() is applied first so that
# rows with missing text (NaN) still get a score instead of raising.
df['sentiment'] = [TextBlob(str(post)).sentiment.polarity for post in df.text]

# Derive the output filename from the input one: strip a trailing ".csv" and
# append "-sentiment.csv". The stem is computed outside the f-string because
# reusing the enclosing quote character (and a backslash) inside an f-string
# replacement field is a SyntaxError before Python 3.12. The pattern is
# anchored with "$" so only the file extension is removed.
dataset_stem = re.sub(r'\.csv$', '', dataset)
dataset_sentiment = f'{dataset_stem}-sentiment.csv'
print(dataset_sentiment)

# Save the DataFrame, now including the sentiment column
df.to_csv(dataset_sentiment, encoding='utf-8')

using-ai-in-schools-sentiment.csv


In [27]:
df

Unnamed: 0,reply,id,parent_id,date,author,title,score,replies,text,sentiment
0,0,16ir9c7,,2023-09-14 19:20:13 UTC,choganoga,Using AI in schools,7499,316,,0.000000
1,1,k0ldcuc,t3_16ir9c7,2023-09-14 19:20:30 UTC,AutoModerator,Using AI in schools,1,0,\n**Welcome to r/TikTokCringe!**\n\n This is a...,0.211250
2,2,k0lw19n,t3_16ir9c7,2023-09-14 21:10:27 UTC,OurHonor1870,Using AI in schools,1711,14,Our education system is based around memorizin...,0.300000
3,3,k0lswnj,t3_16ir9c7,2023-09-14 20:51:43 UTC,___Binary___,Using AI in schools,703,7,"First off, all good points. Adapt. That’s the ...",0.330000
4,4,k0mhapn,t3_16ir9c7,2023-09-14 23:31:38 UTC,doubtwithout1,Using AI in schools,307,2,Getting ready for all the 20 y/o cryptocurrenc...,0.350000
...,...,...,...,...,...,...,...,...,...,...
312,312,k0oz58v,t1_k0oy2zh,2023-09-15 13:18:37 UTC,Plantarbre,Using AI in schools,0,1,"And I use it everyday too, and I'm a mathemati...",-0.075000
313,313,k0pa0uq,t1_k0p9he8,2023-09-15 14:30:46 UTC,,Using AI in schools,1,0,Hope not. I have PSTD from reading shit paper...,0.042857
314,314,k0p08ys,t1_k0oz58v,2023-09-15 13:26:15 UTC,tendadsnokids,Using AI in schools,1,1,"Listen, I'm a teacher at a remedial school for...",0.011111
315,315,k0p0oni,t1_k0p08ys,2023-09-15 13:29:17 UTC,Plantarbre,Using AI in schools,1,1,"Sure, but even if these comments are all burie...",0.200000


In [28]:

# GPU check & model download
################################################################################

# Report whether PyTorch can see a CUDA device. NOTE: `device` is only printed
# here; it is not passed to the model constructor below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
#
# Download necessary models
#
nltk.download('punkt') # nltk tokenizer
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model      = SentenceTransformer(model_name)

# Read dataset
################################################################################
# Commented out because using previous file and data frame
# filename = 'IAM42-Clean.csv'
# df = pd.read_csv(filename, encoding='utf-8')

# Missing post bodies (NaN) are replaced with empty strings so the encoder
# only ever receives real strings rather than float NaN values.
postings = df.text.fillna('').astype(str)

# Create embeddings for postings
################################################################################
# Encode from a plain list of str; one embedding is produced per posting.
posting_embeddings  = model.encode(postings.tolist())

# Store each embedding as a Python list in its own DataFrame cell
df['embedding'] = posting_embeddings.tolist()

# Derive the output filename from the sentiment file: strip a trailing ".csv"
# and append "-embedding.csv". The stem is computed outside the f-string
# because nested same-type quotes and backslashes inside an f-string
# replacement field are a SyntaxError before Python 3.12.
embedding_stem = re.sub(r'\.csv$', '', dataset_sentiment)
dataset_embedding = f'{embedding_stem}-embedding.csv'
print(dataset_embedding)

df.to_csv(dataset_embedding, encoding='utf-8')
# %%


cuda


[nltk_data] Downloading package punkt to /home/professorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


using-ai-in-schools-sentiment-embedding.csv


In [29]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

# Assuming df['embedding'] contains the embeddings and df['text'] contains the original text
# Stack the per-post embedding lists into one 2-D array (one row per post).
embeddings = np.array(df['embedding'].tolist())

# Perform t-SNE to reduce embeddings to 2D. The iteration count is left at the
# library default (1000, the value previously passed explicitly) because the
# keyword for it was renamed between scikit-learn releases.
tsne = TSNE(n_components=2, random_state=42, perplexity=7)
tsne_results = tsne.fit_transform(embeddings)

# Add t-SNE results to a copy so that the shared `df` used by other cells is
# not silently modified by this cell.
dfc = df.copy()
dfc['X'] = tsne_results[:, 0]
dfc['Y'] = tsne_results[:, 1]

# Create a scatter plot using Plotly, with hover text showing the original post text.
# Each post is truncated to its first 120 characters; str() keeps this safe
# for missing or non-string values.
text80 = [str(post)[:120] for post in dfc.text]
fig = px.scatter(
    dfc,
    x='X',
    y='Y',
    # Hide the raw coordinates on hover; show only the post number and text
    hover_data={'X': False, 'Y': False,
                'post #': dfc.reply, 'post': text80},
    title='t-SNE Clustering of Posts'
)

fig.update_layout(width=1024, height=1024, xaxis=dict(visible=False), yaxis=dict(visible=False))
# Show the plot
fig.show()


In [70]:
# Process the parent_id field to remove the two-character type prefix
# (e.g. "t3_", "t1_") so the values match the bare ids in the `id` column.
# Missing parents are left missing rather than being converted to the
# literal string "nan".
df['parent_id'] = df['parent_id'].map(
    lambda label: re.sub(r'^.{2}_', '', str(label)) if pd.notna(label) else label
)

# Create a directed graph: one node per post, one edge from each reply to its parent
G = nx.DiGraph()

for post_id, author, parent_id in zip(df['id'], df['author'], df['parent_id']):
    # Posts without an author get the same fallback label used for
    # unlabeled nodes when the hover text is built below.
    node_label = author if pd.notna(author) else 'No Label'
    G.add_node(post_id, label=node_label)
    # A post with no parent gets no outgoing edge; adding one would create
    # a phantom "nan" node.
    if pd.notna(parent_id):
        G.add_edge(post_id, parent_id)

# Position the nodes using a layout algorithm (seeded so the picture is reproducible)
pos = nx.spring_layout(G, seed = 14)
# pos = nx.shell_layout(G)
# pos = nx.kamada_kawai_layout(G)

# Collect edge coordinates; the None after each pair breaks the line so that
# every edge is drawn as a separate segment within a single trace.
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

# Create an edge trace for the plot
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Collect node coordinates and hover labels. Nodes that exist only as the
# target of an edge have no 'label' attribute, hence the fallback.
node_x, node_y, node_text = [], [], []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(G.nodes[node].get('label', 'No Label'))

# Create a node trace
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    text=node_text,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale = False,
        # colorscale='YlGnBu',
        color = 'black',
        size=5
    )
)

# Create the figure. `title_font_size` replaces the deprecated
# `titlefont_size` spelling; size and axis visibility are set here in one
# place instead of in a follow-up update_layout call.
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='<br>Social Network Diagram',
                    title_x = 0.5,
                    title_font_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0, l=0, r=0, t=0),
                    width=640,
                    height=640,
                    xaxis=dict(showgrid=False, zeroline=False, visible=False),
                    yaxis=dict(showgrid=False, zeroline=False, visible=False))
                )

fig.show()
