In [5]:
!pip install bertopic
import bertopic



In [6]:
import re
import csv
import nltk
import pandas as pd
import random
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

def preProcessingFcn(tweet, removeWords=list(), stem=True, removeURL=True, removeStopwords=True,
                     removeNumbers=True, removeHashtags=True, removeAt=True, removePunctuation=True, lem=False):
    ps = PorterStemmer()
    lm = WordNetLemmatizer()
    tweet = tweet.lower()
    if removeURL == True:
        tweet = re.sub(r"http\S+", " ", tweet)
    if removeHashtags == True:
        tweet = tweet.replace('#', ' ')
    if removeAt == True:
        tweet = tweet.replace('@', ' ')
    if removeNumbers == True:
        tweet = ''.join(i for i in tweet if not i.isdigit())
    if removePunctuation == True:
        tweet = re.sub(r"[,.;@#?!&$:]+\ *", " ", tweet)
    if removeStopwords == True:
        tweet = ' '.join([word for word in tweet.split()
                          if word not in stopwords.words('english')])
    if len(removeWords) > 0:
        tweet = ' '.join([word for word in tweet.split()
                         if word not in removeWords])
    if lem == True:
        tweet = ' '.join([lm.lemmatize(word) for word in tweet.split()])
    if stem == True:
        tweet = ' '.join([ps.stem(word) for word in tweet.split()])
    return tweet

''' MAKING HASH MAP '''
# boolean determining whether to use only unigrams
# currently should be True; non-unigrams are not yet included in synonym-based tweet altering
only_unigrams = True

'''
IN: list of "base" words to consider in hash map
OUT: hash map including all inputted base words where key:pair = synonym:base word
note: using version 2 of add_to_mapping (i.e. no separation of finding and adding synonyms)
'''
def make_mapping(words):
    mapping = {}
    for word in words:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemma_names():
                if lemma not in mapping and lemma != word:
                    if (only_unigrams and '_' not in lemma) or not only_unigrams:
                        mapping[lemma] = word
    return mapping


''' ALTERING TWEETS '''
'''
IN: tweets = original corpus, mapping = synonym:base mapping
OUT: altered tweets with all base words added to end of tweet if synonym found in tweet
'''
def alter_tweets_with_ordering(tweets, mapping):
    new_tweets = []
    for tweet in tweets:
        tweet_refactor = ""
        for word in word_tokenize(tweet):
            tweet_refactor = tweet_refactor + " " + word
            if word in mapping:
                tweet_refactor = tweet_refactor + " " + mapping[word]
        tweet_refactor = tweet_refactor[1:]
        new_tweets.append(tweet_refactor)
    return new_tweets

INPUT_DF = pd.read_csv('Vaccine_Trust_cleaned_nostop.csv')
print(INPUT_DF)
corpusLong = []
for index, rows in INPUT_DF.iterrows():
    sentence = preProcessingFcn(rows.text)
    corpusLong.append(sentence)

corpus = []
corpus = corpusLong
#corpus = random.sample(corpusLong, 500)

[nltk_data] Downloading package punkt to /home/joshfink/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/joshfink/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                   text
0     type guy wont take vaccine go 2 tin dip 6 ener...
1     child many country tell get vaccine covidvacci...
2     safe vaccine determine intelligent scientist c...
3     prime minister publicly say didnt trust nvacci...
4     act stupid old enough appreciate amaze medical...
...                                                 ...
9128  baylor religion survey find identify biblical ...
9129  get friend take vaccine cuz trust like bitch w...
9130  trust russian self drive car much trust russia...
9131  leave incapable critical think nvirtue signal ...
9132  sniff k yes idk like trust vaccine know litera...

[9133 rows x 1 columns]


In [7]:
# LOADING MODELS
from sentence_transformers import SentenceTransformer

# Load sentence transformer model
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Create documents embeddings
embeddings = sentence_model.encode(corpus, show_progress_bar=False)
print(len(embeddings), ", ", len(embeddings[0]))

9133 ,  384


In [8]:
import hdbscan
import umap.umap_ as umap
from bertopic import BERTopic

# Define UMAP model to reduce embeddings dimension
umap_model = umap.UMAP(n_neighbors=15,
                       n_components=10,
                       min_dist=0.0,
                       low_memory=False)

# Define HDBSCAN model to perform documents clustering
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=50,
                                min_samples=1,
                                #metric='jaccard',
                                cluster_selection_method='eom',
                                prediction_data=True)

# Create BERTopic model
topic_model = BERTopic(top_n_words=5,
                       n_gram_range=(1,2), 
                       calculate_probabilities=True,
                       umap_model= umap_model,
                       hdbscan_model=hdbscan_model,
                       verbose=True)

In [9]:
# Train model, extract topics and probabilities
topics, probabilities = topic_model.fit_transform(corpus, embeddings)

count = 0
total = 0
for topic_elt in topics:
    total = total + 1
    if topic_elt == -1:
        count = count + 1
print(count)
print(count/total*100)

2021-07-19 13:49:49,195 - BERTopic - Reduced dimensionality with UMAP
2021-07-19 13:49:51,600 - BERTopic - Clustered UMAP embeddings with HDBSCAN


3356
36.745866637468524


In [None]:
#ANIK CSV CODE
umap_embedding = umap_model.fit_transform(embeddings)
print( len(umap_embedding), ", ", len(umap_embedding[0]))
UMAP_DF = pd.DataFrame(umap_embedding)
UMAP_DF.to_csv("umap_9133_10.csv")

In [None]:
topic_model.visualize_distribution(probabilities[0])

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_topics()

In [None]:
print(topic_model.get_topics())


In [None]:
frequency = topic_model.get_topic_freq()
print(frequency)

In [None]:
print(topic_model.get_topics())



In [None]:
topic_model.get_params()

In [17]:
# Vader Sentiment Analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk import tokenize
sid = SentimentIntensityAnalyzer()
import matplotlib
import matplotlib.pyplot as plt

# An "interface" to matplotlib.axes.Axes.hist() method


# sort tweets
print(max(topics))

tweet_clusters = {}
for i in range(len(topics)):
    if topics[i] != -1:
        if topics[i] not in tweet_clusters:
            tweet_clusters[topics[i]] = list()
        tweet_clusters[topics[i]].append(corpus[i])


for i in range(0,max(topics)+1):
    
    for elt in tweet_clusters[i]:
        freq_ss = []
        ss = sid.polarity_scores(elt)
        freq_ss.append(ss[:, 'compound'])
    print("Topic: ", i)
    print(freq_ss)





49


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/joshfink/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


TypeError: unhashable type: 'slice'