In [1]:
import pandas as pd
import subprocess
import os

from datetime import date
import json
import praw

import time
from nltk.stem import *
import re
import string 
import numpy as np

from collections import Counter
import nltk
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import matplotlib.pyplot as plt
import hdbscan



"""ACTUAL SCRIPT"""
#This scrapes the postings
def scrape_postings(subreddit):
    """Run the ``./post_scraper.sh`` helper script for one subreddit.

    The subreddit name is passed as the script's only argument and the
    text "Y" is fed to its standard input.

    Args:
        subreddit: Subreddit name handed to the scraper script.
    """
    command = ["./post_scraper.sh", subreddit]
    subprocess.run(command, input="Y", text=True)


def load_data(today, subreddit):
    """Load one scrape file and derive text and numeric features from it.

    Reads ``URS-master/scrapes/<today>/r-<subreddit>-Top-1000-results-past-day.json``
    and transposes it so that each row is one post.

    Args:
        today: Name of the scrape sub-directory to read from.
        subreddit: Subreddit name as it appears in the scrape file name.

    Returns:
        A ``(keep, vectors)`` tuple of DataFrames sharing the same index.
        ``keep`` is an independent copy holding the "Title", "Text", "Flair",
        "Date Created", "ID", "URL" and "Cleaned Text" columns. ``vectors``
        holds every remaining column except the "Is Locked?", "Is Spoiler?",
        "NSFW?", "Stickied?" and "Edited?" flags.
    """
    #Preprocessing
    path = "URS-master/scrapes/{}/r-{}-Top-1000-results-past-day.json".format(today, subreddit)
    data = pd.read_json(path).transpose()

    ps = PorterStemmer()
    # Build the translation table and the stopword set once instead of once per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    def clean(title):
        # Drop punctuation and newlines, stem each lower-cased token, then drop stopwords.
        text = title.translate(strip_punctuation).replace("\n", "")
        stems = [ps.stem(token.lower()) for token in str(text).split(" ")]
        return " ".join(stem for stem in stems if stem not in stop_words)

    data["Cleaned Text"] = data["Title"].apply(clean)

    #Turning time active as a feature to see freshness
    data["Date Created"] = pd.to_datetime(data['Date Created'])
    elapsed = data["Date Created"] - data["Date Created"].min()
    # Whole minutes as floats. astype("timedelta64[m]") is rejected by pandas >= 2.0.
    data["Time elapsed (mins)"] = elapsed.dt.total_seconds() // 60

    #Seeing if there is an image
    data["has image"] = data["URL"].apply(lambda url: int(url.endswith(".jpg")))

    #Categorizing Comment Upvotes by Range of Upvotes
    # Quartile cut points come from the distribution of the "Upvotes" column itself.
    # The previous code took percentiles of a single scalar (the maximum upvote
    # ratio), so every threshold collapsed to that one number.
    upvotes = pd.to_numeric(data["Upvotes"])
    q25, q50, q75 = np.percentile(upvotes, [25, 50, 75])

    data["Hated Comment"] = (upvotes <= q25).astype(int)
    data["Disliked Comment"] = ((upvotes > q25) & (upvotes <= q50)).astype(int)
    data["Likable Comment"] = ((upvotes > q50) & (upvotes <= q75)).astype(int)
    data["Loved Comment"] = (upvotes > q75).astype(int)

    # Copy so that callers can add columns to `keep` without writing into a slice of `data`.
    keep = data[["Title", "Text", "Flair", "Date Created", "ID", "URL","Cleaned Text"]].copy()
    excluded = list(keep.columns) + ["Is Locked?","Is Spoiler?", "NSFW?","Stickied?", "Edited?"]
    vectors = data[data.columns.difference(excluded)]
    return keep, vectors
    
"""BERT CLUSTERING PART"""
def get_words(label_heading):
    """Return the ten most common non-stopword title words for each label.

    Reads the module-level ``keep`` DataFrame, groups its rows by the
    ``label_heading`` column and counts the lower-cased, whitespace-separated
    words of every "Title" in each group.

    Args:
        label_heading: Name of the ``keep`` column that holds the labels.

    Returns:
        A dict mapping each label to the list of ``(word, count)`` pairs
        produced by ``Counter.most_common(10)``.
    """
    #Looking at the most common words in each label
    # Load the stopword list once; a set gives constant-time membership tests.
    stop_words = set(stopwords.words("english"))

    out = dict()
    for label, titles in keep.groupby(label_heading)["Title"]:
        counts = Counter()
        for title in titles:
            # Lower-case before the stopword test so that capitalised
            # stopwords ("The", "And", ...) are filtered out as well.
            lowered = (word.lower() for word in title.split())
            counts.update(word for word in lowered if word not in stop_words)
        out[label] = counts.most_common(10)
    return out


def BERT():
    """Cluster the titles in ``keep`` on their sentence embeddings.

    Encodes every "Title" of the module-level ``keep`` DataFrame with the
    module-level ``model``, fits HDBSCAN on the embeddings, and stores the
    results in the "BERT label" and "BERT label words" columns of ``keep``.

    Returns:
        A DataFrame with one row of embedding values per title.
    """
    # One embedding per title, stacked into a frame with one row per post.
    embeddings = [model.encode(title) for title in keep["Title"]]
    bert = pd.DataFrame(data=embeddings)

    hdb = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=2)
    hdb.fit(bert)
    keep["BERT label"] = hdb.labels_

    #Getting all words associated with particular label
    words_by_label = get_words("BERT label")
    keep["BERT label words"] = keep["BERT label"].apply(lambda label: words_by_label[label])

    return bert

def TFIDF():
    """Cluster the posts in ``keep`` on TF-IDF vectors of their cleaned text.

    Fits a ``TfidfVectorizer`` on the "Cleaned Text" column of the
    module-level ``keep`` DataFrame, fits HDBSCAN on the dense TF-IDF matrix,
    and stores the results in the "TFIDF label" and "TFIDF label words"
    columns of ``keep``.

    Returns:
        A DataFrame of TF-IDF weights with one row per post and one column
        per vocabulary term.
    """
    v = TfidfVectorizer()
    x = v.fit_transform(keep['Cleaned Text'])

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(v, "get_feature_names_out"):
        feature_names = v.get_feature_names_out()
    else:
        feature_names = v.get_feature_names()
    tfidf = pd.DataFrame(x.toarray(), columns=feature_names)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=True)
    clusterer.fit(tfidf)
    keep["TFIDF label"] = clusterer.labels_

    #Getting all words associated with particular label
    out = get_words("TFIDF label")

    keep["TFIDF label words"] = keep["TFIDF label"].apply(lambda x: out[x])
    return tfidf
    
    
def TFIDF_BERT():
    """Cluster the posts on the embedding and TF-IDF features side by side.

    Joins the module-level ``bert`` and ``tfidf`` DataFrames column-wise,
    fits HDBSCAN on the combined frame, and stores the results in the
    "BERT + TFIDF label" and "BERT + TFIDF label words" columns of the
    module-level ``keep`` DataFrame. Nothing is returned.
    """
    combined = pd.concat([bert, tfidf], axis="columns")

    hdb = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=2)
    hdb.fit(combined)
    keep["BERT + TFIDF label"] = hdb.labels_

    #Getting all words associated with particular label
    words_by_label = get_words("BERT + TFIDF label")
    keep["BERT + TFIDF label words"] = keep["BERT + TFIDF label"].apply(
        lambda label: words_by_label[label]
    )

"""USER INPUTS HERE - CAN USE SYS.ARGV SO THIS RUNS AS PYTHON SCRIPT"""
# Scrape-directory name for this run: month-day-year, without zero padding.
# NOTE(review): confirm this matches the directory names the scraper writes.
today = date.today()
today = "{}-{}-{}".format(today.month, today.day, today.year)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs("Clusters", exist_ok=True)

model = SentenceTransformer('bert-large-uncased')
for subreddit in ["WorldNews", "Cornell", "buildapcsales"]:
    scrape_postings(subreddit)

    # keep, bert and tfidf are module-level names read by BERT(), TFIDF(),
    # TFIDF_BERT() and get_words(), so they must be rebound on every pass.
    keep, vectors = load_data(today, subreddit)
    bert = BERT()
    tfidf = TFIDF()
    TFIDF_BERT()

    # Per-post report: identifying columns and cluster labels next to the
    # engagement metrics.
    a = keep[["Title", "URL", "ID", "BERT label", "BERT label words", "TFIDF label", "TFIDF label words", "BERT + TFIDF label", "BERT + TFIDF label words"]]
    b = vectors[["Comment Count","Time elapsed (mins)", "Upvotes", "Upvote Ratio"]]
    c = pd.concat([a,b], axis=1)
    c.to_csv("Clusters/{}_{}_labels.csv".format(today, subreddit))



Exception when trying to download https://sbert.net/models/bert-large-uncased.zip. Response 404


In [6]:
c

Unnamed: 0,Title,URL,BERT label,BERT label words,TFIDF label,TFIDF label words,BERT + TFIDF label,BERT + TFIDF label words,Comment Count,Time elapsed (mins),Upvotes,Upvote Ratio
Post 1,"[Monitor] AOC C24G1A 24"" Curved Frameless Gami...",https://smile.amazon.com/dp/B08D3Y5PFZ,7,"[([monitor], 4), (-, 3), (gaming, 2), (monitor...",4,"[([monitor], 5), (-, 5), (ips, 3), (aoc, 2), (...",6,"[([monitor], 4), (-, 3), (gaming, 2), (monitor...",100,615.0,623,0.95
Post 2,[PSU] Super flower Ledex III 850W - 114.99$ (1...,https://www.newegg.com/super-flower-leadex-iii...,-1,"[(-, 25), (rgb, 5), (gaming, 5), ([ssd], 4), (...",-1,"[(-, 16), ([case], 3), ([psu], 2), (black, 2),...",-1,"[(-, 25), (rgb, 5), ([ssd], 4), (1tb, 4), ($12...",79,979.0,135,0.95
Post 3,[SSD] SK hynix Gold S31 1TB 3D NAND 2.5 inch S...,https://smile.amazon.com/gp/product/B07SNHB4RC/,8,"[(-, 4), ([ssd], 3), (1tb, 3), (ssd, 3), (sk, ...",8,"[(-, 9), ([ssd], 7), (1tb, 7), (ssd, 6), (inte...",9,"[(-, 4), ([ssd], 3), (1tb, 3), (ssd, 3), (sk, ...",60,1279.0,214,0.98
Post 4,[Ryzen 7 5800x] Ryzen 7 5800x 8-core - $449,https://www.amazon.com/AMD-Ryzen-5800X-16-Thre...,0,"[(7, 3), (-, 3), (ryzen, 2), (5800x, 2), ($449...",9,"[(7, 3), (ryzen, 2), (5800x, 2), (-, 2), ($449...",0,"[(7, 3), (-, 3), (ryzen, 2), (5800x, 2), ($449...",76,520.0,118,0.84
Post 5,[Laptop] Lenovo Legion 5 Laptop: Ryzen 5 4600H...,https://www.walmart.com/ip/Lenovo-Legion-5-AMD...,9,"[(-, 3), ([laptop], 2), (5, 2), (gtx, 2), (108...",2,"[(-, 3), ([laptop], 2), (5, 2), (gtx, 2), (108...",10,"[(-, 3), ([laptop], 2), (5, 2), (gtx, 2), (108...",35,186.0,112,0.89
Post 6,[SSD] SK hynix Gold P31 1TB PCIe NVMe Gen3 M.2...,https://smile.amazon.com/gp/product/B08DKB5LWY,8,"[(-, 4), ([ssd], 3), (1tb, 3), (ssd, 3), (sk, ...",8,"[(-, 9), ([ssd], 7), (1tb, 7), (ssd, 6), (inte...",9,"[(-, 4), ([ssd], 3), (1tb, 3), (ssd, 3), (sk, ...",35,978.0,115,0.94
Post 7,[Case] Phanteks Eclipse P500A DRGB White and B...,https://www.newegg.com/white-phanteks-eclipse-...,-1,"[(-, 25), (rgb, 5), (gaming, 5), ([ssd], 4), (...",-1,"[(-, 16), ([case], 3), ([psu], 2), (black, 2),...",-1,"[(-, 25), (rgb, 5), ([ssd], 4), (1tb, 4), ($12...",53,93.0,108,0.89
Post 8,"[SSD] Team Group T-FORCE VULCAN G 2.5"" 1TB SAT...",https://www.newegg.com/team-group-1tb-t-force-...,-1,"[(-, 25), (rgb, 5), (gaming, 5), ([ssd], 4), (...",8,"[(-, 9), ([ssd], 7), (1tb, 7), (ssd, 6), (inte...",-1,"[(-, 25), (rgb, 5), ([ssd], 4), (1tb, 4), ($12...",23,86.0,96,0.9
Post 9,[Case + Wireless Keyboard/Mouse + Face Masks] ...,https://smile.amazon.com/dp/B07MDJ2RW8,4,"[(+, 4), (wireless, 3), (face, 2), (rosewill, ...",10,"[(+, 6), (rosewill, 4), ([case, 2), (wireless,...",4,"[(+, 4), (wireless, 3), (face, 2), (rosewill, ...",40,665.0,94,0.86
Post 10,[Case] Lian Li O-11 Dynamic (White) NOW IN STO...,https://www.newegg.com/white-lian-li-pc-o11-dy...,-1,"[(-, 25), (rgb, 5), (gaming, 5), ([ssd], 4), (...",7,"[([case], 2), (lian, 2), (li, 2), (-, 2), (o-1...",-1,"[(-, 25), (rgb, 5), ([ssd], 4), (1tb, 4), ($12...",104,274.0,82,0.86
