In [None]:
import praw


reddit = praw.Reddit(
    client_id="AFaAUdloxpfo7w",
    client_secret="pfpsSSWvRcD1kT25MmeGnN_oB5n4Zg",
    user_agent="personal use script",
)

out = reddit.subreddit("all").search("Tesla Model X", limit=1000)
out

In [None]:
ids = []
for i in out:
    ids.append(i.id)
    print(i.title)
ids

In [None]:
import pandas as pd

data = pd.DataFrame(ids, columns=["ID"])
data

In [None]:
comments_and_levels = []

for post_id in data["ID"]:
    post = reddit.submission(id=post_id)
    post_upvote_ratio = post.upvote_ratio
    
    #Parse an entire reddit post and grab its level, upvotes, and id
    def parse_comments(top_comment, level):
        body = top_comment.body
        score = top_comment.score
        id_ = top_comment.id
        #Only grabbing cogent sentences, lets say if length is > 100
        #Also splitting by linebreak - as well formed ideas are usually broken into sections
        if len(body) >= 100:
            cleanup = top_comment.body.split("\n")
            ideas = []
            for x in cleanup:
                if len(x) > 20:
                    ideas.append(x)
            comments_and_levels.append((ideas, level, score, id_, post_id, None))
        for comment in top_comment.replies:
            parse_comments(comment, level+1)

    post.comments.replace_more(limit=0)
    comments_and_levels.append((post.title, 0, post.score, post_id, post_id, post_upvote_ratio))

    for comment in post.comments:
        parse_comments(comment, 1)
    
data = pd.DataFrame(comments_and_levels, columns=["text", "level", "score", "id", "parent", "upvote_ratio"])
data = data.explode("text")
data = data.drop_duplicates(subset="text")
data

In [None]:
data.to_csv("tesla_model_x_posts_and_comments.csv")

# Cleaning Text for TF-IDF

In [None]:
import pandas as pd
import re
import string
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


#Getting comments
comments = []
for comment in data["text"]:
    comments.append(str(comment))


def prep(row):     
    # split into tokens by white space
    tokens = row.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # stemming of words
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return " ".join(tokens)


comments = pd.DataFrame(comments,columns=["text"])
comments["processed text"] = comments["text"].apply(lambda x: prep(x))
comments

# Running BERT and TFIDF

In [None]:
#Using BERT and TFIDF
from sentence_transformers import SentenceTransformer
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

#this is slow - maybe try distilbert-base-uncased?
model = SentenceTransformer('distilbert-base-uncased')

bert_embeddings = []
v = TfidfVectorizer(max_features=1000, min_df=5)
print("start tfidf")
x = v.fit_transform(comments["processed text"])
print("finished tfidf")
print("starting BERT")
i = 0
for comment in comments["text"]:
    i += 1
    bert_embeddings.append(model.encode(comment))
    if i % 100 == 0:
        print(i)
bert_embeddings

In [None]:
with open("TeslaModelXText.txt", "w") as f:
    for line in comments["text"]:
        f.write(line+"\n")

# Running FastText

In [None]:
import fasttext
model = fasttext.train_unsupervised('TeslaModelXText.txt', model='skipgram')

# Time to create dataframes of embeddings 

# BERT Alone (768 features)

In [None]:
import hdbscan

clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=False, min_samples=2)
clusterer.fit(bert_embeddings)
data["BERT"] = clusterer.labels_

# FastText Alone (100 features)

In [None]:
fasttext_embeddings = []
for line in comments["text"]:
    fasttext_embeddings.append(model[line])

clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=False, min_samples=2)
clusterer.fit(fasttext_embeddings)
data["FastText"] = clusterer.labels_

# TF-IDF Alone (1000 features)

In [None]:
tfidf_embeddings = x.toarray()

clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=False, min_samples=2)
clusterer.fit(tfidf_embeddings)
data["TFIDF"] = clusterer.labels_

# BERT + FastText (868 features)

In [None]:
import numpy as np
bert_ft = np.concatenate((bert_embeddings,fasttext_embeddings), axis=1)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=False, min_samples=2)
clusterer.fit(bert_ft)
data["BERT+FastText"] = clusterer.labels_

# BERT + TFIDF (1768 features)

In [None]:
import numpy as np
bert_tfidf = np.concatenate((bert_embeddings, tfidf_embeddings), axis=1)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=False, min_samples=2)
clusterer.fit(bert_tfidf)
data["BERT+TFIDF"] = clusterer.labels_

# FastText + TFIDF (1100 features)

In [None]:
import numpy as np
ft_tfidf = np.concatenate((fasttext_embeddings, tfidf_embeddings), axis=1)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=False, min_samples=2)
clusterer.fit(ft_tfidf)
data["FastText+TFIDF"] = clusterer.labels_

In [None]:
data.to_csv("Final_Tesla_Model_X.csv")
data.head()

In [2]:
import pandas as pd
data = pd.read_csv("Final_Tesla_Model_X.csv")
data

Unnamed: 0.1,Unnamed: 0,text,level,score,id,parent,upvote_ratio,BERT,FastText,TFIDF,BERT+FastText,BERT+TFIDF,FastText+TFIDF
0,0.0,Tesla Model X avoids the crash,0.0,30789,mzymrn,mzymrn,0.87,-1.0,-1.0,476.0,-1.0,-1.0,313.0
1,1.0,Did the BMW accelerate after getting hit? It l...,1.0,1809,gw3k07l,mzymrn,,-1.0,653.0,-1.0,-1.0,-1.0,-1.0
2,2.0,I got hit from behind once hard enough to sand...,3.0,574,gw3rh6m,mzymrn,,216.0,-1.0,-1.0,215.0,215.0,-1.0
3,2.0,I assume the impact knocked my foot off the pe...,3.0,574,gw3rh6m,mzymrn,,216.0,840.0,67.0,215.0,215.0,-1.0
4,3.0,I saw someone sliding to rear end me in my bra...,4.0,48,gw4rkxe,mzymrn,,319.0,864.0,67.0,316.0,326.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4232,2630.0,Now run along and go watch your duck dynasty r...,2.0,1,fakjyq9,e9mhag,,-1.0,261.0,336.0,-1.0,-1.0,-1.0
4233,2631.0,2021 Tesla Model X Review,0.0,1,l4rloa,l4rloa,1.00,151.0,-1.0,531.0,151.0,160.0,359.0
4234,2632.0,ARCHIVE - Top Tesla Models Review| Tesla model...,0.0,1,l8qk77,l8qk77,1.00,111.0,37.0,528.0,111.0,126.0,353.0
4235,2633.0,The 2022 Tesla Model S and 2022 Tesla Model X ...,0.0,1,l6oncu,l6oncu,1.00,95.0,-1.0,-1.0,95.0,86.0,-1.0


# Isolating the posts that were tagged as an idea by BERT, FastText, AND TFIDF

In [3]:
a = data[data["FastText"] != -1]
b = a[a["BERT"] != -1]
c = b[b["TFIDF"]!= -1]
c

Unnamed: 0.1,Unnamed: 0,text,level,score,id,parent,upvote_ratio,BERT,FastText,TFIDF,BERT+FastText,BERT+TFIDF,FastText+TFIDF
3,2.0,I assume the impact knocked my foot off the pe...,3.0,574,gw3rh6m,mzymrn,,216.0,840.0,67.0,215.0,215.0,-1.0
4,3.0,I saw someone sliding to rear end me in my bra...,4.0,48,gw4rkxe,mzymrn,,319.0,864.0,67.0,316.0,326.0,-1.0
31,24.0,"I had to shoulder dodge once in my truck, we w...",5.0,2,gw6teyc,mzymrn,,329.0,863.0,67.0,325.0,343.0,-1.0
42,30.0,When it engages it brakes hard enough that ABS...,4.0,205,gw4jfty,mzymrn,,280.0,913.0,269.0,278.0,277.0,-1.0
54,38.0,">Automatic Emergency Braking, I've experienced...",5.0,4,gw4lzb2,mzymrn,,319.0,31.0,268.0,316.0,326.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4219,2622.0,All these comments and not a single one about ...,1.0,0,falxmj3,e9mhag,,258.0,221.0,336.0,-1.0,267.0,-1.0
4220,2622.0,"Between this, lower center of gravity and a la...",1.0,0,falxmj3,e9mhag,,81.0,188.0,336.0,81.0,-1.0,-1.0
4225,2624.0,"Edit, I assumed it'd be the same as the Cybert...",1.0,-4,fakndrt,e9mhag,,369.0,967.0,336.0,366.0,384.0,-1.0
4227,2626.0,I assumed it was a similar test to the one the...,3.0,0,fakon4n,e9mhag,,369.0,938.0,336.0,366.0,384.0,-1.0


In [4]:
len(c["FastText"].unique())

409

In [5]:
len(c["BERT"].unique())

291

In [6]:
len(c["TFIDF"].unique())

282

In [7]:
len(c["BERT+FastText"].unique())

290

In [8]:
len(c["BERT+TFIDF"].unique())

277

In [10]:
len(c["FastText+TFIDF"].unique())

186

# Grabbing Number of Ideas for Each Model

In [2]:
model_text = ["BERT", "FastText", "TFIDF", "BERT+FastText", "BERT+TFIDF", "FastText+TFIDF"]
temp = {}
for x in model_text:
    temp[x] = len(data[x].unique()) 
    print(x, len(data[x].unique()))

BERT 394
FastText 973
TFIDF 535
BERT+FastText 391
BERT+TFIDF 407
FastText+TFIDF 368


In [3]:
for model in model_text:
    richest_clusters = []
    for i in range(temp[model]):
        if len(data[data[model]==i]) >= 5:
            richest_clusters.append(i)
    print(model, richest_clusters)

BERT [12, 38, 181, 184, 201, 215, 222, 232, 234, 253, 261, 278, 280, 289, 293, 298, 301, 340, 345, 375, 378, 379, 383, 387]
FastText [3, 14, 16, 17, 18, 29, 57, 58, 59, 75, 83, 89, 102, 107, 118, 122, 123, 134, 150, 153, 154, 223, 243, 257, 269, 270, 273, 275, 279, 289, 331, 337, 338, 343, 357, 360, 377, 411, 434, 444, 452, 483, 486, 515, 527, 537, 544, 569, 580, 590, 613, 625, 639, 649, 654, 655, 671, 674, 681, 685, 692, 694, 697, 699, 704, 717, 724, 731, 733, 734, 739, 751, 763, 764, 791, 794, 804, 811, 820, 821, 824, 836, 838, 847, 861, 866, 873, 876, 890, 892, 906, 915, 926, 927, 935, 936, 937, 944, 946, 949, 962, 964, 970]
TFIDF [1, 2, 4, 5, 6, 7, 10, 11, 14, 17, 20, 21, 22, 24, 31, 34, 36, 38, 40, 42, 44, 48, 50, 54, 55, 59, 63, 64, 67, 71, 72, 74, 77, 81, 82, 83, 85, 86, 89, 90, 92, 93, 94, 96, 100, 101, 102, 103, 104, 107, 112, 114, 118, 121, 127, 128, 129, 130, 132, 136, 139, 140, 142, 144, 147, 148, 150, 151, 152, 158, 161, 164, 169, 170, 171, 175, 179, 180, 183, 184, 188, 18

# Grabbing posts and generating scores for each of them

In [4]:
posts_and_comments = pd.read_csv("tesla_model_x_posts_and_comments.csv")
posts = posts_and_comments[posts_and_comments["level"] == 0]
posts

Unnamed: 0.1,Unnamed: 0,text,level,score,id,parent,upvote_ratio
0,0.0,Tesla Model X avoids the crash,0.0,30799,mzymrn,mzymrn,0.87
373,238.0,Refreshed Tesla Model X. Customs already?,0.0,561,nda1f5,nda1f5,0.93
400,260.0,"Trading “good karma, and good vibes” for a Tes...",0.0,6284,mid6cb,mid6cb,0.99
439,289.0,Tesla Model X Deliveries Pushed Back From Febr...,0.0,44,ncjurh,ncjurh,0.88
510,328.0,"Trading “good karma, and good vibes” for Tesla...",0.0,2539,miburn,miburn,0.97
...,...,...,...,...,...,...,...
3876,2416.0,"Tesla Model X – Charlotte, NC – Nikon D810; To...",0.0,8,k1qphq,k1qphq,0.90
3881,2421.0,Tug-of-war between Tesla Model X and Ford 4x4,0.0,3828,e9mhag,e9mhag,0.95
4234,2632.0,ARCHIVE - Top Tesla Models Review| Tesla model...,0.0,1,l8qk77,l8qk77,1.00
4671,2876.0,2021 Tesla Model X Review,0.0,1,l4rloa,l4rloa,1.00


# Generating a dictionary of comment_id to label

In [5]:
from collections import Counter

out = {}

for parent in posts["id"]:
    #Here we have all comments for a given post
    x = data[data["parent"] == parent]
    print(x[x["id"]==parent]["text"])
    #We can calculate how many ideas are reflected for each model
    out[parent] = {}
    for model in model_text:
        out[parent][model] = len(x[model].unique())

0    Tesla Model X avoids the crash
Name: text, dtype: object
373    Refreshed Tesla Model X. Customs already?
Name: text, dtype: object
400    Trading “good karma, and good vibes” for a Tes...
Name: text, dtype: object
439    Tesla Model X Deliveries Pushed Back From Febr...
Name: text, dtype: object
510    Trading “good karma, and good vibes” for Tesla...
Name: text, dtype: object
555    Tesla model X new orders pushed back to Jan-Fe...
Name: text, dtype: object
612    Can’t wait to play this underrated hidden gem ...
Name: text, dtype: object
640    This must be a sign. I saw this guy heading in...
Name: text, dtype: object
653    Kazadan kaçan Tesla Model X..
Name: text, dtype: object
658    You can ejaculate out a Tesla model x
Name: text, dtype: object
664    Tesla Model X hacked with $195 Raspberry Pi ba...
Name: text, dtype: object
Series([], Name: text, dtype: object)
1087    Random car meet in Car Dealership Tycoon, i’m ...
Name: text, dtype: object
1093    Thinking about buy

2566    Tesla Model X gets hacked through new relay at...
Name: text, dtype: object
2593    for sale: 2016 Used Tesla Model X P90D. Signat...
Name: text, dtype: object
2594    for sale: 2018 Used Tesla Model X 75D. 6 Seat....
Name: text, dtype: object
2694    for sale: 2017 Used Tesla Model X 100D. 6 Seat...
Name: text, dtype: object
2596    Norway: Audi E-Tron Has Caught Up to Tesla Mod...
Name: text, dtype: object
2595    for sale : 2018 Used Tesla Model X 100D. 6 Sea...
Name: text, dtype: object
2696    Hooni's Tesla Model X
Name: text, dtype: object
2695    ARCHIVE - Tesla Model X with over 400,000 mile...
Name: text, dtype: object
2697    Why Tesla's Model X Was The First SUV To Recei...
Name: text, dtype: object
2699    What's your opinion on the Ford Mustang Mach-E...
Name: text, dtype: object
2698    for sale: 2020 Used Tesla Model X PERFORMANCE....
Name: text, dtype: object
2760    Top Tesla Models Review| Tesla model x 2021- T...
Name: text, dtype: object
2753    As of Septem

In [8]:
post_ideas = pd.DataFrame(out).transpose()
post_ideas = post_ideas.reset_index()
post_ideas["parent"] = post_ideas["index"]
post_ideas

Unnamed: 0,index,BERT,FastText,TFIDF,BERT+FastText,BERT+TFIDF,FastText+TFIDF,parent
0,mzymrn,49,209,59,47,54,43,mzymrn
1,nda1f5,3,18,7,3,3,5,nda1f5
2,mid6cb,9,28,11,9,9,7,mid6cb
3,ncjurh,7,44,16,7,7,11,ncjurh
4,miburn,6,34,14,6,7,9,miburn
...,...,...,...,...,...,...,...,...
206,k1qphq,2,5,3,2,1,1,k1qphq
207,e9mhag,50,177,68,50,50,54,e9mhag
208,l8qk77,1,1,1,1,1,1,l8qk77
209,l4rloa,1,1,1,1,1,1,l4rloa


In [34]:
out = pd.merge(posts, post_ideas, on="parent")
for model in model_text:
    #Echo score is diversity ratio * downvotes / upvotes
    out["score"] = out["score"].apply(lambda x: float(x))
    out[model+"_echo_score"] = (out[model]/temp[model]) * ((1-out["upvote_ratio"]) * out["score"]) / (out["upvote_ratio"]*out["score"])

for model in model_text:
    print(out.sort_values(model+"_echo_score", ascending=False).head()["text"])
    print("------------------")

108                                   Used Tesla model X
41        Living in a Tesla Model X. Is this sub poppin?
0                         Tesla Model X avoids the crash
160    [Jason Torchinsky] One Week-Old Tesla Model X ...
131    What's your opinion on the Ford Mustang Mach-E...
Name: text, dtype: object
------------------
160    [Jason Torchinsky] One Week-Old Tesla Model X ...
0                         Tesla Model X avoids the crash
131    What's your opinion on the Ford Mustang Mach-E...
41        Living in a Tesla Model X. Is this sub poppin?
108                                   Used Tesla model X
Name: text, dtype: object
------------------
41        Living in a Tesla Model X. Is this sub poppin?
160    [Jason Torchinsky] One Week-Old Tesla Model X ...
0                         Tesla Model X avoids the crash
53     I moved into a Tesla Model X to fight gentrifi...
131    What's your opinion on the Ford Mustang Mach-E...
Name: text, dtype: object
------------------
108       

In [82]:
sample = ["mzymrn",
"narljv",
"k17a34",
"mzdith",
"lw8z93",
"nbhchz",
"lvgr9j",
"lmd2c5",
"k6nm1y",
"ky6zgd",
"jafoxj",
"lfiv84",
"d3e42b",
"m3rfte",
"lfjl1w",
"kf1lf5",
"jexnuv",
"lrjce4",
"mxx478",
"ifcy2d",
"mkar6r",
"mcmsmq",
"jzkea8",
"gju3xs",
"j87dm2",
"j1zlme",
"mrvmo5",
"lw8z93",
"kr1vp3",
"nbhchz",
"lfitsm",
"lgakfl"]

x = []

for i in sample:
    temp_row = []
    row = out[out["id"]==i]
    for model in model_text:
        temp_row.append(row[model+"_echo_score"].values[0])
    x.append(temp_row)

pd.DataFrame(x, columns = model_text)

Unnamed: 0,BERT,FastText,TFIDF,BERT+FastText,BERT+TFIDF,FastText+TFIDF
0,0.018583,0.032096,0.016479,0.017962,0.019825,0.01746
1,0.010367,0.016791,0.009925,0.010446,0.010036,0.012209
2,0.004442,0.010149,0.005452,0.004476,0.004505,0.006114
3,,,,,,
4,0.000155,0.000189,0.000229,0.000157,0.0001,0.000166
5,,,,,,
6,0.001934,0.001566,0.002492,0.001949,0.001872,0.002588
7,0.001138,0.001536,0.002234,0.001146,0.001101,0.002436
8,0.000155,0.000231,0.000114,0.000157,0.00015,0.000111
9,0.019459,0.019261,0.012738,0.019608,0.016744,0.013889


In [None]:
import pandas as pd

data = pd.read_csv("Final_Tesla_Model_X.csv")
model_text = ["BERT", "FastText", "TFIDF", "BERT+FastText", "BERT+TFIDF", "FastText+TFIDF"]
for x in model_text:
    print(x, len(data[x].unique()))
    
posts_and_comments = pd.read_csv("tesla_model_x_posts_and_comments.csv")
posts = posts_and_comments[posts_and_comments["level"] == 0]

out = {}

for parent in posts["id"]:
    #Here we have all comments for a given post
    x = data[data["parent"] == parent]
    print(x[x["id"]==parent]["text"])
    #We can calculate how many ideas are reflected for each model
    out[parent] = {}
    for model in model_text:
        out[parent][model] = len(x[model].unique())
        
        
post_ideas = pd.DataFrame(out).transpose()
post_ideas = post_ideas.reset_index()
post_ideas["parent"] = post_ideas["index"]

out = pd.merge(posts, post_ideas, on="parent")
for model in model_text:
    #Echo score is diversity ratio * downvotes / upvotes
    out["score"] = out["score"].apply(lambda x: float(x))
    out[model+"_echo_score"] = (out[model]/temp[model]) * ((1-out["upvote_ratio"]) * out["score"]) / (out["upvote_ratio"]*out["score"])

for model in model_text:
    print(out.sort_values(model+"_echo_score", ascending=False).head()["text"])
    print("------------------")
    
    
lol = out[out["BERT_echo_score"]!=0]
lol = lol.sort_values("BERT_echo_score", ascending=False)
for i in lol[::5]["id"]:
    print(i)

In [84]:
out.to_csv("final_echo_scores_tesla.csv")

In [None]:
sample = ["mzymrn",
"narljv",
"k17a34",
"mzdith",
"lw8z93",
"nbhchz",
"lvgr9j",
"lmd2c5",
"k6nm1y",
"ky6zgd",
"jafoxj",
"lfiv84",
"d3e42b",
"m3rfte",
"lfjl1w",
"kf1lf5",
"jexnuv",
"lrjce4",
"mxx478",
"ifcy2d",
"mkar6r",
"mcmsmq",
"jzkea8",
"gju3xs",
"j87dm2",
"j1zlme",
"mrvmo5",
"lw8z93",
"kr1vp3",
"nbhchz",
"lfitsm",
"lgakfl"]

x = []

for i in sample:
    temp_row = []
    row = out[out["id"]==i]
    for model in model_text:
        temp_row.append(row[model+"_echo_score"].values[0])
    x.append(temp_row)

pd.DataFrame(x, columns = model_text)