# Analysis of Stack Overflow Terraform Questions 

## Helpers

### Suppress Warnings

In [29]:
import warnings

# Suppress every Python warning raised from this point on.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")
# To discard the filter again, run:
# warnings.resetwarnings()

### Post GUI

In [30]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output


def generate_filtered_sentences_table(post_data):
    """Render all keyword-bearing sentences of a post as a single HTML table.

    The post is walked recursively: its own ``"filtered-sentences"`` entries
    come first, followed by those of its ``"comments"``, ``"history"`` and
    ``"answers"`` children (in that order). Each entry becomes one row with
    three cells: where it was found, the sentence text and the matched
    keywords.

    Args:
        post_data: dict describing a post; missing keys are treated as empty.

    Returns:
        A ``<table>...</table>`` string with a header row and one row per
        filtered sentence. Cell contents are HTML-escaped because the
        sentences are plain text and may legitimately contain ``<`` or ``&``.
    """
    from html import escape  # local import keeps this helper cell self-contained

    child_kinds = (("comments", "comment"), ("history", "history"), ("answers", "answer"))

    def collect_rows(data, location):
        """Return the table rows for `data` and, recursively, for its children."""
        rows = []
        for sentence in data.get("filtered-sentences", []):
            source = location + ":" + sentence.get("source", "")
            text = str(sentence.get("text", ""))
            keywords = ", ".join(sentence.get("keywords", []))
            rows.append(
                f"<tr><td>{escape(source)}</td><td>{escape(text)}</td><td>{escape(keywords)}</td></tr>"
            )

        # The location path grows by one ">kind" segment per nesting level,
        # e.g. ">answer>comment" for a comment on an answer.
        for key, label in child_kinds:
            for child in data.get(key) or ():
                rows.extend(collect_rows(child, location + ">" + label))
        return rows

    parts = ["<table>", "<tr><th>Source</th><th>Text</th><th>Keywords</th></tr>"]
    parts.extend(collect_rows(post_data, ""))
    parts.append("</table>")
    return "".join(parts)



def display_question(question_post):
    """Push the fields of one question into the widgets created by display_gui().

    Writes to the module-level ``component_outputs`` mapping: Id, Title and
    Tags as strings, Body as-is, the filtered-sentences table as HTML, and the
    comment / history texts joined by a dashed separator line.
    """
    global component_outputs
    separator = "\n-----\n"

    # Short header fields are always stringified.
    for field in ("Id", "Title", "Tags"):
        component_outputs[field].value = str(question_post.get(field, ""))

    component_outputs["Body"].value = question_post.get("Body", "")
    component_outputs["Filtered Sentences"].value = generate_filtered_sentences_table(question_post)

    comment_texts = []
    for comment in question_post.get("comments", []):
        comment_texts.append(comment["Text"].strip())
    component_outputs["Comments"].value = separator.join(comment_texts)

    # A history entry shows its "Text"; "Comment" is used only when "Text" is absent.
    history_texts = []
    for entry in question_post.get("history", []):
        fallback = entry.get("Comment", "")
        history_texts.append(entry.get("Text", fallback).strip())
    component_outputs["History"].value = separator.join(history_texts)

def go_next(button):
    """Button callback: advance to the next question and display it.

    The module-level ``index`` is clamped to the last valid position, so
    pressing "Next" on the final question simply re-displays it. Nothing
    happens when ``questions`` is empty.

    Args:
        button: the widget that fired the click (unused).
    """
    global questions, index
    if not questions:
        return
    # The last valid position is len(questions) - 1; comparing against
    # len(questions) would step one past the end and raise IndexError.
    index = min(index + 1, len(questions) - 1)
    display_question(questions[index])

def go_prev(button):
    """Button callback: step back one question (never below the first) and display it.

    Args:
        button: the widget that fired the click (unused).
    """
    global questions, index
    index = max(index - 1, 0)
    display_question(questions[index])

def display_gui():
    """Build and show the question browser.

    Creates Previous/Next buttons, one toggle per collapsible section and the
    field widgets, stores the field widgets in the module-level
    ``component_outputs`` mapping (keyed by their description) and displays
    everything inside a single Output widget. The four collapsible sections
    start hidden and their toggles start unpressed.
    """
    global component_outputs

    def one_line_field(label):
        # Always-visible, read-only single-row text field.
        return widgets.Textarea(description=label, disabled=True, layout={'width': '90%', 'height': '20px'})

    def hidden_text_field(label):
        # Read-only text area that stays hidden until its toggle is pressed.
        return widgets.Textarea(description=label, disabled=True, layout={'width': '90%', 'display': 'none', 'height': '40px'})

    def hidden_html_field(label):
        # HTML area that stays hidden until its toggle is pressed.
        return widgets.HTML(description=label, disabled=True, layout={'width': '90%', 'display': 'none'})

    # Navigation row
    next_button = widgets.Button(description="Next")
    prev_button = widgets.Button(description="Previous")
    next_button.on_click(go_next)
    prev_button.on_click(go_prev)
    next_prev_buttons = widgets.HBox([prev_button, next_button])

    # One toggle per collapsible section; the toggle's description doubles as
    # the key of the widget it controls.
    toggle_widgets = [
        widgets.ToggleButton(value=False, description=section)
        for section in ("Body", "Comments", "History", "Filtered Sentences")
    ]
    toggle_buttons = widgets.HBox(toggle_widgets)

    component_outputs = {
        "Id": one_line_field("Id"),
        "Title": one_line_field("Title"),
        "Tags": one_line_field("Tags"),
        "Body": hidden_html_field("Body"),
        "Comments": hidden_text_field("Comments"),
        "History": hidden_text_field("History"),
        "Filtered Sentences": hidden_html_field("Filtered Sentences"),
    }

    gui = widgets.Output()

    @gui.capture()
    def toggle_visibility(change):
        # Show the section while its toggle is pressed, hide it otherwise.
        target = component_outputs[change['owner'].description]
        target.layout.display = 'block' if change['new'] else 'none'

    for toggle in toggle_widgets:
        toggle.observe(toggle_visibility, names='value')

    # Render the whole interface inside the Output widget, then show it.
    with gui:
        display(next_prev_buttons)
        display(toggle_buttons)
        for widget in component_outputs.values():
            display(widget)
    display(gui)

### Bokeh Export

In [31]:
from bokeh.io import export_svgs
from reportlab.graphics import renderPDF
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
import svglib.svglib as svglib

class FirefoxWebdriver(object):
    """Singleton holder of one headless Firefox WebDriver.

    Every ``FirefoxWebdriver()`` call returns the same object. The browser is
    started on the first successful call and is available as the ``driver``
    class attribute.

    NOTE(review): nothing in this class quits the driver; callers that need a
    clean shutdown must call ``FirefoxWebdriver().driver.quit()`` themselves.
    """
    def __new__(cls):
        if not hasattr(cls, 'instance'):
            opts = webdriver.FirefoxOptions()
            opts.add_argument("--headless")
            # Start the browser BEFORE publishing the singleton: if start-up
            # raises, no half-initialised instance (one without `driver`) is
            # cached and the next call tries again.
            driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=opts)
            cls.driver = driver
            cls.instance = super(FirefoxWebdriver, cls).__new__(cls)

        return cls.instance

def export_pdf(bokeh_plot, filename):
    """Export a Bokeh plot to a PDF file via an intermediate SVG.

    The plot's ``output_backend`` is switched to ``"svg"`` (and left that
    way), the plot is exported to ``<name>.svg`` with the shared headless
    Firefox driver, the SVG is converted to ``<name>.pdf`` and the SVG is
    deleted again, even when the export or conversion fails.

    Args:
        bokeh_plot: the Bokeh plot to export.
        filename: target path; a trailing ``.pdf`` is optional.
    """
    # Imported locally so the helper does not depend on later notebook cells
    # having been executed.
    import os
    import re

    bokeh_plot.output_backend = "svg"
    base_name = re.sub(r'\.pdf$', '', filename)
    svg_name = base_name + '.svg'
    pdf_name = base_name + '.pdf'

    try:
        export_svgs(bokeh_plot, filename=svg_name, webdriver=FirefoxWebdriver().driver)
        svg = svglib.svg2rlg(svg_name)
        renderPDF.drawToFile(svg, pdf_name)
    finally:
        # The SVG is only a stepping stone; never leave it behind.
        if os.path.exists(svg_name):
            os.remove(svg_name)

### Question Filtering

In [32]:
import os
import json

def post_content(post):
    """Flatten the searchable text of a question into a dict of strings.

    Args:
        post: question dict; ``"Body"`` and ``"Title"`` are required, while
            ``"comments"``, ``"answers"`` and ``"history"`` are optional lists.

    Returns:
        Dict with keys ``body``, ``title``, ``comments``, ``answers``,
        ``answers_comments`` and ``history``. List-like parts are joined
        with ``" | "``; each history entry contributes its ``Text`` and
        ``Comment`` separated by one space.
    """
    separator = " | "
    body = post["Body"]
    title = post["Title"]
    answers = post.get("answers", [])

    comment_texts = [comment["Text"] for comment in post.get("comments", [])]
    answer_bodies = [answer["Body"] for answer in answers]

    answer_comment_texts = []
    for answer in answers:
        for comment in answer.get("comments", []):
            answer_comment_texts.append(comment["Text"])

    history_texts = []
    for entry in post.get("history", []):
        history_texts.append(entry.get("Text", "") + " " + entry.get("Comment", ""))

    return {
        "body": body,
        "title": title,
        "comments": separator.join(comment_texts),
        "answers": separator.join(answer_bodies),
        "answers_comments": separator.join(answer_comment_texts),
        "history": separator.join(history_texts),
    }


def filter_posts_with(post_dir, filter_func):
    """Yield the posts in `post_dir` whose text satisfies `filter_func`.

    Only files named ``post-*.json`` are considered; they are visited in
    sorted filename order so repeated runs produce the same sequence.

    Args:
        post_dir: directory containing the post JSON files.
        filter_func: predicate applied to each text part returned by
            ``post_content`` for a post.

    Yields:
        ``(post, location)`` tuples, where ``location`` is the key of the
        first text part accepted by ``filter_func``. Each post is yielded at
        most once.
    """
    for fn in sorted(os.listdir(post_dir)):
        if not (fn.startswith("post-") and fn.endswith(".json")):
            continue

        # Load and close the file before yielding, so no handle stays open
        # while the consumer works on (or abandons) the generator.
        with open(os.path.join(post_dir, fn), 'r', encoding='utf-8') as f:
            post = json.load(f)

        for location, text in post_content(post).items():
            if filter_func(text):
                yield (post, location)
                break


def filter_posts_with_strings(post_dir, strs):
    """Yield the (post, location) hits whose text contains any of `strs` as a substring."""
    def contains_any(text):
        for needle in strs:
            if needle in text:
                return True
        return False

    for hit in filter_posts_with(post_dir, contains_any):
        yield hit


def filter_posts_with_patterns(post_dir, pattern_list):
    """Yield the (post, location) hits whose text matches any regex in `pattern_list`.

    Each entry of `pattern_list` is passed through ``re.compile``.
    """
    compiled = list(map(re.compile, pattern_list))

    def matches_any(text):
        return any(regex.search(text) for regex in compiled)

    yield from filter_posts_with(post_dir, matches_any)

### Keyword Search in Sentences

In [5]:
# Optional: use ONNX Runtime if your hardware supports it (install the matching package below)
# !pip install onnxruntime-gpu # GPUs with CUDA
# !pip install onnxruntime-silicon # Apple M1/M2

In [6]:
from bs4 import BeautifulSoup
from wtpsplit import WtP

# Sentence splitter used by split_sentences(). Keep exactly one of the lines
# below active, matching the hardware this notebook runs on.
# SENTER = WtP("wtp-bert-mini") # CPU
# SENTER = WtP("wtp-bert-mini", ort_providers=["CUDAExecutionProvider"]) # GPU with CUDA
SENTER = WtP("wtp-bert-mini", ort_providers=["CoreMLExecutionProvider"]) # Apple M1/M2


def htmlbody_to_text(html_string):
    """Convert an HTML fragment to plain text on a single line.

    ``<pre>`` blocks are dropped, links are reduced to their visible text and
    every run of whitespace is collapsed to a single space.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    # Code blocks are removed entirely.
    for pre_tag in soup.find_all(['pre']):
        pre_tag.extract()

    # Replace the content of each link with its own text.
    for anchor in soup.find_all('a'):
        anchor.string = anchor.get_text()

    # split() discards leading/trailing whitespace and collapses inner runs.
    words = soup.get_text().split()
    return ' '.join(words)


def split_sentences(text):
    """Lazily yield the pieces that ``SENTER.split`` produces for `text` (English, 'ersatz' style)."""
    for sentence in SENTER.split(text, lang_code="en", style='ersatz'):
        yield sentence


def find_keywords(texts, pattern_dict):
    """Yield one record per text that matches at least one pattern.

    Args:
        texts: iterable of strings to scan.
        pattern_dict: mapping of keyword label to compiled regex.

    Yields:
        ``{'text': <text>, 'keywords': [<labels whose regex matched>]}``;
        texts without any match are skipped.
    """
    for text in texts:
        matched = []
        for label, pattern in pattern_dict.items():
            if pattern.search(text):
                matched.append(label)
        if matched:
            yield {'text': text, 'keywords': matched}

2023-10-01 16:11:12.472776 [W:onnxruntime:, helper.cc:66 IsInputSupported] Dynamic shape is not supported for now, for input:/embeddings/HashBucketCodepointEmbedder_0/Gather_output_0
2023-10-01 16:11:12.472794 [W:onnxruntime:, helper.cc:66 IsInputSupported] Dynamic shape is not supported for now, for input:/embeddings/Concat_1_output_0
2023-10-01 16:11:12.472800 [W:onnxruntime:, helper.cc:66 IsInputSupported] Dynamic shape is not supported for now, for input:/embeddings/Add_output_0
2023-10-01 16:11:12.472808 [W:onnxruntime:, helper.cc:66 IsInputSupported] Dynamic shape is not supported for now, for input:attention_mask
2023-10-01 16:11:12.475071 [W:onnxruntime:, helper.cc:66 IsInputSupported] Dynamic shape is not supported for now, for input:/embeddings/HashBucketCodepointEmbedder_0/Gather_output_0
2023-10-01 16:11:12.475082 [W:onnxruntime:, helper.cc:66 IsInputSupported] Dynamic shape is not supported for now, for input:/embeddings/Concat_1_output_0
2023-10-01 16:11:12.475089 [W:onnx

## Analysis

In [12]:
# Step-1 directories: the question files are read from here by the filtering cell below.
step1output_dir = "../step-1-output/"
step1questions_dir = step1output_dir+"questions/"

# Step-2 directories: the annotated questions are written here; created up front if missing.
step2output_dir = "../step-2-output/"
step2questions_dir = step2output_dir+"questions/"
os.makedirs(step2questions_dir, exist_ok=True)

### Filter Cost-Related Questions

Keep only the questions that contain at least one of the cost-related keywords.

In [19]:
import re

# Cost-related keywords: label -> compiled, case-insensitive (per-letter) regex.
# "bill", "cheap" and "expense" match any word starting with the stem
# (e.g. "billing", "cheaper", "expensive"); "cost", "efficient" and "pay"
# match the exact word only.
cost_patterns = {label: re.compile(regex) for label, regex in [
        # The leading letter must be case-folded like the others, otherwise
        # "Billing" / "BILL" are missed.
        ("bill", r"\b[bB][iI][lL][lL]\w*\b"),
        ("cheap", r"\b[cC][hH][eE][aA][pP]\w*\b"),
        ("cost", r"\b[cC][oO][sS][tT]\b"),
        ("efficient", r"\b[eE][fF][fF][iI][cC][iI][eE][nN][tT]\b"),
        ("expense", r"\b[eE][xX][pP][eE][nN][sS]\w*\b"),
        ("pay", r"\b[pP][aA][yY]\b"),
]}

In [None]:
# Keep only the post of every (post, location) hit returned by the regex filter.
# A substring-based variant is available through
# filter_posts_with_strings(step1questions_dir, cost_patterns.keys()).
questions = [
    post
    for post, _location in filter_posts_with_patterns(step1questions_dir, cost_patterns.values())
]

### Enrich Questions with Topic-Related Insights 

Flag each question according to whether it mentions at least one topic keyword.

In [20]:
import re

# Topic keywords: label -> compiled, case-insensitive whole-word regex.
# Note that the "billing mode" label is searched as the single word "mode",
# "change" requires at least one character after "chang", and "test" also
# matches longer words starting with "test".
topic_patterns = {
    label: re.compile(regex, re.IGNORECASE)
    for label, regex in (
        ("networking", r"\bnetworking\b"),
        ("nat", r"\bnat\b"),
        ("vpn", r"\bvpn\b"),
        ("instance", r"\binstance\b"),
        ("storage", r"\bstorage\b"),
        ("cpu", r"\bcpu\b"),
        ("ram", r"\bram\b"),
        ("domain", r"\bdomain\b"),
        ("area", r"\barea\b"),
        ("provider", r"\bprovider\b"),
        ("feature", r"\bfeature\b"),
        ("billing mode", r"\bmode\b"),
        ("policy", r"\bpolicy\b"),
        ("cluster", r"\bcluster\b"),
        ("change", r"\bchang\w+\b"),
        ("test", r"\btest\w*\b"),
    )
}

In [None]:
for q in questions:
    # Build the searchable text once per question rather than once per pattern.
    q_text = " ".join(post_content(q).values())
    q['contains-topic'] = any(kw.search(q_text) for kw in topic_patterns.values())

# Summary: total | questions with a topic keyword, questions without one.
n_with_topic = sum(1 for q in questions if q['contains-topic'])
print(len(questions), "|", n_with_topic, len(questions) - n_with_topic)

Highlight the sentences that contain keywords.

Matching sentences are stored under the **'filtered-sentences'** key of the object in which they were found, i.e.:
- question post (if in **'Title'** or **'Body'**)
- answer post (if in the answer's **'Title'** or **'Body'**)
- comment (of either question or answer)
- history entry (of either question or answer)


In [10]:
%%time
# Every keyword pattern searched for in sentences: cost labels plus topic labels
# (a label present in both would take the topic pattern, since it is unpacked last).
all_patterns = {**cost_patterns, **topic_patterns}

def process_post_items(item, fields):
    """Attach the keyword-bearing sentences of `item` under ``'filtered-sentences'``.

    For every name in `fields` that holds a non-empty value, the value is
    converted to plain text, split into sentences and scanned with
    ``all_patterns``. Each hit is stored as the ``find_keywords`` record plus
    a ``'source'`` key naming the field it came from.

    Args:
        item: dict for a post, comment or history entry; modified in place.
        fields: names of the text fields to scan, in order.

    Raises:
        Exception: whatever the text pipeline raised. The offending field
            values are printed first; ``'filtered-sentences'`` is left unset.
    """
    try:
        sentences = []
        for field in fields:
            raw = item.get(field)
            if not raw:
                continue
            # Parse the HTML once per field and reuse the result.
            text = htmlbody_to_text(raw)
            if not text:
                continue
            for hit in find_keywords(split_sentences(text), all_patterns):
                sentences.append({'source': field, **hit})
        item['filtered-sentences'] = sentences
    except Exception:
        # Dump the inputs to help locate the problematic post, then propagate.
        for field in fields:
            if item.get(field):
                print(field, item[field], htmlbody_to_text(item[field]))
        raise

def process_post(post):
    """Annotate a post, its comments, its history entries and (recursively) its answers.

    Any exception raised along the way is printed and swallowed, so the
    remaining parts of that post are skipped but the caller keeps running.
    """
    child_fields = (
        ("comments", ['Text']),
        ("history", ['Text', 'Comment']),
    )
    try:
        process_post_items(post, ['Title', 'Body'])
        for key, fields in child_fields:
            for child in post.get(key, []):
                process_post_items(child, fields)
        for answer in post.get("answers", []):
            process_post(answer)
    except Exception as err:
        print(err)
        
# Annotate every question (and, recursively, its answers) in place with 'filtered-sentences'.
for q in questions:
    process_post(q)

CPU times: user 45min, sys: 4.89 s, total: 45min 5s
Wall time: 9min 10s


### Save Data

In [13]:
# Write each annotated question to its own 'post-<Id>.json' file in the step-2 questions directory.
for q in questions:
    filename = f'post-{q["Id"]}.json'
    with open(os.path.join(step2questions_dir, filename), 'w') as f:
        json.dump(q, f)

### Explore Questions

Load questions (if resuming work)

In [1]:
import os
import json
step2output_dir = "../step-2-output/"
step2questions_dir = step2output_dir+"questions/"
questions = []
# os.listdir returns names in arbitrary order; sorting keeps the browsing
# order below identical between sessions.
for fn in sorted(os.listdir(step2questions_dir)):
    if fn.startswith("post-") and fn.endswith(".json"):
        # Read as UTF-8 explicitly instead of relying on the platform default encoding.
        with open(os.path.join(step2questions_dir, fn), 'r', encoding='utf-8') as f:
            questions.append(json.load(f))

Display questions

In [13]:
# Start at the first question: build the browser widgets, then fill them.
# display_gui() must run first because it creates the widgets that display_question() writes to.
index = 0
display_gui()
display_question(questions[index])

Output()

### Distribution of Topic Words

In [23]:
# Same value as assigned earlier in the notebook; the chart export below writes into this directory.
# NOTE(review): presumably repeated so this section can be run on its own. Confirm.
step2output_dir = "../step-2-output/"

In [24]:
import math
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show
from bokeh.transform import dodge
# Send Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

In [35]:
# Count, per topic keyword, the questions whose combined text matches it.
# The combined text of each question is built once and reused for every keyword.
question_texts = [" ".join(post_content(q).values()) for q in questions]
p_kws = list(topic_patterns.keys())
p_questions = [sum(1 for text in question_texts if kw.search(text)) for kw in topic_patterns.values()]
# Order the keywords by descending question count.
p_questions, p_kws = zip(*sorted(zip(p_questions, p_kws),reverse=True))

chart_source = ColumnDataSource(data={'kws':p_kws, '#questions':p_questions})

# NOTE(review): the y-axis upper bound is fixed at 300; raise it if any count exceeds that.
p = figure(x_range=p_kws, y_range=(0, 300), #title="Data distribution",
           height=350, width=600, toolbar_location=None, tools='')

# A dodge offset of 0 leaves each bar centred on its keyword.
p.vbar(x=dodge('kws',  0,   range=p.x_range), top='#questions', source=chart_source,
       width=0.2, color="#abc3c9", legend_label="number of distinct cost-related questions")

p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = math.pi/4
p.xaxis.major_label_text_font_size = '14px'
p.yaxis.major_label_text_font_size = '14px'
p.legend.location = "top_right"
p.legend.orientation = "vertical"

show(p)
# Also save the chart as '<step2output_dir>so-questions-concepts.pdf'.
export_pdf(p, step2output_dir+'so-questions-concepts')