In [None]:
# Install/upgrade the third-party packages used by the topic-modelling cells below.
# NOTE(review): no versions are pinned and --upgrade is used, so the installed
# versions (and possibly the results) can differ from one run to the next.
%pip install --upgrade numpy gensim

In [None]:

import pandas as pd
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def preprocess_sentence(sentence):
    """Tokenize ``sentence`` and drop stop words.

    The text is tokenized with ``simple_preprocess``; every token that is a
    member of ``STOPWORDS`` is discarded. The surviving tokens are returned
    as a list, in their original order.
    """
    kept_tokens = []
    for word in simple_preprocess(sentence):
        if word in STOPWORDS:
            continue
        kept_tokens.append(word)
    return kept_tokens

def LDA_topic(sentence):
    """Return the 1-based id of the most probable topic for ``sentence``.

    ``sentence`` is a list of tokens. It is converted to a bag-of-words with
    the module-level ``dictionary`` and scored with the module-level
    ``lda_model``; the topic with the highest probability wins (the first one
    on ties) and its index is shifted by one before being returned.
    """
    def _probability(topic_pair):
        return topic_pair[1]

    bow = dictionary.doc2bow(sentence)
    best_pair = max(lda_model.get_document_topics(bow), key=_probability)
    return 1 + best_pair[0]

def concatenate_reviews(df, max_char_length):
    """Merge runs of same-topic reviews into longer text chunks.

    Rows are sorted by ``date`` and then topic and walked in that order.
    Consecutive rows that share a topic are joined with a single space for as
    long as the joined text (including the separating space) stays within
    ``max_char_length`` characters; otherwise a new chunk is started.

    Args:
        df: DataFrame with a ``date`` column, a topic column (``topic`` or
            ``ldatopic``) and a text column (``words`` or ``conwords``).
        max_char_length: Upper bound on the length of a merged chunk. A single
            review that is already longer than this is kept as its own chunk.

    Returns:
        DataFrame with the columns ``concatenated_string``, ``start_time``,
        ``end_time``, ``topic``, ``SMonth`` and ``EMonth``. An empty input
        yields an empty frame with the same columns instead of raising.
    """
    dataframe = df.rename(columns={"ldatopic": "topic", "conwords": "words"})
    dataframe = dataframe.sort_values(by=['date', 'topic'])

    columns = ['concatenated_string', 'start_time', 'end_time', 'topic']
    chunks = []     # finished chunks, one dict per output row
    current = None  # chunk currently being extended

    for _, row in dataframe.iterrows():
        words, date, topic = row['words'], row['date'], row['topic']
        fits = (
            current is not None
            and current['topic'] == topic
            # "+ 1" accounts for the space inserted between the two texts, so
            # the merged chunk can never exceed max_char_length.
            and len(current['concatenated_string']) + 1 + len(words) <= max_char_length
        )
        if fits:
            current['concatenated_string'] += ' ' + words
            current['end_time'] = date
        else:
            if current is not None:
                chunks.append(current)
            current = {
                'concatenated_string': words,
                'start_time': date,
                'end_time': date,
                'topic': topic,
            }

    if current is not None:
        chunks.append(current)

    concatenated_df = pd.DataFrame(chunks, columns=columns)
    # Force datetime dtype so the .dt accessor also works when there are no rows.
    concatenated_df['start_time'] = pd.to_datetime(concatenated_df['start_time'])
    concatenated_df['end_time'] = pd.to_datetime(concatenated_df['end_time'])
    concatenated_df['SMonth'] = concatenated_df['start_time'].dt.month
    concatenated_df['EMonth'] = concatenated_df['end_time'].dt.month
    return concatenated_df

def concat_more(concatenated_df, max_length, str_col = 'concatenated_string', first = False):
    """Repeatedly merge neighbouring same-topic texts until nothing more fits.

    Within each topic the rows are ordered by start time and adjacent rows are
    merged pairwise when the joined text is shorter than ``max_length`` and
    the pair falls within one calendar year. Rows that cannot be merged are
    carried over unchanged (apart from replacing "Continue reading" with a
    newline). The function calls itself on the result for as long as a pass
    merged at least one pair.

    Args:
        concatenated_df: Frame with a ``topic`` column, an ``index`` column
            holding lists of source-row indices, the text column ``str_col``
            and either ``date`` (when ``first`` is true) or
            ``start_time``/``end_time``.
        max_length: Merged text must be strictly shorter than this.
        str_col: Name of the text column in ``concatenated_df``.
        first: True when the input is the raw frame with one ``date`` column.

    Returns:
        DataFrame with the columns ``index``, ``concatenated_string``,
        ``start_time``, ``end_time`` and ``topic``.
    """
    # The raw frame has one timestamp per row; later passes carry a span.
    start_col, end_col = ('date', 'date') if first else ('start_time', 'end_time')

    newstrings = []
    newstart = []
    newend = []
    newtopic = []
    newind = []
    merged_any = False  # True as soon as any pair was merged in this pass

    for a in concatenated_df['topic'].unique():
        topicdf = concatenated_df[concatenated_df['topic'] == a].sort_values(by=[start_col])
        indices = topicdf['index'].tolist()
        strings = topicdf[str_col].tolist()
        start = topicdf[start_col].tolist()
        end = topicdf[end_col].tolist()

        i = 0
        while i < len(strings):
            if i + 1 < len(strings):
                temp = strings[i] + " \n" + strings[i + 1]
                temp = temp.replace("Continue reading", "\n")
                if len(temp) < max_length and end[i + 1].year == start[i].year:
                    if end[i + 1] < end[i]:
                        # Diagnostic: the later-starting row ends before the earlier one.
                        print(end[i + 1], end[i])
                    newstrings.append(temp)
                    newind.append(indices[i] + indices[i + 1])
                    newend.append(end[i + 1])
                    newstart.append(start[i])
                    newtopic.append(a)
                    merged_any = True
                    i += 2  # both rows of the pair are consumed
                    continue
            # No merge possible: keep this row on its own (this also covers the
            # last row of a topic and topics that have a single row).
            newstrings.append(strings[i].replace("Continue reading", "\n"))
            newind.append(indices[i])
            newend.append(end[i])
            newstart.append(start[i])
            newtopic.append(a)
            i += 1

    result = pd.DataFrame({"index": newind, "concatenated_string": newstrings, "start_time": newstart, "end_time": newend, "topic": newtopic})
    if merged_any:
        # Every merge removes one row, so the recursion is guaranteed to stop.
        return concat_more(result, max_length)
    return result

def ind2list(ind):
    """Wrap ``ind`` in a new one-element list."""
    wrapped = list()
    wrapped.append(ind)
    return wrapped

In [None]:
# Load the review data (presumably scraped Glassdoor reviews, judging by the
# repository name -- confirm), tokenize the `conwords` text and fit an LDA model.
url="https://raw.githubusercontent.com/lichengrui/glassdoor_scraping/main/data.csv"
df=pd.read_csv(url)
df = df.drop(columns=["Unnamed: 0"])
df = df.dropna(how='all')  # only rows where every column is missing are removed
df['date'] = pd.to_datetime(df['date']) 
df['processed_conwords'] = df['conwords'].apply(preprocess_sentence)

# Vocabulary and bag-of-words corpus built from the tokenized reviews.
dictionary = corpora.Dictionary(df['processed_conwords'])
corpus = [dictionary.doc2bow(sentence) for sentence in df['processed_conwords']]

num_topics = 10  # Set the number of topics
lda_seed = 42  # fixed seed so topic ids are the same on every run of the notebook
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=lda_seed)

# 1-based id of the most probable topic for each review.
df['topic'] = df['processed_conwords'].apply(LDA_topic)

In [None]:
# Show the reviews frame, now including the `processed_conwords` and `topic` columns.
display(df)

In [None]:

# Upper bound (in characters) passed to concat_more for a merged text.
max_length = 300
# reset_index() exposes the row index as an `index` column; each value is then
# wrapped in a list so concat_more can concatenate the index lists when it merges rows.
result_df = df.reset_index()
result_df['index'] = result_df['index'].apply(ind2list)
# First pass works on the raw frame: text in `conwords`, one `date` per row (first=True).
result_df = concat_more(result_df, max_length, "conwords", True)
# Second pass runs on the already-merged frame, ordered by start time.
result_df1 = concat_more(result_df.sort_values(by=['start_time']),max_length)


In [None]:
from transformers import pipeline

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) so this cell also runs on machines without a GPU.
try:
    import torch
    summarizer_device = 0 if torch.cuda.is_available() else -1
except ImportError:
    summarizer_device = -1

summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device = summarizer_device)


In [None]:
def summary(review):
    """Summarize ``review`` with the module-level ``summarizer`` pipeline.

    The pipeline is called deterministically (``do_sample=False``) with a
    length window of 10 to 130, and the ``summary_text`` of its first result
    is returned.
    """
    outputs = summarizer(review, max_length=130, min_length=10, do_sample=False)
    first_result = outputs[0]
    return first_result["summary_text"]
# Summarize only the first 50 merged texts as a quick check of the summarizer.
# .copy() makes testdf an independent frame, so adding a column neither raises
# SettingWithCopyWarning nor depends on whether head() returned a view of result_df.
testdf = result_df.head(50).copy()
testdf['summarized'] = testdf['concatenated_string'].apply(summary)
display(testdf)

In [None]:
import pandas as pd
import json

def build_review_tree(data_df):
    """Group reviews into a nested year -> rating -> topic -> review tree.

    Each row of ``data_df`` must provide ``date``, ``rating``, ``conwords``
    and ``topic``. The result is a dict ``{"name": "Root", "children": [...]}``
    whose first three levels carry a ``year``/``rating``/``topic`` key, a
    human-readable ``name`` and a ``children`` list; the leaves are
    ``{"name": <review text>}``. Nodes appear in order of first occurrence.
    """
    def _find_or_add(parent, key, value, label):
        # Reuse the first child whose `key` equals `value`; otherwise append a new one.
        for existing in parent["children"]:
            if existing[key] == value:
                return existing
        created = {key: value, "name": label + str(value), "children": []}
        parent["children"].append(created)
        return created

    review_tree = {"name": "Root", "children": []}

    for _, record in data_df.iterrows():
        # Read every field first so a missing column fails before the tree is touched.
        year = pd.to_datetime(record['date']).year
        rating = record['rating']
        review = record['conwords']
        topic = record['topic']

        year_node = _find_or_add(review_tree, "year", year, "Year: ")
        rating_node = _find_or_add(year_node, "rating", rating, "Rating: ")
        topic_node = _find_or_add(rating_node, "topic", topic, "Topic: ")
        topic_node["children"].append({'name': review})

    return review_tree

def moreNodes(lst,max_length, p = False):
    """Group sibling nodes under summary nodes, ``max_length`` characters at a time.

    The nodes in ``lst`` are walked in order and packed into groups whose
    combined ``name`` length does not exceed ``max_length``. Each group becomes
    a new node whose ``name`` is ``summary()`` of the group's names joined by
    spaces and whose ``children`` are the grouped nodes. A node whose own name
    is longer than ``max_length`` gets a summary node of its own.

    Args:
        lst: List of dict nodes, each with a ``name`` string.
        max_length: Maximum combined name length of one group.
        p: When true, print the input and which return path was taken.

    Returns:
        ``(nodes, changed)``. ``changed`` is False and ``nodes`` is ``lst``
        itself when nothing was grouped (at most one node fits and none is
        oversized); otherwise ``nodes`` is the list of new summary nodes and
        ``changed`` is True, which tells the caller to run another pass.
    """
    if p:
        print("INPUT LIST")
        print(lst)
    length = 0
    newstring = ""
    children = []
    node = {"name": 'placeholder', "children": []}
    for item in lst:
        assert isinstance(item, dict)
        currentReview = item['name']
        if len(currentReview) > max_length:
            # Close the pending group first, but only if it holds something;
            # otherwise an empty node named summary("") would be emitted.
            if node['children']:
                node['name'] = summary(newstring)
                children.append(node)
            children.append({"name": summary(currentReview), "children": [item]})
            node = {"name": 'placeholder', "children": []}
            newstring = ""
            length = 0
        elif length + len(currentReview) > max_length:
            # The pending group is full: close it and open a new one with this node.
            node['name'] = summary(newstring)
            children.append(node)
            node = {"name": 'placeholder', "children": [item]}
            newstring = currentReview
            length = len(currentReview)
        else:
            length += len(currentReview)
            if len(newstring) == 0:
                newstring = currentReview
            else:
                newstring += " " + currentReview
            node["children"].append(item)
    if len(children) == 0:
        if len(node['children'])>1:
            node['name'] = summary(newstring)
            children.append(node)
            if p:
                print("RETURN TRUE")
            return children, True
        if p:
            print("RETURN FALSE")
        return lst, False
    # Flush whatever is still pending so the trailing node(s) are never lost.
    if node['children']:
        node['name'] = summary(newstring)
        children.append(node)
    if p:
        print("RETURN TRUE")
    return children, True

import copy

# Maximum combined length (in characters) of the texts grouped under one summary node.
max_summary_length = 1500

review_tree = build_review_tree(df.head(7000))
# dict.copy() is shallow: the nested "children" lists would be shared, so
# summarizing temp_tree would silently rewrite review_tree as well. A deep copy
# keeps review_tree as the raw, unsummarized tree.
temp_tree = copy.deepcopy(review_tree)
json_string2 = json.dumps(review_tree, indent=2)
for year_node in temp_tree["children"]:
    for rating_node in year_node['children']:
        for topic_node in rating_node['children']:
            # Keep adding summary layers until moreNodes reports nothing left to group.
            go_on = True
            while go_on:
                topic_node['children'], go_on = moreNodes(topic_node['children'], max_summary_length)


# Convert the summarized tree to JSON format
json_string = json.dumps(temp_tree, indent=2)

# Print the JSON structure
print(json_string)


In [None]:
def count_leaf_nodes(node):
    """Count the nodes below (and including) ``node`` that have no ``children`` key.

    A node that has a ``children`` key contributes only what its children
    contribute, so a node with an empty ``children`` list counts as zero.
    """
    if "children" in node:
        return sum(count_leaf_nodes(sub_node) for sub_node in node["children"])
    return 1  # no "children" key: this node is a leaf
def max_tree_depth(node):
    """Return the number of levels on the longest path from ``node`` down to a leaf.

    A node without a ``children`` key has depth 1, and so does a node whose
    ``children`` list is empty.
    """
    if "children" in node:
        deepest_child = max(
            (max_tree_depth(sub_node) for sub_node in node.get("children", [])),
            default=0,
        )
        return 1 + deepest_child
    return 1  # no "children" key: this node is a leaf


# Count the nodes of temp_tree that have no "children" key.
leaf_count = count_leaf_nodes(temp_tree)
print("Number of leaf nodes:", leaf_count)

In [None]:
# Number of levels on the longest root-to-leaf path of temp_tree (the root counts as one level).
depth = max_tree_depth(temp_tree)
print("Maximum depth of the tree:", depth)

In [None]:
def add_descriptions(node):
    """Return a copy of the tree with shortened names and full-text descriptions.

    For every node that has a ``name``: the original name is stored under
    ``description`` and ``name`` becomes a short label. Names shorter than 35
    characters are kept; longer ones become their first three space-separated
    words followed by ``.....``, or, if that is still longer than 35
    characters, the first 30 characters followed by ``.....``. Only ``name``,
    ``description`` and ``children`` are carried over to the new nodes.
    """
    new_node = {}

    if "name" in node:
        full_name = node["name"]
        words = full_name.split(" ")
        if len(full_name) < 35:
            label = full_name
        else:
            label = " ".join(words[:3]) + "....."
            if len(label) > 35:
                # Three words are still too long: cut on characters instead.
                label = full_name[:30] + "....."
        new_node["name"] = label
        new_node["description"] = full_name

    if "children" in node:
        new_node["children"] = [add_descriptions(sub_node) for sub_node in node["children"]]

    return new_node


# Build the display tree: shortened `name` labels plus the full text under `description`.
new_tree = add_descriptions(temp_tree)
print(new_tree)

In [None]:
df_data = []
def convert_to_df(node, parent_name=None):
    """Flatten the tree rooted at ``node`` into the module-level ``df_data`` list.

    One ``{"name", "description", "parent"}`` record is appended per node,
    where ``parent`` is the ``name`` of the parent node (``parent_name`` for
    the root). Children are appended before their parent (post-order).
    """
    global df_data
    record = dict(
        name=node["name"],
        description=node["description"],
        parent=parent_name,
    )
    for sub_node in node.get("children", []):
        convert_to_df(sub_node, node["name"])
    df_data.append(record)


# Flatten new_tree into df_data and turn the records into a DataFrame.
# NOTE(review): this rebinds `df`, which the earlier cells use for the reviews
# table (e.g. build_review_tree(df.head(7000))); re-running those cells after
# this one would operate on the flattened tree instead.
convert_to_df(new_tree)
df = pd.DataFrame(df_data)

# Display the DataFrame
print("DataFrame:")
display(df)





In [None]:
# Convert JSON tree to DataFrame
def convert_to_df(node, parent_name=None):
    """Flatten the tree rooted at ``node`` into the module-level ``df_data`` list.

    One ``{"name", "description", "parent"}`` record is appended per node,
    where ``parent`` is the ``name`` of the parent node (``parent_name`` for
    the root). A node is appended before its children (pre-order).
    """
    global df_data
    record = dict(
        name=node["name"],
        description=node["description"],
        parent=parent_name,
    )
    sub_nodes = node.get("children", [])
    df_data.append(record)
    for sub_node in sub_nodes:
        convert_to_df(sub_node, node["name"])

# Reset the accumulator so records from an earlier run are not duplicated,
# then flatten new_tree (parents before children) into a DataFrame.
# NOTE(review): this rebinds `df`, which the earlier cells use for the reviews table.
df_data = []
convert_to_df(new_tree)
df = pd.DataFrame(df_data)

# Display the DataFrame
print("DataFrame:")
display(df)

# Convert DataFrame back to JSON tree structure


In [None]:
def build_tree(data, parent_name=None):
    """Rebuild a nested tree from a flat name/description/parent table.

    Args:
        data: DataFrame with ``name``, ``description`` and ``parent`` columns,
            where ``parent`` holds the ``name`` of the parent row. Rows are
            linked purely by name, so two rows with the same ``name`` receive
            the same children.
        parent_name: Name whose children are returned. The default ``None``
            selects the root rows, i.e. rows whose parent is missing
            (``None``, ``NaN`` or ``pd.NA``).

    Returns:
        List of ``{"name", "description"}`` dicts in row order; a dict also
        gets a ``children`` list when at least one row names it as parent.
    """
    def _is_missing(value):
        # None, float NaN and pd.NA all mean "no parent".
        return value is None or value is pd.NA or (isinstance(value, float) and value != value)

    # Index the rows by parent once instead of rescanning the frame for every node.
    children_by_parent = {}
    for name, description, parent in zip(data["name"], data["description"], data["parent"]):
        key = None if _is_missing(parent) else parent
        children_by_parent.setdefault(key, []).append((name, description))

    def _subtree(parent):
        nodes = []
        for name, description in children_by_parent.get(parent, []):
            child = {
                "name": name,
                "description": description
            }
            child_children = _subtree(name)
            if child_children:
                child["children"] = child_children
            nodes.append(child)
        return nodes

    return _subtree(parent_name)

# Rebuild the nested tree from the flattened frame; at this point `df` is the
# name/description/parent table created from df_data, not the reviews table.
converted_json_tree = build_tree(df)

# Display the converted JSON tree
print("\nConverted JSON Tree:")
print(converted_json_tree)