In [1]:
import pandas as pd
import biblib.bib
import re
import graphviz
import os
import sys
import math
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout, to_agraph
import pygraphviz
import matplotlib.pyplot as plt
from semanticscholar import SemanticScholar
from unidecode import unidecode
from alphabet_detector import AlphabetDetector
import seaborn as sns
from netgraph import Graph
import nltk
import collections
import re
import json

# Fetch the NLTK resources this notebook relies on: the English stop-word
# list and the Punkt models used by nltk.word_tokenize.
# NLTK >= 3.8.2 loads the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so both are requested to keep tokenization working on either version.
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
nltk.download('punkt')
nltk.download('punkt_tab')

In [2]:
# Glossary file contains:
#   - all papers from the initial selection (65)
#   - all the FS and BS papers obtained from the query with proximity constraint
def get_glossary(path='Glossary.bib'):
    """Parse a BibTeX file and return its entries.

    Args:
        path: Location of the BibTeX file to read. Defaults to 'Glossary.bib'
            (resolved against the current working directory).

    Returns:
        The object returned by biblib's ``get_entries()``; the rest of the
        notebook uses it as a mapping looked up by paper name.

    Parser diagnostics are written to ``sys.stderr``.
    """
    with open(path, 'r') as glossary_file:
        return biblib.bib.Parser().parse(glossary_file, log_fp=sys.stderr).get_entries()


glossary = get_glossary()

In [3]:
# Split the graph stored in 'ALL_selection_only.dot' into connected components
# and collect, per component, the glossary metadata of every paper in it.
selection_graph = nx.Graph(nx.nx_pydot.read_dot('ALL_selection_only.dot'))
components = list(nx.connected_components(selection_graph))
components_info = []

for component in components:
    cur_component = {}

    for paper in component:
        # Node names can carry a literal backslash-n sequence; strip it once
        # so the name can be used as a glossary key.
        glossary_key = paper.replace('\\n', '')

        if glossary_key not in glossary:
            # Skip nodes that have no glossary entry. Falling through here
            # would silently copy the previous paper's metadata onto this
            # node, or raise NameError if it is the very first node visited.
            print(f"Paper not in Glossary: {paper}")
            continue

        cur_paper = glossary[glossary_key]
        paper_info = {
            'title': cur_paper['title'],
            'author': cur_paper['author'],
        }

        # 'journal' and 'abstract' are optional: report when one is absent
        # instead of failing.
        for field in ('journal', 'abstract'):
            if field in cur_paper:
                paper_info[field] = cur_paper[field]
            else:
                print(f"missing {field}: {paper}")

        cur_component[paper] = paper_info

    # Appended even when empty so that components_info[i] keeps describing
    # components[i].
    components_info.append(cur_component)

Paper not in Glossary: \n


In [44]:
# Concatenate the titles and abstracts of one component into a single string
# that the token-counting helpers can consume.

# 0, 2, 8  (NOTE(review): presumably the component indices of interest - confirm)
cur_comp_info = components_info[2]

text_parts = []
for paper, cur_paper in cur_comp_info.items():
    text_parts.append(cur_paper['title'])
    # The abstract is optional in components_info (papers reported as
    # "missing abstract" have no such key), so do not index it directly.
    abstract = cur_paper.get('abstract')
    if abstract:
        text_parts.append(abstract)

# Every part is prefixed with a single space, matching the previous
# incremental concatenation.
title_abstr_str = ''.join(f" {part}" for part in text_parts)

def remove_punctuation(input_string):
    """Return input_string with every character that is neither a word
    character (letters, digits, underscore) nor whitespace removed."""
    non_word_non_space = r'[^\w\s]'
    cleaned = re.sub(non_word_non_space, '', input_string)
    return cleaned

def remove_stopwords(input_string):
    """Tokenize input_string and return its non-stop-word tokens plus n-grams.

    The text is lower-cased and split with nltk.word_tokenize; tokens found in
    the module-level stop_words set are dropped. Every bigram and trigram of
    the remaining tokens is then appended, written as the tokens joined by
    '_' with apostrophes removed.

    Returns:
        A list holding the kept tokens followed by the n-gram strings.
    """
    words = []
    for word in nltk.word_tokenize(input_string.lower()):
        if word not in stop_words:
            words.append(word)

    ngrams = []
    for gram in nltk.everygrams(words, 2, 3):
        ngrams.append('_'.join(gram).replace("'", ''))

    return words + ngrams


def count_tokens(input_string):
    """Count how often each token and n-gram occurs in input_string.

    Punctuation is removed first (remove_punctuation), then the text is
    tokenized and expanded by remove_stopwords.

    Returns:
        A collections.Counter mapping each token to its number of occurrences.
    """
    without_punctuation = remove_punctuation(input_string)
    counts = collections.Counter()
    counts.update(remove_stopwords(without_punctuation))
    return counts

# Count tokens in the combined title/abstract text and show them from most
# to least frequent (ties keep their first-seen order).
token_count = count_tokens(title_abstr_str)
sorted_token_count = dict(token_count.most_common())

print(sorted_token_count)

{'research': 67, 'reviews': 58, 'topic': 57, 'study': 46, 'analysis': 45, 'topics': 45, 'online': 45, 'sentiment': 31, 'data': 29, 'quality': 27, 'results': 24, 'service': 23, 'satisfaction': 23, 'using': 22, 'text': 22, 'online_reviews': 22, 'structural': 20, 'structural_topic': 20, 'attributes': 19, 'modeling': 19, 'approach': 18, 'also': 18, 'factors': 18, 'sentiment_analysis': 18, 'topic_modeling': 18, 'social': 17, 'customer': 17, 'information': 17, 'latent': 16, 'consumer': 16, 'major': 15, 'consumers': 15, 'used': 14, 'use': 14, 'review': 14, 'issues': 14, 'services': 14, 'covid19': 14, 'literature': 13, 'learning': 13, 'identify': 13, 'insights': 13, 'models': 12, 'methods': 12, 'findings': 12, 'trends': 12, 'customers': 12, 'service_quality': 12, 'human': 11, 'modelling': 11, 'airbnb': 11, 'users': 11, 'mining': 11, 'computing': 11, 'course': 11, 'important': 10, 'time': 10, 'based': 10, 'education': 10, 'twitter': 10, 'content': 10, 'big': 10, 'employee': 10, 'industry': 10, 

In [42]:
# List the titles of the papers in the currently selected component.
for paper, paper_info in cur_comp_info.items():
    print(paper_info['title'])

Transfer Topic Labeling with Domain-Specific Knowledge Base: An Analysis of UK House of Commons Speeches 1935-2014
Harnessing the ``wisdom of employees'' from online reviews
Employing structural topic modelling to explore perceived service quality attributes in {Airbnb} accommodation
A {Structural} {Topic} {Modeling}-{Based} {Bibliometric} {Study} of {Sentiment} {Analysis} {Literature}
The Politics of Scrutiny in Human Rights Monitoring: Evidence from Structural Topic Models of US State Department Human Rights Reports
A text analytics approach for online retailing service improvement: Evidence from Twitter
Quality 4.0: big data analytics to explore service quality attributes and their relation to user sentiment in {Airbnb} reviews
Latent Dirichlet allocation (LDA) for topic modeling of the CFPB consumer complaints
Job satisfaction and employee turnover determinants in high contact services: {Insights} from {Employees}'{Online} reviews
W2VLDA: Almost unsupervised system for Aspect Based