In [1]:
import os
import sys
import psycopg2

sys.path.insert(0, os.path.abspath('../'))  # add the current module so that we can import the utils file

from aip_tools.utils import get_top_keywords_for_query, create_df_for_query

[nltk_data] Downloading package stopwords to /home/dev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
db = psycopg2.connect(user="lvs215",
                                   password="",
                                   host="localhost",
                                   port="5432",
                                   database="lvs215")
start_year = 2011  # inclusive
end_year = 2020  # inclusive

In [7]:
corpus_query = """
    SELECT * 
    FROM publications"""

corpus_df = create_df_for_query(db, corpus_query)

In [8]:
top_keywords_for_query = get_top_keywords_for_query(conn, corpus_df,
                                                    "SELECT * FROM publications WHERE year between {} and {}".format(
                                                        start_year, end_year), 50)

In [9]:
custom_stopwords_for_query = ["based", "method", "service", "approach", "problem", "computing", "proposed"]

# Print the entire LaTeX table based on the top_keywords_for_query list
# Generate overall trends across my entire database
print("""\\begin{{table}}[t]
\\caption{{Top-10 keywords in system-venue articles published between {} and {}.}}
\\label{{tbl:top-10-overall}}
\\vspace{{-0.2cm}}
\\begin{{adjustbox}}{{max width=\\columnwidth}}
\\begin{{tabular}}{{lllllllllll}}
\\toprule
Rank & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\\\ \\midrule
Word & {} \\\\ \\bottomrule
\\end{{tabular}}
\\end{{adjustbox}}
\\vspace{{-0.3cm}}
\\end{{table}}\n""".format(start_year, end_year, " & ".join([x for x in top_keywords_for_query if x not in custom_stopwords_for_query][:10])))


\begin{table}[t]
\caption{Top-10 keywords in system-venue articles published between 2011 and 2020.}
\label{tbl:top-10-overall}
\vspace{-0.2cm}
\begin{adjustbox}{max width=\columnwidth}
\begin{tabular}{lllllllllll}
\toprule
Rank & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\ \midrule
Word & data & system & network & algorithm & model & cloud & performance & application & time & user \\ \bottomrule
\end{tabular}
\end{adjustbox}
\vspace{-0.3cm}
\end{table}



In [10]:
corpus_query = """
    SELECT * 
    FROM publications 
    WHERE year BETWEEN %s AND %s"""

corpus_df = create_df_for_query(db, corpus_query, [start_year, end_year])

In [11]:
# Based on the corpus defined above, get the top 50 keywords for the sub query.
top_keywords_for_query = get_top_keywords_for_query(db, corpus_df,
                                                    """SELECT * 
                                                    FROM publications 
                                                    WHERE year between %s and %s
                                                    AND (lower(title) like %s or lower(abstract) like %s) 
                                                    AND (lower(title) like %s or lower(abstract) like %s)""",
                                                    50, [2011, 2020, '%workflow%', '%workflow%', '%schedul%', '%schedul%'])

print(top_keywords_for_query)

['workflow', 'scheduling', 'cloud', 'resource', 'algorithm', 'task', 'scientific', 'execution', 'application', 'computing', 'cost', 'data', 'time', 'deadline', 'system', 'environment', 'performance', 'schedule', 'based', 'scheduler', 'proposed', 'approach', 'makespan', 'problem', 'provisioning', 'aware', 'constraint', 'job', 'intensive', 'model', 'objective', 'distributed', 'service', 'multi', 'heuristic', 'optimization', 'user', 'constrained', 'different', 'grid', 'heterogeneous', 'real', 'multiple', 'dependency', 'method', 'strategy', 'simulation', 'hybrid', 'energy', 'dynamic']


In [12]:
# Print the entire LaTeX table based on the top_keywords_for_query list
custom_stopwords_for_query = ["resource", "scientific", "system", "execution", "based", "proposed", "service", 
                              "computing", "approach", "problem"]

print("""\\begin{{table}}[t]
\\caption{{Top-10 keywords in articles on scheduling workflow published between {} and {}.}}
\\label{{tbl:top-10-scheduling-workflow-overall}}
\\vspace{{-0.2cm}}
\\begin{{adjustbox}}{{max width=\\columnwidth}}
\\begin{{tabular}}{{lllllllllll}}
\\toprule
Rank & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\\\ \\midrule
Word & {} \\\\ \\bottomrule
\\end{{tabular}}
\\end{{adjustbox}}
\\vspace{{-0.3cm}}
\\end{{table}}\n""".format(start_year, end_year, " & ".join([x for x in top_keywords_for_query if x not in custom_stopwords_for_query][:10])))


\begin{table}[t]
\caption{Top-10 keywords in articles on scheduling workflow published between 2011 and 2020.}
\label{tbl:top-10-scheduling-workflow-overall}
\vspace{-0.2cm}
\begin{adjustbox}{max width=\columnwidth}
\begin{tabular}{lllllllllll}
\toprule
Rank & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\ \midrule
Word & workflow & scheduling & cloud & algorithm & task & application & cost & data & time & deadline \\ \bottomrule
\end{tabular}
\end{adjustbox}
\vspace{-0.3cm}
\end{table}



In [13]:
keywords_per_year = dict()
num_keywords = 10

In [14]:
custom_stopwords_for_query = set(["resource", "execution", "scientific", "service", "management", "based", 
                              "computing", "schedule", "approach", "different", "distributed", "science", 
                              "system", "intensive", "aware", "executing", "science", "file", "characteristic", 
                              "complex", "bi", "proposed", "transfer", "hybrid", "directed", "constraint", 
                              "constrained", "multi", "multiple", "scientist", "tolerance"])
for year in range(start_year, end_year + 1):
    query = "SELECT * FROM publications WHERE year = ? and (lower(title) like '%workflow%' or lower(abstract) like '%workflow%') and (lower(title) like '%schedul%' or lower(abstract) like '%schedul%')"
    keywords = [x for x in get_top_keywords_for_query(conn, corpus_df, query, 50, [year]) if x not in custom_stopwords_for_query]
    print(keywords)  # Visual dump to see if we need to add words to the stopwords set
    keywords_per_year[year] = keywords[:num_keywords]

DatabaseError: Execution failed on sql 'SELECT * FROM publications WHERE year = ? and (lower(title) like '%workflow%' or lower(abstract) like '%workflow%') and (lower(title) like '%schedul%' or lower(abstract) like '%schedul%')': list index out of range

In [None]:
# Print the entire LaTeX table based on the keywords_per_year dict
print("""
\\begin{{table}}[t]
\\caption{{Top-10 keywords per year in articles on scheduling workflows published between {} and {}.}}
\\label{{tbl:top-10-scheduling-workflow-per-year}}
\\vspace{{-0.2cm}}
\\begin{{adjustbox}}{{max width=\columnwidth}}
\\begin{{tabular}}{{rllllllllll}}
\\toprule""".format(start_year, end_year))
print("Rank & {} \\\\ \\midrule".format(" & ".join([str(x) for x in range(start_year, end_year + 1)])))

for rank in range(1, num_keywords + 1):
    line = "{0} & ".format(rank)
    for year in range(start_year, end_year + 1):
        line += "{0} & ".format(keywords_per_year[year][rank - 1])

    line = line.rstrip(" & ")
    line += " \\\\"
    if rank == 5:
        line += " \\midrule"

    if rank == num_keywords:
        line += " \\bottomrule"

    print(line)
print("""\end{tabular}
\\end{adjustbox}
\\vspace{-0.3cm}
\\end{table}""")