Skip to content

Commit

Permalink
Migration from private to public repo
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanofasciani committed Jun 9, 2021
1 parent 87c4581 commit eefeb1a
Show file tree
Hide file tree
Showing 17 changed files with 3,645 additions and 4 deletions.
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.DS_Store
.vscode
.venv
.ipynb_checkpoints
.idea
.gradle
__pycache__
scholar.log
geckodriver.log
/resources/config.json
/lastrun.log
/resources/corrected/*
/nime/
/cache/
/output/
/ignore/
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

217 changes: 213 additions & 4 deletions README.md

Large diffs are not rendered by default.

661 changes: 661 additions & 0 deletions analysis_meta.py

Large diffs are not rendered by default.

135 changes: 135 additions & 0 deletions analysis_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# This file is part of the NIME Proceedings Analyzer (NIME PA)
# Copyright (C) 2021 Jackson Goode, Stefano Fasciani

# The NIME PA is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# The NIME PA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# If you use the NIME Proceedings Analyzer or any part of it in any program or
# publication, please acknowledge its authors by adding a reference to:

# S. Fasciani, J. Goode, 20 NIMEs: Twenty Years of New Interfaces for Musical
# Expression, in proceedings of 2021 International Conference on New Interfaces
# for Musical Expression, Shanghai, China, 2021.

# Native
import sys
if sys.version_info < (3, 7):
print("Please upgrade Python to version 3.7.0 or higher")
sys.exit()
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle
import argparse

# External
import matplotlib
from matplotlib import pyplot as plt
from scipy.optimize import curve_fit
from scipy.interpolate import UnivariateSpline
import numpy as np
from tqdm import tqdm
import pandas as pd

# Helper
import pa_print
from pa_utils import import_config

grobid_text_src = './cache/text/grobid/'
lda_src = './cache/lda/'
num_topics = 5


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='A script for querying search terms occurrence over time')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='prints out analysis process and results')
args = parser.parse_args()

# Sets global print command
pa_print.init(args)

# Print notice
pa_print.lprint()

keywords, _, _, selected_years = import_config('./resources/custom.csv') # ignore and merge words already processed
if len(keywords) == 0:
print('No keywords found! Please add keywords in ./resources/custom.csv.')
sys.exit()

if len(selected_years) != 0:
year_range = list(map(int,selected_years))
year_start, year_end = min(year_range), max(year_range) + 1
else:
year_start, year_end = 2001, 2021
year_range = range(year_start, year_end)

print(f'Searching for {keywords} in years {year_range}')

print('\nLoading bodies, dict, corpus, and model...')
processed_bodies = pickle.load(open(lda_src+'bodies.pkl', 'rb'))

# Create list to mark each text with year (will be linked to corpus values)
year_list = []
for i in os.listdir(grobid_text_src):
if i.startswith('grob_'):
name = i.split('grob_nime')[-1]
year = name.split('_')[0]
year_list.append((int(year), name))

keyword_frequency = pd.DataFrame(index = year_range, columns = keywords)

searched_words = dict()
year_counts = dict()
for i in year_range:
searched_words[i] = {}
year_counts[i] = 0

for year, doc in zip(year_list, processed_bodies):
year = year[0]
if year in year_range:
for term in keywords:
if searched_words[year].get(term):
searched_words[year][term] += doc.count(term) # update year total with current count
else: # initial entry
searched_words[year].update({term: doc.count(term)})

year_counts[year] += len(doc) # get total words/year

for year, search in searched_words.items():
for term in keywords:
search[term] = search[term] / year_counts[year]
keyword_frequency.at[year, term] = search[term]

# * Show searched words
plt.figure(figsize=(20,10))

x = [year for year in searched_words.keys()]
for word in keywords:
y = [search[word] for search in searched_words.values()]
plt.scatter(x, y, label=word)

# Spline
s = UnivariateSpline(x, y, s=5)
xs = np.linspace(year_start, year_end-1, 100)
ys = s(xs)
plt.plot(xs, ys, label=f'Spline for {word}')

plt.legend()
plt.xlabel('Year')
plt.ylabel('Frequency of Keyword within Paper')
plt.title('Frequency of Keyword over Publication Year')
plt.savefig('./output/keyword_occurrence.png')

with pd.ExcelWriter('./output/keyword_occurrence.xlsx') as writer:
keyword_frequency.to_excel(writer, sheet_name='Keyword Occurrence', header=True)
Loading

0 comments on commit eefeb1a

Please sign in to comment.