In [6]:
# Packages for this notebook
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from googlesearch import search
import requests
from bs4 import BeautifulSoup

In [2]:
# Read the awards table directly from its GitHub-hosted CSV file.
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
csv_file = "https://github.com/jmstanto/oslineage/raw/main/NSFopenSourceAwards.csv"
award_data = pd.read_csv(filepath_or_buffer=csv_file, encoding="ISO-8859-1")


In [3]:
# Diagnostic plots throughout the notebook are drawn only when this flag is True
show_graphs = False

# Count the non-null entries of every column, grouped by organization state
state_groups = award_data.groupby('OrganizationState').count()

if show_graphs:
  # Order the states by their award count and draw the distribution as a line
  sorted_df = state_groups.sort_values('AwardNumber')
  plt.figure(figsize=(8,4))
  sns.lineplot(data=sorted_df, x="OrganizationState", y="AwardNumber", sort=False)
  plt.xticks(rotation=90, fontsize=8)
  plt.show()

In [4]:
# Reformat numeric data giving the dollar amounts of awards.
# 'AwardedAmountToDate' holds currency strings (presumably like "$1,499,353.00";
# the previous version of this cell dropped the first character and the last
# three). They are converted to whole dollars in a new 'AwardedAmount' column.
awarded_amount = award_data['AwardedAmountToDate']

# Strip the currency symbol and the thousands separators, then parse the whole
# column at once. Parsing the number, rather than slicing a fixed number of
# characters, keeps the result correct for values without a cents part.
# astype('int64') truncates any cents and raises if a value is missing, so bad
# rows are reported here instead of silently becoming wrong amounts.
amounts = (
    pd.to_numeric(awarded_amount.str.replace(r'[$,]', '', regex=True))
    .astype('int64')
    .tolist()
)

award_data['AwardedAmount'] = amounts

if show_graphs:
  # display awarded amounts as histogram
  award_data.hist(column='AwardedAmount', bins=20)
  plt.show()

In [5]:
# Build a table of (principal investigator, co-investigator) pairs
pis_df = award_data[['PrincipalInvestigator', 'Co-PIName(s)']].copy()

# Each co-PI cell is split on ", " into a list of names; cells that do not
# split into a list are skipped by the isinstance check below.
copis = award_data['Co-PIName(s)'].str.split(pat=", ")

pairs = [
    (pi, copi)
    for index, pi in award_data['PrincipalInvestigator'].items()
    if isinstance(copis[index], list)
    for copi in copis[index]
]

pairs_df = pd.DataFrame(pairs)

if show_graphs:
  # Draw the pairs as an undirected network: one edge per pi / co-pi pair
  G = nx.Graph()
  G.add_edges_from(pairs)
  nx.draw(G, node_size=5, font_size=5, pos=nx.spring_layout(G))
  plt.show()

In [7]:
# Build the tf-idf matrix of the award abstracts: one row per abstract,
# one column per vocabulary term.
all_stopwords = list(text.ENGLISH_STOP_WORDS)
tfidf_vect = TfidfVectorizer(stop_words=all_stopwords, lowercase=True)

# The vectorizer returns a sparse matrix; it is expanded to a dense array so
# each row can be handled as an ordinary pandas Series. The arguments are
# evaluated left to right, so the vectorizer is fitted before its feature
# names are requested.
tfidf = pd.DataFrame(
    tfidf_vect.fit_transform(award_data['Abstract']).toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)

tfidf.head()

Unnamed: 0,000,0001,00131600,060,097,0d,10,100,1000,1000b,...,zoom,zooniverse,zoonotic,zork,zpc,zstandard,ztc,ztf,ztx,zwicky
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.073827,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Report the dimensions of the tf-idf matrix (rows, then columns)
n_rows, n_columns = tfidf.shape

print("Rows:", n_rows)
print("Columns:", n_columns)


Rows: 2205
Columns: 18753


In [8]:
# Create the list of keywords for each abstract based on tf-idf.
# A term is a keyword of an abstract when its tf-idf score in that row exceeds
# the row mean by more than `keyword_std_multiplier` standard deviations.
keyword_std_multiplier = 20

keylist = []
for _, row in tfidf.iterrows():
  # Highest scores first, so the keywords come out ordered by importance
  sorted_row = row.sort_values(ascending=False)

  # The cut-off depends only on the row, so it is computed once per row
  # rather than once per candidate word.
  threshold = sorted_row.mean() + keyword_std_multiplier * sorted_row.std()

  # Because the row is sorted in descending order, the scores above the
  # cut-off form a prefix of it; the filter keeps exactly that prefix.
  keywords = sorted_row[sorted_row > threshold].index.tolist()
  keylist.append(keywords)

# Add the keywords to the original data set
award_data['Keywords'] = keylist
award_data.head()

Unnamed: 0,AwardNumber,Title,NSFOrganization,Program(s),StartDate,LastAmendmentDate,PrincipalInvestigator,State,Organization,AwardInstrument,...,OrganizationState,OrganizationZip,OrganizationPhone,NSFDirectorate,ProgramElementCode(s),ProgramReferenceCode(s),ARRAAmount,Abstract,AwardedAmount,Keywords
0,2303740,POSE: Phase II: Building open source ecosystem...,TI,POSE,10/01/2023,09/20/2023,Karmen Condic-Jurkic,CA,OPEN MOLECULAR SOFTWARE FOUNDATION,Standard Grant,...,CA,956164548,9493852000.0,TIP,211Y00,,$0.00,Karmen Condic-Jurkic and David Mobley of the O...,1499353,"[omsf, molecular, projects, sciences]"
1,2247929,A Learning Environment for an Open-Source Cont...,DUE,HSI-Hispanic Serving Instituti,07/01/2023,04/27/2023,Igor Fabio Steinmacher,AZ,Northern Arizona University,Standard Grant,...,AZ,86011,9285231000.0,EDU,077Y00,"8209, 9178",$0.00,With support from the Improving Undergraduate ...,498616,"[oss, doorway, contribution, learners, hsis, s..."
2,2050195,REU Site: The future of discovery: training st...,OAC,RSCH EXPER FOR UNDERGRAD SITES,04/15/2021,05/21/2021,Volodymyr Kindratenko,IL,University of Illinois at Urbana-Champaign,Standard Grant,...,IL,618013620,2173332000.0,CSE,113900,"075Z, 079Z, 9250",$0.00,Machine learning is a powerful tool that has b...,405000,"[machine, learning, students, mentors, apply, ..."
3,1749635,CAREER: Software Reliability and Security Risk...,CNS,"Special Projects - CNS, CSR-Computer Systems R...",09/01/2018,07/18/2023,Lance Fiondella,MA,"University of Massachusetts, Dartmouth",Continuing Grant,...,MA,27472356,5089999000.0,CSE,"171400, 735400","1045, 120Z, 6194, 7354, 9178, 9251",$0.00,Consistent growth in the software sector of th...,614957,"[srt, hybridize, educator, software, fitting, ..."
4,2333297,Workshop on Effective Practices to Support Ope...,DUE,IUSE,01/01/2024,08/30/2023,Steven Clontz,AL,University of South Alabama,Standard Grant,...,AL,366083053,2514606000.0,EDU,199800,"7556, 8209, 9150, 9178",$0.00,This project aims to serve the national intere...,49833,"[creators, conference, stem, technologies, dev..."


In [12]:
# Parameters that control the search process.
# Searching is bandwidth-limited and Colab limits runtime, so these
# controls allow the work to be done in selective batches.

# Half-open range of award_data records to search: start included, end excluded
start_record, end_record = 0, 10

# Number of search results requested for each query
how_many_urls = 5

# A result url containing any of these substrings is discarded
exclusion_list = [
    "README",
    "/discussions",
    "/followers",
    "/guides",
    "/insights",
    "/issues",
    "/orgs",
    "/starred",
]

In [13]:
# Run one web search per record in the batch defined by the parameters above

for i in range(start_record, end_record):
  # The search string is the record's keywords, restricted to github.com
  query = " ".join(list(award_data['Keywords'][i]) + ["site:github.com"])

  # Go through every returned url and keep those that contain none of the
  # undesirable strings
  url_list = [
      url
      for url in search(query, tld="com", num=how_many_urls, stop=how_many_urls, pause=2)
      if not any(exclusion in url for exclusion in exclusion_list)
  ]

  # One search is complete: store the first acceptable url, or the "NA"
  # marker when no url survived the filter
  award_data.at[i, 'GithubUrl'] = url_list[0] if url_list else "NA"

award_data.head()

Unnamed: 0,AwardNumber,Title,NSFOrganization,Program(s),StartDate,LastAmendmentDate,PrincipalInvestigator,State,Organization,AwardInstrument,...,OrganizationZip,OrganizationPhone,NSFDirectorate,ProgramElementCode(s),ProgramReferenceCode(s),ARRAAmount,Abstract,AwardedAmount,Keywords,GithubUrl
0,2303740,POSE: Phase II: Building open source ecosystem...,TI,POSE,10/01/2023,09/20/2023,Karmen Condic-Jurkic,CA,OPEN MOLECULAR SOFTWARE FOUNDATION,Standard Grant,...,956164548,9493852000.0,TIP,211Y00,,$0.00,Karmen Condic-Jurkic and David Mobley of the O...,1499353,"[omsf, molecular, projects, sciences]",https://github.com/dwhswenson
1,2247929,A Learning Environment for an Open-Source Cont...,DUE,HSI-Hispanic Serving Instituti,07/01/2023,04/27/2023,Igor Fabio Steinmacher,AZ,Northern Arizona University,Standard Grant,...,86011,9285231000.0,EDU,077Y00,"8209, 9178",$0.00,With support from the Improving Undergraduate ...,498616,"[oss, doorway, contribution, learners, hsis, s...",https://github.com/UKPLab/argument-reasoning-c...
2,2050195,REU Site: The future of discovery: training st...,OAC,RSCH EXPER FOR UNDERGRAD SITES,04/15/2021,05/21/2021,Volodymyr Kindratenko,IL,University of Illinois at Urbana-Champaign,Standard Grant,...,618013620,2173332000.0,CSE,113900,"075Z, 079Z, 9250",$0.00,Machine learning is a powerful tool that has b...,405000,"[machine, learning, students, mentors, apply, ...",https://github.com/jingwu6
3,1749635,CAREER: Software Reliability and Security Risk...,CNS,"Special Projects - CNS, CSR-Computer Systems R...",09/01/2018,07/18/2023,Lance Fiondella,MA,"University of Massachusetts, Dartmouth",Continuing Grant,...,27472356,5089999000.0,CSE,"171400, 735400","1045, 120Z, 6194, 7354, 9178, 9251",$0.00,Consistent growth in the software sector of th...,614957,"[srt, hybridize, educator, software, fitting, ...",https://github.com/tangwen-qian/DailyArXiv
4,2333297,Workshop on Effective Practices to Support Ope...,DUE,IUSE,01/01/2024,08/30/2023,Steven Clontz,AL,University of South Alabama,Standard Grant,...,366083053,2514606000.0,EDU,199800,"7556, 8209, 9150, 9178",$0.00,This project aims to serve the national intere...,49833,"[creators, conference, stem, technologies, dev...",https://github.com/svaksha/diversity-index/blo...


In [15]:
# Show the first ten records of award_data
award_data.head(10)

Unnamed: 0,AwardNumber,Title,NSFOrganization,Program(s),StartDate,LastAmendmentDate,PrincipalInvestigator,State,Organization,AwardInstrument,...,OrganizationZip,OrganizationPhone,NSFDirectorate,ProgramElementCode(s),ProgramReferenceCode(s),ARRAAmount,Abstract,AwardedAmount,Keywords,GithubUrl
0,2303740,POSE: Phase II: Building open source ecosystem...,TI,POSE,10/01/2023,09/20/2023,Karmen Condic-Jurkic,CA,OPEN MOLECULAR SOFTWARE FOUNDATION,Standard Grant,...,956164548,9493852000.0,TIP,211Y00,,$0.00,Karmen Condic-Jurkic and David Mobley of the O...,1499353,"[omsf, molecular, projects, sciences]",https://github.com/dwhswenson
1,2247929,A Learning Environment for an Open-Source Cont...,DUE,HSI-Hispanic Serving Instituti,07/01/2023,04/27/2023,Igor Fabio Steinmacher,AZ,Northern Arizona University,Standard Grant,...,86011,9285231000.0,EDU,077Y00,"8209, 9178",$0.00,With support from the Improving Undergraduate ...,498616,"[oss, doorway, contribution, learners, hsis, s...",https://github.com/UKPLab/argument-reasoning-c...
2,2050195,REU Site: The future of discovery: training st...,OAC,RSCH EXPER FOR UNDERGRAD SITES,04/15/2021,05/21/2021,Volodymyr Kindratenko,IL,University of Illinois at Urbana-Champaign,Standard Grant,...,618013620,2173332000.0,CSE,113900,"075Z, 079Z, 9250",$0.00,Machine learning is a powerful tool that has b...,405000,"[machine, learning, students, mentors, apply, ...",https://github.com/jingwu6
3,1749635,CAREER: Software Reliability and Security Risk...,CNS,"Special Projects - CNS, CSR-Computer Systems R...",09/01/2018,07/18/2023,Lance Fiondella,MA,"University of Massachusetts, Dartmouth",Continuing Grant,...,27472356,5089999000.0,CSE,"171400, 735400","1045, 120Z, 6194, 7354, 9178, 9251",$0.00,Consistent growth in the software sector of th...,614957,"[srt, hybridize, educator, software, fitting, ...",https://github.com/tangwen-qian/DailyArXiv
4,2333297,Workshop on Effective Practices to Support Ope...,DUE,IUSE,01/01/2024,08/30/2023,Steven Clontz,AL,University of South Alabama,Standard Grant,...,366083053,2514606000.0,EDU,199800,"7556, 8209, 9150, 9178",$0.00,This project aims to serve the national intere...,49833,"[creators, conference, stem, technologies, dev...",https://github.com/svaksha/diversity-index/blo...
5,2303582,POSE: Phase II: Cultivating Modeling Literacy...,TI,POSE,09/15/2023,04/15/2024,Uri Wilensky,IL,Northwestern University,Standard Grant,...,602080001,3125038000.0,TIP,211Y00,,$0.00,This project is funded by Pathways to Enable O...,1519990,"[netlogo, modeling, agent, ose]",https://github.com/mas178/social-simulation
6,2107298,HCC: Medium: Designing Human-Centered Environm...,IIS,HCC-Human-Centered Computing,08/15/2021,04/11/2024,Laura Dabbish,PA,Carnegie-Mellon University,Standard Grant,...,152133815,4122689000.0,CSE,736700,"7367, 7924, 9251",$0.00,To increase inclusion and enhance diversity at...,820000,"[participation, enhance, environments, genders...",https://github.com/murrayds/elife-analysis/blo...
7,2303748,POSE: Phase II: CONNECT: Consortium of Open-so...,TI,POSE,08/15/2023,08/08/2023,Xuesong Zhou,AZ,Arizona State University,Standard Grant,...,852813670,4809655000.0,TIP,211Y00,,$0.00,"This project, funded by Pathways to Enable Ope...",1500000,"[multimodal, transportation, gmns]",https://github.com/zephyr-data-specs/GMNS
8,2054516,Collaborative Research: OpenDendro - Advanced ...,AGS,Paleoclimate,06/01/2021,05/21/2021,Kevin Anchukaitis,AZ,University of Arizona,Standard Grant,...,85721,5206266000.0,GEO,153000,,$0.00,The research team aims to create openDendro as...,143148,"[dendrochronology, opendendro, ring, legacy, t...",
9,2230153,POSE: Phase I: An Open-Source Ecosystem for th...,TI,POSE,09/15/2022,09/09/2022,Steven Clontz,AL,University of South Alabama,Standard Grant,...,366083053,2514606000.0,TIP,211Y00,9150,$0.00,This project is funded by Pathways to Enable O...,267268,"[oer, textbooks, authors, stem, instructors, p...",https://github.com/uoregon-libraries/oregonnew...


In [20]:
# Inspect the Python type of the value stored in 'GithubUrl' for the record labelled 8
type(award_data["GithubUrl"][8])

str

In [19]:
# Manual check: request one specific GitHub file url and display the response object
requests.get("https://github.com/uoregon-libraries/oregonnews/blob/master/core/fixtures/jamaica_sample.json")

<Response [200]>

In [22]:
# Find and store the date of last update for each record in the batch.
# `dates` receives exactly one entry per record in start_record..end_record:
#   - the 'datetime' attribute of the first <relative-time> element on the page,
#   - "date of last update not found" when the page has no such element/attribute,
#   - "request failed" when the page could not be fetched,
#   - "N/A" when the record has no url.
request_timeout = 10  # seconds to wait for a page before giving up on that url
dates = []

for i in range(start_record, end_record):
  u = award_data['GithubUrl'][i]

  # No usable url: either the "NA" marker written by the search step, or a
  # non-string value (e.g. NaN) for a record that has not been searched yet.
  if not isinstance(u, str) or u == "NA":
    dates.append("N/A")
    continue

  print(u)
  try:
    # Without a timeout a single unresponsive server would stall the batch
    response = requests.get(u, timeout=request_timeout)
  except requests.RequestException as err:
    # Record the failure and move on, so one bad url does not discard the
    # dates already collected and `dates` stays aligned with the records.
    print("request failed:", err)
    dates.append("request failed")
    continue

  soup = BeautifulSoup(response.text, 'html.parser')
  updated_date = soup.find('relative-time')

  # .get() returns None, instead of raising, when the attribute is absent
  date = updated_date.get('datetime') if updated_date else None
  dates.append(date if date else "date of last update not found")

for d in dates:
  print(d)

# NOTE(review): `dates` covers only the records in start_record..end_record, so
# the assignment below works only when that range spans every row of award_data.
#award_data['LastUpdated'] = dates
#award_data.to_csv('award_data.csv')

https://github.com/dwhswenson
https://github.com/UKPLab/argument-reasoning-comprehension-task/blob/master/mturk/annotation-task/data/exported-1927-summarized-arguments/argument-summarized.tsv
https://github.com/jingwu6
https://github.com/tangwen-qian/DailyArXiv
https://github.com/svaksha/diversity-index/blob/main/di-conf-events.md
https://github.com/mas178/social-simulation
https://github.com/murrayds/elife-analysis/blob/master/elife_final.bib
https://github.com/zephyr-data-specs/GMNS
https://github.com/uoregon-libraries/oregonnews/blob/master/core/fixtures/jamaica_sample.json
date of last update not found
date of last update not found
date of last update not found
date of last update not found
date of last update not found
date of last update not found
date of last update not found
2024-06-14T14:05:00Z
N/A
date of last update not found
