In [1]:
# !pip install PyYAML python-dotenv

In [5]:
import requests
import yaml
import os
from dotenv import load_dotenv

def fetch_gov_github_accounts(url):
    response = requests.get(url)
    if response.status_code == 200:
        return yaml.safe_load(response.text)
    else:
        return None

def fetch_repository_details(username, token):
    headers = {'Authorization': f'token {token}'}
    repos_url = f"https://api.github.com/users/{username}/repos"
    repos_response = requests.get(repos_url, headers=headers)
    
    if repos_response.status_code == 200:
        repos_data = repos_response.json()
        full_repo_details = []
        
        for repo in repos_data:
            repo_details = {
                'name': repo['name'],
                'description': repo['description'] or "No description",
                'stars': repo['stargazers_count'],
                'forks': repo['forks'],
                'language': repo['language'] or "None specified"
            }
            # Fetch the README file
            readme_url = f"https://api.github.com/repos/{username}/{repo['name']}/readme"
            readme_response = requests.get(readme_url, headers=headers)
            if readme_response.status_code == 200:
                readme_data = readme_response.json()
                readme_content = requests.get(readme_data['download_url']).text
                repo_details['readme'] = readme_content[:100]  # Truncate for brevity
            else:
                repo_details['readme'] = "README not available"
            
            full_repo_details.append(repo_details)
        
        return full_repo_details
    else:
        return None

def save_to_markdown(repos, filename):
    with open(filename, 'w') as f:
        f.write('| Repository Name | Description | Stars | Language | README |\n')
        f.write('|-----------------|-------------|-------|----------|--------|\n')
        for repo in repos:
            f.write(f"| {repo['name']} | {repo['description']} | {repo['stars']} | {repo['language']} | {repo['readme'][:50]}... |\n")


# Load environment variables from .env file
load_dotenv('.env')

# Load environment variables from .env file
load_dotenv('.env')

# Access environment variables
github_token = os.getenv('GITHUB_TOKEN')
if not github_token:
    raise ValueError("GITHUB_TOKEN not found in environment variables. Please check your .env file.")


# Main execution
url = "https://raw.githubusercontent.com/github/government.github.com/gh-pages/_data/governments.yml"
accounts = fetch_gov_github_accounts(url)

accounts


{'Argentina': ['argob',
  'cifasis',
  'gcba',
  'inti-cmnb',
  'municipalidad-de-vicente-lopez',
  'municipioriogrande'],
 'Australia': ['actesa',
  'actgov',
  'agnsw',
  'AtlasOfLivingAustralia',
  'ausdto',
  'australianantarcticdatacentre',
  'AustralianAntarcticDivision',
  'berowrarfb',
  'bom-radar',
  'city-of-melbourne',
  'commerce-wa-ols',
  'consumerdataright',
  'data61',
  'datagovau',
  'dbca-wa',
  'dpc-sdp',
  'dpipwe',
  'dssgovaus',
  'envris',
  'Fire-and-Rescue-NSW',
  'gccgisteam',
  'GeoscienceAustralia',
  'govau',
  'govcms',
  'gs-dawr',
  'healthgovau',
  'Healthway',
  'hiscom',
  'innovationgovau',
  'IPAustralia',
  'Landgate',
  'nla',
  'NSW-eTendering',
  'NSWPlanning',
  'pmcau',
  'PublicRecordOfficeVictoria',
  'qld-gov-au',
  'srnsw',
  'SunshineCoastCouncil',
  'treasury-aus',
  'victoriangovernment',
  'wagov',
  'wamuseum'],
 'Austria': ['datagvat'],
 'Belgium': ['belgianpolice',
  'CIRB',
  'Fedict',
  'inbo',
  'NationalBankBelgium',
  'onroer

In [None]:
import pandas as pd


all_repos = []
if accounts:
    for country, usernames in accounts.items():
        for username in usernames:
            repo_details = fetch_repository_details(username, github_token)
            if repo_details:
                all_repos.extend(repo_details)
                print(f"Data for {username} fetched and processed.")
            else:
                print(f"Failed to fetch data for {username}")

# Create DataFrame and save to Markdown
if all_repos:
    repos_df = pd.DataFrame(all_repos)
    markdown_file = "all_government_repositories.md"
    save_to_markdown(repos_df, markdown_file)
    print(f"All data saved to {markdown_file}")
    # Optionally save to CSV or another format
    repos_df.to_csv("all_government_repositories.csv", index=False)
    print("Data also saved as a CSV file.")
else:
    print("No repository data collected.")

Data for argob fetched and processed.
Data for cifasis fetched and processed.
Data for gcba fetched and processed.
Data for inti-cmnb fetched and processed.
Data for municipalidad-de-vicente-lopez fetched and processed.
Data for municipioriogrande fetched and processed.
Failed to fetch data for actesa
Data for actgov fetched and processed.
Data for agnsw fetched and processed.
Data for AtlasOfLivingAustralia fetched and processed.
Data for ausdto fetched and processed.
Data for australianantarcticdatacentre fetched and processed.
Data for AustralianAntarcticDivision fetched and processed.
Data for berowrarfb fetched and processed.
Data for bom-radar fetched and processed.
Data for city-of-melbourne fetched and processed.
Data for commerce-wa-ols fetched and processed.
Data for consumerdataright fetched and processed.
Data for data61 fetched and processed.
Data for datagovau fetched and processed.
Data for dbca-wa fetched and processed.
Data for dpc-sdp fetched and processed.
Data for d

In [21]:
repos_df = pd.DataFrame(all_repos)
repos_df

Unnamed: 0,name,description,stars,forks,language,readme
0,accesibilidad-web,Repositorio del Equipo de Accesibilidad Web de...,66,19,None specified,# Accesibilidad Web\n\n**Repositorio del Equip...
1,AR-API-Gateway-Distro,No description,0,1,None specified,# AR-API-Gateway-Distro
2,AR-Distro,Distro de Drupal para Gobiernos,12,10,PHP,# AR-Distro\n\nDistribución Drupal gratuita de...
3,barra-de-accesibilidad,Plugin que incorpora opciones de accesibilidad...,5,6,PHP,"# Barra de ""Herramientas de Accesibilidad""\n# ..."
4,cofra,Sistema de rendición de caja chica,3,5,PHP,# Cofra\n\nSistema de rendición de caja chica....
...,...,...,...,...,...,...
2139,laws-lois-xml,The consolidated Acts and regulations of Canad...,13,0,XSLT,# laws-lois-xml - [Aller en français](https://...
2140,lims-xml-dtd,The xml document type definitions for the fede...,0,0,None specified,README not available
2141,ogd-office-entry-am-entree-au-bureau,COVID-19 Office Entry - Entrée au bureau,10,4,PowerShell,# COVID-19 Office Entry - Entrée au bureau\nTh...
2142,otto,No description,1,0,Python,# What is Otto\n\nOtto is a suite of AI servic...


In [47]:
repos_df10 = repos_df[:10]

In [26]:
# !pip install langchain langchain-anthropic langchain-community

python(95213) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting langchain-community
  Downloading langchain_community-0.3.2-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
Collecting pydantic-settings<3.0.0,>=2.4.0
  Downloading pydantic_settings-2.5.2-py3-none-any.whl (26 kB)
Collecting dataclasses-json<0.7,>=0.5.7
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0
  Downloading marshmallow-3.22.0-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0
  Using cached typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0
  Using cached mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, marshmallow, typing-inspect, pydantic-settings, dataclasses-json, langchain-community
Success

In [48]:
from langchain.prompts import PromptTemplate
from langchain_anthropic import ChatAnthropic
from tqdm import tqdm
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.exceptions import OutputParserException

anthropic_api_key = os.getenv('ANTHROPIC_KEY')
df = repos_df.copy()

def classify_repos(df, topic_list, model_name="claude-3-haiku-20240307"):
    llm = ChatAnthropic(model=model_name, api_key=anthropic_api_key)
    
    summary_prompt = PromptTemplate(
        input_variables=["description", "readme"],
        template="""
        Please provide a summary of the following GitHub repository based on its description and README.md content. If the README.md is not in English, please first translate it to English and then generate a summary. The summary should be concise and in fewer than 5 sentences.
        
        Repository description:
        {description}
        
        README.md content:
        {readme}
        
        Summary:
        """
    )
    
    topic_prompt = PromptTemplate(
        input_variables=["summary", "topic_list"],
        template="""
        Given the following summary of a GitHub repository and a list of potential topics, select the most appropriate topic for the repository. If none of the topics in the list are suitable, generate a new topic label.
        
        Repository summary:
        {summary}
        
        Potential topics:
        {topic_list}
        
        Selected topic:
        """
    )
    
    def generate_summary(row):
        description = row["description"] if pd.notnull(row["description"]) else ""
        readme = row["readme"] if pd.notnull(row["readme"]) else ""
        prompt = summary_prompt.format(description=description, readme=readme)
        summary = llm.invoke(prompt).content
        return summary
    
    response_schemas = [
        ResponseSchema(name="topic", description="The selected topic for the repository.")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    topic_prompt = PromptTemplate(
        input_variables=["summary", "topic_list"],
        template="""
        Given the following summary of a GitHub repository and a list of potential topics, select the most appropriate topic for the repository. If none of the topics in the list are suitable, generate a new topic label.

        Repository summary:
        {summary}

        Potential topics:
        {topic_list}

        Selected topic:
        "{{topic}}"

        PLEASE ONLY RETURN THE TOPIC LABEL AS THE RESPONSE. DO NOT INCLUDE ANY ADDITIONAL TEXT.
        """,
        output_parser=output_parser,
    )

    def classify_repo(readme):
        row = df.loc[df["readme"] == readme].iloc[0]
        summary = generate_summary(row)
        prompt = topic_prompt.format(summary=summary, topic_list=", ".join(topic_list))
        response = llm.invoke(prompt).content

        try:
            topic = output_parser.parse(response)["topic"]
        except OutputParserException:
            # Handle the case when the response is not a valid JSON
            topic = response.strip()  # Extract the topic as a plain string

        if topic not in topic_list:
            topic_list.append(topic)

        return topic
    
    tqdm.pandas(desc="Classifying repositories")
    df["topic"] = df["readme"].progress_apply(classify_repo)
    
    df["summary"] = df.apply(generate_summary, axis=1)
    
    return df, topic_list



In [49]:
display(df)

Unnamed: 0,name,description,stars,forks,language,readme
0,accesibilidad-web,Repositorio del Equipo de Accesibilidad Web de...,66,19,None specified,# Accesibilidad Web\n\n**Repositorio del Equip...
1,AR-API-Gateway-Distro,No description,0,1,None specified,# AR-API-Gateway-Distro
2,AR-Distro,Distro de Drupal para Gobiernos,12,10,PHP,# AR-Distro\n\nDistribución Drupal gratuita de...
3,barra-de-accesibilidad,Plugin que incorpora opciones de accesibilidad...,5,6,PHP,"# Barra de ""Herramientas de Accesibilidad""\n# ..."
4,cofra,Sistema de rendición de caja chica,3,5,PHP,# Cofra\n\nSistema de rendición de caja chica....
5,cuidar-android,App Cuidar,74,21,Java,# Cuidar Android\n\nSe publica aquí el código ...
6,cuidar-ios,App Cuidar,12,5,Swift,# Cuidar iOS\n\nSe publica aquí el código fuen...
7,drupal-argentina-borrador,Esta es una distribución de un componente util...,0,1,PHP,# drupal-argentina-borrador\nEsta es una distr...
8,drupal-argentina-carousel,Esta es una distribución de un componente util...,0,1,PHP,# drupal-argentina-carousel\nEsta es una distr...
9,drupal-argentina-distro,Esta es una distribución pública del portal de...,0,1,PHP,# drupal-argentina-distro\nEsta es una distrib...


In [51]:
df, topic_list = classify_repos(repos_df10, topic_list=['ai', 'web development', 'data science', 'cybersecurity'])
df
df.to_csv('all_gov_projects.csv', index=False)


Classifying repositories: 100%|██████████| 10/10 [00:17<00:00,  1.74s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["topic"] = df["readme"].progress_apply(classify_repo)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["summary"] = df.apply(generate_summary, axis=1)
