In [80]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import os
import webbrowser
import time
import json
import numpy as np
import zipfile

In [3]:
# Show up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30

In [150]:
projects = pd.read_excel("Projects_v3.xlsx", sheet_name="VCM BC projects")

# The registry sheets are read one by one (instead of being concatenated
# straight away) so that their ID columns can be harmonised to "Project ID".
sheets = {'car': 'CAR Projects', 'vcs': 'VCS Projects', 'acr': 'ACR Projects', 'gold': 'Gold Projects'}
berkeley = {
    registry: pd.read_excel(
        "Voluntary-Registry-Offsets-Database--v8-May-2023.xlsx", sheet_name=sheet_title
    ).dropna(how='all', axis=1)
    for registry, sheet_title in sheets.items()
}

# VCS: the existing "Project ID" column is discarded and "Registry ID" takes its place.
berkeley['vcs'] = (
    berkeley['vcs']
    .drop(columns=['Project ID'])
    .rename(columns={"Registry ID": "Project ID"})
)
# Gold: the identifier column is called "GS_ID".
berkeley['gold'] = berkeley['gold'].rename(columns={"GS_ID": "Project ID"})

# Stack all registries and keep only the rows that have an identifier.
berkeleyDS = pd.concat(berkeley.values()).dropna(subset=['Project ID'])

In [152]:
# Pull the numeric part out of each identifier so both tables can be matched.
# Non-registry entries may yield meaningless numbers; they are filtered later.
_NUMBER_PATTERN = re.compile(r"-?\d+\.?\d*")


def _last_number(value):
    """Return the last number found in ``value`` as text.

    Non-string values (NaN, identifiers that are already numeric) are returned
    unchanged. A string that contains no digits yields NaN instead of raising
    ``IndexError`` on the empty match list.
    """
    if not isinstance(value, str):
        return value
    matches = _NUMBER_PATTERN.findall(value)
    return matches[-1] if matches else np.nan


projects["registryID"] = projects["Registry entry"].apply(_last_number).astype("Int64")

berkeleyDS['plainID'] = berkeleyDS["Project ID"].apply(_last_number).astype("Int64")

In [153]:
# Build registry-prefixed IDs ("CAR" / "VCS" + number) so the curated list can
# be matched against the "Project ID" column of the Berkeley table.
projects['registryID'] = projects['registryID'].astype(str)

registry_entry = projects['Registry entry']
entry_is_car = registry_entry.str.contains(r'thereserve2', na=False)
entry_is_vcs = registry_entry.str.contains(r'verra', na=False)
projects.loc[entry_is_car, 'projectID'] = 'CAR' + projects['registryID']
projects.loc[entry_is_vcs, 'projectID'] = 'VCS' + projects['registryID']

# Outer merge: columns present in both tables get a "_y" suffix on the
# curated-list side and are dropped right after, together with empty columns.
projectsM = berkeleyDS.merge(
    projects, left_on="Project ID", right_on="projectID", how="outer", suffixes=('', '_y')
)
projectsM = projectsM.dropna(how='all', axis=1)
duplicated_cols = projectsM.filter(regex='_y$').columns
projectsM = projectsM.drop(columns=duplicated_cols).reset_index(drop=True)

In [52]:
# Mangrove projects that are not in the curated list are picked up by name:
# their Berkeley "Project ID" is copied into both projectID and registryID.
mask = projectsM['Project Name'].str.contains(r'mangrove', na=False, case=False)
berkeley_id_text = projectsM['Project ID'].astype(str)
for id_column in ('projectID', 'registryID'):
    projectsM.loc[mask, id_column] = berkeley_id_text

In [57]:
# Registry page of each project, derived from its Berkeley "Project ID".
# Only CAR and VCS are covered; ACR and Gold are still missing.
projectsM['plainID'] = projectsM['plainID'].astype(str)

link_prefixes = {
    'CAR': 'https://thereserve2.apx.com/mymodule/reg/TabDocuments.asp?r=111&ad=Prpt&act=update&type=PRO&aProj=pub&tablename=doc&id1=',
    'VCS': 'https://registry.verra.org/uiapi/resource/resourceSummary/',
}
for registry_tag, link_prefix in link_prefixes.items():
    tagged_rows = projectsM['Project ID'].str.contains(registry_tag, na=False)
    projectsM.loc[tagged_rows, 'projectLink'] = link_prefix + projectsM['plainID']

In [68]:
# Working subset: keep only the rows that carry a registryID.
projectsW = projectsM.loc[projectsM['registryID'].notna()].copy()

In [69]:
%%time
# Collect the link to each project's geospatial file (KML / shapefile) from its
# registry page. Only the VCS (Verra) and CAR registries are handled for now.
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given
KML_NAME = re.compile(r'\.kml', re.IGNORECASE)
SHAPE_LINK_TEXT = re.compile(r'shapefile|shape file|shape|shp|kml', re.IGNORECASE)


def _first_kml_document(data):
    """Return the first document whose name contains ".kml", or None.

    Only groups whose first value contains 'OTHER_DOCUMENTS' are searched;
    the search stops at the first match.
    """
    for group in data.get('documentGroups', []):
        if 'OTHER_DOCUMENTS' not in next(iter(group.values()), ''):
            continue
        for doc in group.get('documents', []):
            if KML_NAME.search(doc['documentName']):
                return doc
    return None


projectsW['urlDown'] = ''
projectsW['urlFileNames'] = ''

# Every distinct page is requested once; results are written to all rows that share it.
for url in projectsW['projectLink'].dropna().unique():
    if not isinstance(url, str):
        continue
    from_verra = 'verra' in url
    from_car = 'thereserve2' in url
    if not (from_verra or from_car):
        continue

    rows = projectsW['projectLink'] == url
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        if from_verra:
            data = response.json()
    except (requests.RequestException, ValueError) as err:
        # One unreachable page or malformed answer should not abort the whole run.
        print(f"Skipping {url}: {err}")
        continue

    if from_verra:
        # Longitude / latitude exactly as given in the JSON summary (when present).
        location = data.get('location') or {}
        for coordinate in ('latitude', 'longitude'):
            if coordinate in location:
                projectsW.loc[rows, coordinate] = location[coordinate]
        # The KML, if any, allows the coordinates to be computed manually later.
        kml_doc = _first_kml_document(data)
        if kml_doc is not None:
            projectsW.loc[rows, 'urlDown'] = kml_doc['uri']
            projectsW.loc[rows, 'urlFileNames'] = kml_doc['documentName']
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True, string=SHAPE_LINK_TEXT)
        if links:
            # Only the first matching link is kept.
            projectsW.loc[rows, 'urlDown'] = 'https://thereserve2.apx.com' + links[0].get('href')
            projectsW.loc[rows, 'urlFileNames'] = links[0].text

CPU times: user 1.32 s, sys: 43.4 ms, total: 1.36 s
Wall time: 33.3 s


In [75]:
# Work out which projects still need their location file downloaded.
# Blank strings (e.g. rows without a download link) are turned into NaN first.
projectsW = projectsW.replace(r'^\s*$', np.nan, regex=True)

# Create the folder if it does not exist yet; os.listdir raises
# FileNotFoundError on a missing directory.
os.makedirs("./locationFiles/", exist_ok=True)
downloaded = [
    os.path.splitext(filename)[0] for filename in os.listdir("./locationFiles/")
]
already_there = set(downloaded)  # set membership instead of scanning the list each time

# Files are named after the project ID, so a project is pending when no file
# stem matches its ID. Rows without a project ID are ignored.
downList = [
    project
    for project in projectsW['projectID'].dropna().unique()
    if project not in already_there
]

In [76]:
# Download each geospatial file and store it as ./locationFiles/<projectID><extension>.
DOWNLOAD_DIR = './locationFiles/'
DOWNLOAD_TIMEOUT = 60  # seconds; requests waits indefinitely when no timeout is given

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# One download per distinct URL; the first row listing a URL provides the
# project ID and the remote file name.
pending = projectsW.dropna(subset=['urlDown']).drop_duplicates(subset='urlDown')
for project_id, url, remote_name in zip(
    pending['projectID'].astype(str),
    pending['urlDown'],
    pending['urlFileNames'].astype(str),
):
    if not isinstance(url, str) or project_id not in downList:
        continue

    # The extension keeps its leading dot (".kml", ".zip"), so it is simply
    # appended to the project ID, zip archives included.
    ext = os.path.splitext(remote_name.strip())[1]
    if not ext:
        print(f"Skipping {project_id}: no file extension in {remote_name!r}")
        continue

    target = os.path.join(DOWNLOAD_DIR, project_id + ext)
    partial = target + '.part'  # written first so an aborted download never looks complete
    try:
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(1024):  # stream in 1 KB packets
                    f.write(chunk)
        os.replace(partial, target)
    except (requests.RequestException, OSError) as err:
        # Report and carry on with the next project instead of hiding the failure.
        print(f"Download failed for {project_id} ({url}): {err}")
        if os.path.exists(partial):
            os.remove(partial)
    time.sleep(2)  # pause between requests to the registry

In [78]:
# Persist the working table, leaving out columns that are entirely empty.
projectsW = projectsW.dropna(axis='columns', how='all')
projectsW.to_csv('projectsW.csv', index=False)

In [123]:
# Load the working table from 'projectsW.csv' in the current directory.
projectsW = pd.read_csv('projectsW.csv')

## To test
Zip files are not handled yet: the folder and the files inside each archive still need to be renamed after the project, keeping each file's extension.

In [None]:
def rename_zip_contents(source_zip_path, target_zip_path, new_name=None):
    """Copy a zip archive, renaming the folder and every file inside it.

    Each file of ``source_zip_path`` is stored in ``target_zip_path`` as
    ``<new_name>/<new_name><original extension>``. Members are copied straight
    from one archive to the other, so no temporary directory is created and
    nothing has to be cleaned up afterwards.

    Parameters
    ----------
    source_zip_path : str
        Archive to read.
    target_zip_path : str
        Archive to create (overwritten if it exists).
    new_name : str, optional
        Name given to the folder and the files. Defaults to the file name of
        ``target_zip_path`` without its extension.

    Returns
    -------
    list of str
        The member names written to the target archive.

    Raises
    ------
    ValueError
        If two files share an extension and would therefore get the same name.
        The check runs before the target archive is created.
    """
    if new_name is None:
        # Named after the archive itself, not after any scratch location.
        new_name = os.path.splitext(os.path.basename(target_zip_path))[0]

    with zipfile.ZipFile(source_zip_path, 'r') as source_zip:
        renamed = {}
        for info in source_zip.infolist():
            if info.is_dir():
                continue
            extension = os.path.splitext(info.filename)[1]
            arcname = f"{new_name}/{new_name}{extension}"
            if arcname in renamed:
                raise ValueError(
                    f"{info.filename!r} and {renamed[arcname].filename!r} "
                    f"would both be renamed to {arcname!r}"
                )
            renamed[arcname] = info

        with zipfile.ZipFile(target_zip_path, 'w') as target_zip:
            for arcname, info in renamed.items():
                target_zip.writestr(arcname, source_zip.read(info))

    return list(renamed)


source_zip_path = './locationFiles/CAR1429.zip'
target_zip_path = './locationFiles/CAR1429test.zip'

if os.path.exists(source_zip_path):
    rename_zip_contents(source_zip_path, target_zip_path)
    print("Zip file renamed and files inside renamed successfully!")
else:
    print(f"Nothing to do: {source_zip_path} not found")
