In [1]:
# Imports: pandas, re and StringIO parse the markdown tables, requests downloads them, os handles paths
import pandas as pd
import re
from io import StringIO
import requests
import os

In [2]:
def download_file(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address fetched with an HTTP GET request.
        save_path: Destination file path; the body is written in binary mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        True when the file was downloaded and written, False when the request
        failed. Request failures are printed instead of raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into an exception
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return False

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {save_path}")
    return True

In [3]:
def read_every_third_line(file_path):
    """Return the stripped lines found at 0-based positions 0, 3, 6, ... of a text file."""
    with open(file_path, 'r') as handle:
        return [
            text.strip()
            for position, text in enumerate(handle)
            if position % 3 == 0  # first line of every group of three
        ]

In [4]:
# read tables in markdown files and append to dataframe
def read_markdown_table(file_path):
    with open(file_path, 'r', encoding="UTF-8") as file:
        content = file.read()
        # Find the table in the markdown file
        # replace everything before the ---\n\n
        content = re.sub(r'^(.*?)(\n---\n\n)', '', content, flags=re.DOTALL)

        df = pd.read_csv(StringIO(content), sep='|', skipinitialspace=True)
        # drop columns with Unnamed in column name
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        # replace all ** in col names with empty string
        df.columns = df.columns.str.replace('**', '', regex=False)
        # replace all leading and trailing spaces in col names with empty string
        df.columns = df.columns.str.strip()
        # remove row 2
        df = df.drop(df.index[0])
        # remove all leading and trailing spaces in the dataframe
        df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
        # rename columns to lowercase
        df.columns = df.columns.str.lower()
        # append file name without extension to the dataframe as new column
        file_name = os.path.basename(file_path)
        file_name = os.path.splitext(file_name)[0]
        df['deployment_type'] = file_name
        # transform columns to rows except for the first column
        df = df.melt(id_vars=["region", "deployment_type"], var_name='model_full', value_name='is_available')
        # replace ✅ with True
        df['is_available'] = df['is_available'].replace({'✅': True, '-': False})

        # extract model from model_full by using everything before the first comma
        df['model'] = df['model_full'].str.extract(r'^(.*?),')[0]

        # extract version from model_full by using everything after the first comma
        df['version'] = df['model_full'].str.extract(r',\s*(.*)')[0]

        # remove leading and trailing spaces in model and version columns
        df['model'] = df['model'].str.strip()
        df['version'] = df['version'].str.strip()

        return df
    return None

In [None]:
# Build the list of markdown files to process from a local listing file.
# If the listing is out of date, replace its contents by copy-pasting the file table from:
# https://github.com/MicrosoftDocs/azure-ai-docs/tree/main/articles/ai-services/openai/includes/model-matrix
file_path = 'model_matrix_file_listing.txt'
# Only lines 0, 3, 6, ... of the listing are kept.
# NOTE(review): presumably each pasted entry spans three lines with the file name first - confirm.
deployment_types = read_every_third_line(file_path)
# Expected result, something like this:
# ['datazone-provisioned-managed.md', 'datazone-standard.md', 'global-batch-datazone.md', 'global-batch.md', 'provisioned-global.md', 'provisioned-models.md', 'quota.md', 'standard-audio.md', 'standard-chat-completions.md', 'standard-completions.md', 'standard-embeddings.md', 'standard-global.md', 'standard-gpt-35-turbo.md', 'standard-gpt-4.md', 'standard-image-generation.md', 'standard-models.md']
print(deployment_types)

['datazone-provisioned-managed.md', 'datazone-standard.md', 'global-batch-datazone.md', 'global-batch.md', 'provisioned-global.md', 'provisioned-models.md', 'quota.md', 'standard-audio.md', 'standard-chat-completions.md', 'standard-completions.md', 'standard-embeddings.md', 'standard-global.md', 'standard-gpt-35-turbo.md', 'standard-gpt-4.md', 'standard-image-generation.md', 'standard-models.md']


In [6]:

url_prefix = "https://raw.githubusercontent.com/MicrosoftDocs/azure-ai-docs/refs/heads/main/articles/ai-services/openai/includes/model-matrix/"
# Directory that receives the downloaded markdown files.
download_dir = 'downloaded_files'
# Create the target directory once up front rather than on every iteration.
os.makedirs(download_dir, exist_ok=True)

# One parsed DataFrame per successfully processed markdown file.
all_dataframes = []

for file_name in deployment_types:
    url = url_prefix + file_name
    save_path = os.path.join(download_dir, file_name)
    download_file(url, save_path)

    # Quota files are downloaded but not parsed into the availability matrix.
    if "quota" in file_name:
        continue

    # download_file prints request failures instead of raising, so the file may
    # be absent here; skip it rather than abort the run with FileNotFoundError.
    if not os.path.exists(save_path):
        print(f"Skipping {file_name}: no downloaded file to parse")
        continue

    one_df = read_markdown_table(save_path)
    if one_df is not None:
        all_dataframes.append(one_df)

# Combine all dataframes into a single dataframe
df = pd.concat(all_dataframes, ignore_index=True)

Downloaded: downloaded_files\datazone-provisioned-managed.md
Downloaded: downloaded_files\datazone-standard.md
Downloaded: downloaded_files\global-batch-datazone.md
Downloaded: downloaded_files\global-batch.md


  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})


Downloaded: downloaded_files\provisioned-global.md
Downloaded: downloaded_files\provisioned-models.md
Downloaded: downloaded_files\quota.md
Downloaded: downloaded_files\standard-audio.md


  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})


Downloaded: downloaded_files\standard-chat-completions.md
Downloaded: downloaded_files\standard-completions.md
Downloaded: downloaded_files\standard-embeddings.md
Downloaded: downloaded_files\standard-global.md


  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})


Downloaded: downloaded_files\standard-gpt-35-turbo.md
Downloaded: downloaded_files\standard-gpt-4.md
Downloaded: downloaded_files\standard-image-generation.md
Downloaded: downloaded_files\standard-models.md


  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})
  df['is_available'] = df['is_available'].replace({'✅': True, '-': False})


In [7]:
# Regions flagged as GDPR regions when the "gdpr_region" column is computed.
# Disabled entries (commented out, so not flagged):
#   'norwayeast', 'switzerlandnorth', 'uksouth', 'switzerlandwest'
gdpr_regions = [
    'francecentral',
    'germanywestcentral',
    'polandcentral',
    'spaincentral',
    'swedencentral',
    'westeurope',
    'italynorth',
]

In [8]:
# Derive the three flag columns in one pass; each lambda sees the columns
# created by the ones before it. A row is usable when its region is a GDPR
# region, its deployment type is not a "global" one, and the model is available.
# Rows are then ordered with usable ones first, followed by region, model and
# deployment type in ascending order.
df = df.assign(
    gdpr_region=lambda frame: frame['region'].isin(gdpr_regions),
    global_deployment=lambda frame: frame['deployment_type'].str.contains("global"),
    usable_in_gdpr=lambda frame: frame["gdpr_region"] & ~frame["global_deployment"] & frame["is_available"],
).sort_values(
    by=["usable_in_gdpr", "region", "model_full", "deployment_type"],
    ascending=[False, True, True, True],
)

# Persist the full matrix without the row index.
df.to_csv("azure_openai_service_model_matrix.csv", index=False)

In [9]:

# Rows flagged as usable in a GDPR region, and the distinct models among them
models_in_gdpr_df = df.loc[df["usable_in_gdpr"]]
print(models_in_gdpr_df["model"].unique())


['gpt-35-turbo' 'gpt-35-turbo-16k' 'gpt-4' 'gpt-4-32k' 'gpt-4o'
 'gpt-4o-mini' 'o1' 'o3-mini' 'text-embedding-3-large'
 'text-embedding-ada-002' 'dall-e-3' 'gpt-35-turbo-instruct' 'o1-mini'
 'o1-preview' 'tts' 'tts-hd' 'whisper']


In [10]:
# Rows whose model never appears among the GDPR-usable rows, and the distinct models among them
models_not_in_gdpr_df = df.loc[~df["model"].isin(models_in_gdpr_df["model"])]
print(models_not_in_gdpr_df["model"].unique())

['gpt-4.1' 'gpt-4.5-preview' 'gpt-4o-audio-preview'
 'gpt-4o-mini-audio-preview' 'gpt-4o-mini-realtime-preview'
 'gpt-4o-realtime-preview' 'text-embedding-3-small']


In [11]:
# TODO: Currently only Azure OpenAI is included. Add Azure AI Service models 

In [12]:
# Parse the quota file separately; the download loop skips parsing files with "quota" in their name.
# NOTE(review): read_markdown_table requires a "region" column - confirm quota.md's table has that layout.
quota_df = read_markdown_table("downloaded_files/quota.md")