# Financial Risk Profile RAG v6 (Mistral Model)

Attempting to use markdown loader to chunk based on headings

In [2]:
pip install langchain langchain_community langchain_core langchain_openai faiss-cpu sentence-transformers

Collecting langchain
  Downloading langchain-0.1.10-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.2/806.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community
  Downloading langchain_community-0.0.25-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_core
  Downloading langchain_core-0.1.28-py3-none-any.whl (252 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.4/252.4 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_openai
  Downloading langchain_openai-0.0.8-py3-none-any.whl (32 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting senten

In [3]:
import pandas as pd
import numpy as np
import json
import re
from tqdm import tqdm
from operator import itemgetter
import pickle
import string
import time

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

In [4]:
import os
from google.colab import userdata
# Copy the API credentials from Colab's `userdata` store into the environment variables
# read by the OpenAI and Hugging Face Hub clients used below.
# NOTE(review): the Hugging Face token is stored under the secret name "HF_TOKEN2".
os.environ['OPENAI_API_KEY'] = userdata.get("OPENAI_API_KEY")
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get("HF_TOKEN2")

In [5]:
from google.colab import drive
# Mount Google Drive; every data, knowledge-base and output path below lives under /content/drive/.
drive.mount('/content/drive/')

Mounted at /content/drive/


## Step 0: Load data

In [6]:
def choose_volatility_table(cicra_rating, industry):
    """Pick which benchmark volatility table ('Low', 'Medial' or 'Standard') applies to a company.

    Args:
        cicra_rating: CICRA rating; anything accepted by ``int()`` (e.g. ``3`` or ``'3'``).
        industry: Sector name. Matching is case- and surrounding-whitespace-insensitive, so both
            'Utilities' and 'UTILITIES' resolve to the same entry. Values that match no listed
            industry (including missing values) fall back to the 'Standard' industry level.

    Returns:
        One of the strings 'Low', 'Medial' or 'Standard'.

    Raises:
        ValueError / TypeError: if ``cicra_rating`` cannot be converted with ``int()``.
    """
    # Mapping of industries to their volatility levels based on CICRA rating and industry-specific guidance
    industry_volatility_map = {
        'Standard': ['Technology', 'Aerospace & Defense', 'Automobiles & Components', 'Capital Goods', 'Consumer Durables & Apparel', 'Materials', 'Semiconductors', 'Software & Services'],
        'Medium': ['Biotech', 'Energy', 'Chemicals', 'Commercial & Professional Services', 'Consumer Services', 'Media & Entertainment', 'Pharmaceuticals', 'Retailing'],
        'Low': ['Utilities', 'Telecommunications', 'Food & Beverages', 'Healthcare Equipment', 'Banks', 'Insurance', 'Real Estate', 'Transportation']
    }

    rating = int(cicra_rating)

    # The sector column in the data is upper-case (e.g. 'AEROSPACE & DEFENSE') while the map is
    # title-case, so compare on a case-folded form; an exact-case lookup would never match.
    normalized_industry = str(industry).strip().casefold()

    industry_volatility = 'Standard'  # Default for industries not explicitly mentioned
    for level, industries in industry_volatility_map.items():
        if normalized_industry in {name.casefold() for name in industries}:
            industry_volatility = level
            break

    # Adjust for CICRA rating.
    # NOTE(review): a 'Medium' industry only changes the outcome when the rating is 1
    # (it yields 'Medial' instead of 'Low'); for ratings >= 3 it still ends up 'Standard'.
    # Confirm this is the intended rule.
    if rating == 1 or industry_volatility == 'Low':
        return 'Medial' if industry_volatility == 'Medium' else 'Low'
    if rating == 2:
        return 'Medial'
    return 'Standard'

In [7]:
def map_metrics_to_risk_categories(volatility_level, metrics):
    """Classify each financial metric into a risk category using a volatility benchmark table.

    Args:
        volatility_level: 'Standard', 'Medial' or 'Low' — selects the threshold table.
        metrics: Mapping of metric name -> value. Values may be floats, anything ``float()``
            accepts, or strings containing '%' (the sign is stripped and the number divided
            by 100). The four ``*_to_debt`` metrics are expected as fractions (0.18 == 18%)
            and are multiplied by 100 before being compared with the table.

    Returns:
        ``'Invalid volatility level'`` (a string) when ``volatility_level`` is not a known table;
        otherwise a dict of metric name -> category label. A metric gets ``'Invalid value'`` when
        its value cannot be converted to a number (including ``None``), and ``'Unknown'`` when the
        metric has no thresholds or no band contains the value (e.g. NaN).
    """
    # Each band is (lower, upper): lower bound inclusive, upper bound exclusive, None = unbounded.
    # NOTE(review): the (0, 0) bands in the 'Low' table can never match; confirm against the source table.
    thresholds = {
        'Standard': {
            'ffo_to_debt': {'Minimal Risk [1]': (60, None), 'Modest Risk [2]': (45, 60), 'Intermediate Risk [3]': (30, 45), 'Significant Risk [4]': (20, 30), 'Aggressive Risk [5]': (12, 20), 'Highly Leveraged [6]': (None, 12)},
            'debt_to_ebitda': {'Minimal Risk [1]': (None, 1.5), 'Modest Risk [2]': (1.5, 2), 'Intermediate Risk [3]': (2, 3), 'Significant Risk [4]': (3, 4), 'Aggressive Risk [5]': (4, 5), 'Highly Leveraged [6]': (5, None)},
            'ffo_cash_interest_cover': {'Minimal Risk [1]': (13, None), 'Modest Risk [2]': (9, 13), 'Intermediate Risk [3]': (6, 9), 'Significant Risk [4]': (4, 6), 'Aggressive Risk [5]': (2, 4), 'Highly Leveraged [6]': (None, 2)},
            'ebitda_to_interest': {'Minimal Risk [1]': (15, None), 'Modest Risk [2]': (10, 15), 'Intermediate Risk [3]': (6, 10), 'Significant Risk [4]': (3, 6), 'Aggressive Risk [5]': (2, 3), 'Highly Leveraged [6]': (None, 2)},
            'cfo_to_debt': {'Minimal Risk [1]': (50, None), 'Modest Risk [2]': (35, 50), 'Intermediate Risk [3]': (25, 35), 'Significant Risk [4]': (15, 25), 'Aggressive Risk [5]': (10, 15), 'Highly Leveraged [6]': (None, 10)},
            'focf_to_debt': {'Minimal Risk [1]': (40, None), 'Modest Risk [2]': (25, 40), 'Intermediate Risk [3]': (15, 25), 'Significant Risk [4]': (10, 15), 'Aggressive Risk [5]': (5, 10), 'Highly Leveraged [6]': (None, 5)},
            'dcf_to_debt': {'Minimal Risk [1]': (25, None), 'Modest Risk [2]': (15, 25), 'Intermediate Risk [3]': (10, 15), 'Significant Risk [4]': (5, 10), 'Aggressive Risk [5]': (2, 5), 'Highly Leveraged [6]': (None, 2)},
        },
        'Medial': {
            'ffo_to_debt': {'Minimal Risk [1]': (50, None), 'Modest Risk [2]': (35, 50), 'Intermediate Risk [3]': (23, 35), 'Significant Risk [4]': (13, 23), 'Aggressive Risk [5]': (9, 13), 'Highly Leveraged [6]': (None, 9)},
            'debt_to_ebitda': {'Minimal Risk [1]': (None, 1.75), 'Modest Risk [2]': (1.75, 2.5), 'Intermediate Risk [3]': (2.5, 3.5), 'Significant Risk [4]': (3.5, 4.5), 'Aggressive Risk [5]': (4.5, 5.5), 'Highly Leveraged [6]': (5.5, None)},
            'ffo_cash_interest_cover': {'Minimal Risk [1]': (14, None), 'Modest Risk [2]': (9, 14), 'Intermediate Risk [3]': (5, 9), 'Significant Risk [4]': (2.75, 5), 'Aggressive Risk [5]': (1.75, 2.75), 'Highly Leveraged [6]': (None, 1.75)},
            'ebitda_to_interest': {'Minimal Risk [1]': (14, None), 'Modest Risk [2]': (9, 14), 'Intermediate Risk [3]': (5, 9), 'Significant Risk [4]': (2.75, 5), 'Aggressive Risk [5]': (1.75, 2.75), 'Highly Leveraged [6]': (None, 1.75)},
            'cfo_to_debt': {'Minimal Risk [1]': (40, None), 'Modest Risk [2]': (27.5, 40), 'Intermediate Risk [3]': (18.5, 27.5), 'Significant Risk [4]': (10.5, 18.5), 'Aggressive Risk [5]': (7, 10.5), 'Highly Leveraged [6]': (None, 7)},
            'focf_to_debt': {'Minimal Risk [1]': (30, None), 'Modest Risk [2]': (17.5, 30), 'Intermediate Risk [3]': (9.5, 17.5), 'Significant Risk [4]': (5, 9.5), 'Aggressive Risk [5]': (0, 5), 'Highly Leveraged [6]': (None, 0)},
            'dcf_to_debt': {'Minimal Risk [1]': (18, None), 'Modest Risk [2]': (11, 18), 'Intermediate Risk [3]': (6.5, 11), 'Significant Risk [4]': (2.5, 6.5), 'Aggressive Risk [5]': (0, 2.5), 'Highly Leveraged [6]': (None, 0)},
        },
        'Low': {
            'ffo_to_debt': {'Minimal Risk [1]': (35, None), 'Modest Risk [2]': (23, 35), 'Intermediate Risk [3]': (13, 23), 'Significant Risk [4]': (9, 13), 'Aggressive Risk [5]': (6, 9), 'Highly Leveraged [6]': (None, 6)},
            'debt_to_ebitda': {'Minimal Risk [1]': (None, 2), 'Modest Risk [2]': (2, 3), 'Intermediate Risk [3]': (3, 4), 'Significant Risk [4]': (4, 5), 'Aggressive Risk [5]': (5, 6), 'Highly Leveraged [6]': (6, None)},
            'ffo_cash_interest_cover': {'Minimal Risk [1]': (8, None), 'Modest Risk [2]': (5, 8), 'Intermediate Risk [3]': (3, 5), 'Significant Risk [4]': (2, 3), 'Aggressive Risk [5]': (1.5, 2), 'Highly Leveraged [6]': (None, 1.5)},
            'ebitda_to_interest': {'Minimal Risk [1]': (13, None), 'Modest Risk [2]': (7, 13), 'Intermediate Risk [3]': (4, 7), 'Significant Risk [4]': (2.5, 4), 'Aggressive Risk [5]': (1.5, 2.5), 'Highly Leveraged [6]': (None, 1.5)},
            'cfo_to_debt': {'Minimal Risk [1]': (30, None), 'Modest Risk [2]': (20, 30), 'Intermediate Risk [3]': (12, 20), 'Significant Risk [4]': (8, 12), 'Aggressive Risk [5]': (5, 8), 'Highly Leveraged [6]': (None, 5)},
            'focf_to_debt': {'Minimal Risk [1]': (20, None), 'Modest Risk [2]': (10, 20), 'Intermediate Risk [3]': (4, 10), 'Significant Risk [4]': (0, 4), 'Aggressive Risk [5]': (0, 0), 'Highly Leveraged [6]': (None, 0)},
            'dcf_to_debt': {'Minimal Risk [1]': (11, None), 'Modest Risk [2]': (7, 11), 'Intermediate Risk [3]': (3, 7), 'Significant Risk [4]': (0, 3), 'Aggressive Risk [5]': (0, 0), 'Highly Leveraged [6]': (None, 0)},
        },
    }

    if volatility_level not in thresholds:
        return 'Invalid volatility level'

    selected_thresholds = thresholds[volatility_level]
    # Metrics stored as fractions in the data but expressed in percent in the tables.
    percent_metrics = ('ffo_to_debt', 'cfo_to_debt', 'focf_to_debt', 'dcf_to_debt')
    risk_categories = {}

    for metric, value in metrics.items():
        try:
            if isinstance(value, str) and '%' in value:
                # "45%" -> 0.45
                value = float(value.strip('%')) / 100
            elif not isinstance(value, float):
                value = float(value)
        except (TypeError, ValueError):
            # TypeError covers None (a missing column), ValueError covers unparsable text.
            risk_categories[metric] = "Invalid value"
            continue

        # Adjust for percentages
        if metric in percent_metrics:
            value = value * 100

        risk_category = "Unknown"  # Default when no band matches
        for category, (lower, upper) in selected_thresholds.get(metric, {}).items():
            if (lower is None or value >= lower) and (upper is None or value < upper):
                risk_category = category
                break  # First matching band wins

        risk_categories[metric] = risk_category

    return risk_categories

In [8]:
def calculate_risk_categories(row):
    """Score one company row: gather its LTM ratios and classify them with the row's volatility table.

    `row` is indexed by column name (a DataFrame row when used with ``df.apply(..., axis=1)``).
    Every ratio column is required except 'ffo_interest_coverage_ltm', which is looked up with
    ``.get`` and passed on as None when absent. The result is whatever
    ``map_metrics_to_risk_categories`` returns for the row's 'volatility' value.
    """
    ratio_values = dict(
        ffo_to_debt=row['ffo_to_debt_ltm'],
        debt_to_ebitda=row['debt_to_ebitda_ltm'],
        # The interest-coverage column is assumed to correspond to ffo_cash_interest_cover.
        ffo_cash_interest_cover=row.get('ffo_interest_coverage_ltm', None),
        ebitda_to_interest=row['ebitda_to_interest_ltm'],
        cfo_to_debt=row['cfo_to_debt_ltm'],
        focf_to_debt=row['focf_to_debt_ltm'],
        dcf_to_debt=row['dcf_to_debt_ltm'],
        # Further metric -> column mappings can be added here.
    )
    return map_metrics_to_risk_categories(row['volatility'], ratio_values)


In [9]:
# Load the cleaned financial-risk-profile data and derive the per-company volatility table
# and per-metric risk categories.
df = pd.read_csv("/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/Financial Data/CLEAN_merged_frp_data_v2.csv")
# Keep only the first run of digits in the rating. The pattern is a raw string so `\d` is not
# treated as an (invalid) string escape, and expand=False yields a Series rather than a
# one-column DataFrame. astype(int) raises if any rating contains no digits.
df['circa_rating'] = df['circa_rating'].str.extract(r'(\d+)', expand=False).astype(int)
df['volatility'] = df.apply(lambda row: choose_volatility_table(row['circa_rating'], row['sector']), axis=1)
df['Risk_Categories'] = df.apply(calculate_risk_categories, axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,company_name,sector,financial_risk_profile,circa_rating,business_description,ffo_to_debt_ltm,debt_to_ebitda_ltm,cfo_to_debt_ltm,focf_to_debt_ltm,dcf_to_debt_ltm,ffo_interest_coverage_ltm,ebitda_to_interest_ltm,ebit_margin_ltm,ebitda_margin_ltm,return_on_capital_ltm,volatility,Risk_Categories
0,0,AAR Corp.,AEROSPACE & DEFENSE,[3] Intermediate,3,AAR Corp. provides products and services to co...,0.18,1.31,0.23,0.11,0.11,2.93,10.3,7.25,8.7,7.05,Standard,"{'ffo_to_debt': 'Aggressive Risk [5]', 'debt_t..."
1,1,Boeing Co.,AEROSPACE & DEFENSE,[4] Significant,3,"The Boeing Company, together with its subsidia...",0.11,10.6,0.16,0.12,0.13,2.42,1.47,1.66,4.05,2.03,Standard,"{'ffo_to_debt': 'Highly Leveraged [6]', 'debt_..."
2,2,BWX Technologies Inc.,AEROSPACE & DEFENSE,[4] Significant,3,"BWX Technologies, Inc., together with its subs...",0.19,3.44,0.2,0.07,0.14,5.35,8.05,12.2,15.3,8.53,Standard,"{'ffo_to_debt': 'Aggressive Risk [5]', 'debt_t..."
3,3,CACI International Inc.,AEROSPACE & DEFENSE,[3] Intermediate,3,"CACI International Inc, together with its subs...",0.15,2.42,0.17,0.13,0.36,3.2,8.06,8.06,10.0,6.87,Standard,"{'ffo_to_debt': 'Aggressive Risk [5]', 'debt_t..."
4,4,General Dynamics Corp.,AEROSPACE & DEFENSE,[3] Intermediate,3,General Dynamics Corporation operates as an ae...,0.42,1.84,0.51,0.41,0.62,11.8,12.5,8.77,10.8,7.35,Standard,"{'ffo_to_debt': 'Intermediate Risk [3]', 'debt..."


### Step 1: Set up RAG vector store

In [10]:
# Set up markdown splitter: chunk each knowledge-base file on its markdown headings (levels 1-5),
# keeping the heading text inside the chunk (strip_headers=False).
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("#####", "Header 5"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers = False)

# Knowledge-base documents (markdown-formatted .txt files)
documents = [
    # general criteria
    '/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/general_criteria_markdown_v2.txt',
    # sector-specific guidance
    '/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/sector_specific_guidance_markdown_v1.txt',
    # ratios and adjustments
    '/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/ratios_and_adjustments.txt',
    # standard volatility table
    '/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/Standard Volatility Table.txt',
    # medial volatility table
    '/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/Medial Volatility Table.txt',
    # low volatility table
    '/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/Low Volatility Table.txt'
]

# STEP 1: SPLIT ON MARKDOWN HEADERS
md_header_splits = []

for doc_path in documents:
  # Context manager closes the handle deterministically; explicit encoding avoids
  # depending on the runtime's locale default.
  with open(doc_path, 'r', encoding='utf-8') as doc_file:
    doc_text = doc_file.read()
  md_header_splits += markdown_splitter.split_text(doc_text)

# STEP 2: ADD RECURSIVE CHARACTER SPLITTER ON PARAGRAPHS/SENTENCES
# Default splitter settings are used, so only header sections longer than the default
# chunk size are split further.
text_splitter = RecursiveCharacterTextSplitter()

final_doc_splits = text_splitter.split_documents(md_header_splits)
len(final_doc_splits)

195

In [11]:
# Spot-check a single chunk to confirm the header metadata and heading text were preserved.
final_doc_splits[94]

Document(page_content="### **The Railroad and Package Express Industry Supplemental Ratios**  \nThe Railroad and Package Express Industry Supplemental Ratios: In addition to our analysis of a company's core ratios, we also consider supplemental ratios in order to develop a fuller understanding of a company's credit risk profile and fine tune our cash flow analysis. In our view, a railroad or package express company's inability to meet cash interest payments or a debt maturity would be the most likely cause of a cash default during an industry downturn. Therefore, we consider as supplemental ratios:  \n1. Coverage ratios (FFO + cash interest)/cash interest and EBITDA/interest; and\n2. FOCF to debt (this captures the capital intensity of railroads in particular).", metadata={'Header 2': 'Sector-Specific Guidance', 'Header 3': '**The Railroad and Package Express Industry Supplemental Ratios**'})

## Step 1: Create Vector database and set up retriever

In [12]:
# The FAISS index is loaded from Drive rather than rebuilt on every run.
# To rebuild it from `final_doc_splits`, run once:
# vectorstore = FAISS.from_documents(final_doc_splits, embedding=OpenAIEmbeddings(model = 'text-embedding-ada-002'))
# vectorstore.save_local("/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/vectorstore")

# The query embeddings must come from the same model the index was built with.
# NOTE(review): FAISS.load_local reads a pickled docstore — only load indexes you created yourself.
vectorstore_dir = "/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/RAG Knowledge Base/Markdown RAG/vectorstore"
query_embeddings = OpenAIEmbeddings(model = 'text-embedding-ada-002')
vectorstore = FAISS.load_local(vectorstore_dir, query_embeddings)
retriever = vectorstore.as_retriever()

### Step 1B: Set up multi-prompt structure

In [44]:
# Smoke test of the retrieval chain on one hand-written question (not part of the scoring run).
# Relies on `retriever` from the vector-store cell above.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the call was
# cancelled before it returned.
# retriever = vectorstore.as_retriever()

# Mistral instruct format: the whole user turn is wrapped in <s>[INST] ... [/INST].
template = """<s>[INST]Use the following context to help you answer the question below:
{context}

Question: {question}[/INST]
"""
prompt = ChatPromptTemplate.from_template(template)

# Model used for the smoke test (`model` is re-created with different settings in the scoring cell).
model = HuggingFaceHub(
    # repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 5000,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

# The raw question string is sent both to the retriever (to fill {context}) and,
# unchanged, into {question}.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
question = "Summarize the sector-specific guidance for the technology software industry supplemental ratios to consider in 2 sentences."
chain.invoke(question)

KeyboardInterrupt: 

## Step 2: Create prompt template

In [13]:
def create_prompt1(row):
  """Return the STEP 1 task text asking for a summary of the sector-specific guidance.

  The row's 'sector' value is title-cased before being inserted into the text.
  """
  sector_title = row['sector'].title()
  step1_text = f"""
  STEP 1: In a few sentences, briefly summarize the sector-specific guidance about which supplemental and core metrics are considered important for the {sector_title} industry. If the {sector_title} industry is not mentioned, use the industry and business description to find the most similar sector.
  """
  return step1_text

def create_prompt2(row):
  """Return the STEP 2 task text: aggregate the core and supplemental ratios into a 1-6 risk profile.

  The row's 'sector' value is inserted as-is (no case normalisation).
  """
  sector = row['sector']
  step2_text = f"""
  STEP 2: Follow the general corporate methodology and the procedure in the industry-specific guidance to aggregate the key financial ratios and determine the financial risk (1-6). Pay attention to the industry-specific guidance about which metrics are most important for the {sector} industry.
  STEP 2A: First, compare the core ratios (FFO to debt and debt to EBITDA) to the ratio ranges in the relevant benchmark table. If the core ratios result in different cash flow/leverage assessments, select the relevant core ratio based on the industry-specific guidance or whichever the best indicator of a company's future leverage.
  STEP 2B: Next, consider the supplementary ratios. Follow the industry-specific guidance. If the cash flow/leverage assessment(s) indicated by the important supplemental ratio(s) differs from the preliminary cash flow/leverage assessment, we might adjust the preliminary cash flow/leverage assessment by one category in the direction of the cash flow/leverage assessment indicated by the supplemental ratio(s) to derive the adjusted cash flow/leverage assessment. We will make this adjustment if, in our view, the supplemental ratio provides the best indicator of a company's future leverage. If there is more than one important supplemental ratio and they result in different directional deviations from the preliminary cash flow/leverage assessment, we will select one as the relevant supplemental ratio based on which, in our opinion, provides the best indicator of a company's future leverage. We will then make the adjustment outlined above if the selected supplemental ratio differs from the preliminary cash flow/leverage assessment and the selected supplemental ratio provides the best overall indicator of a company's future leverage.
  Your final answer should be a single risk profile from 1 to 6.
  """
  return step2_text

def json_final_answer_prompt(row):
  """Return the fixed final-step instruction asking for a {'reasoning', 'answer'} object.

  `row` is accepted so the signature matches the other prompt builders; it is not used.
  """
  final_instructions = """For your final answer, return a JSON object with 'reasoning' and 'answer' fields.

  Your response should be in the form {'reasoning': <Methodological explanation as to why you chose that specific rating>, 'answer': <A single category from 1-6> }"""
  return final_instructions

### Step 3: Run model on a single company

In [14]:
def extract_json_and_answer(input_str):
    """Pull the 'answer' field out of the first-'{'-to-last-'}' span of a model response.

    The span is parsed as strict JSON first. Because the final-step prompt shows the expected
    shape with single quotes, models often reply with a Python-style dict; that form is
    accepted too via ``ast.literal_eval`` (which only evaluates literals, never code).

    Args:
        input_str: Raw model output.

    Returns:
        The value of the 'answer' key, or 'No answer found' when the object has no such key;
        -1 when a brace-delimited span exists but cannot be parsed into an object;
        -2 when the text contains no brace-delimited span at all.
    """
    import ast  # local import keeps this cell self-contained

    # Greedy + DOTALL: spans from the first '{' to the last '}', across newlines.
    json_str_match = re.search(r'\{.*\}', input_str, re.DOTALL)
    if not json_str_match:
        return -2  # No JSON object found

    json_str = json_str_match.group(0)

    try:
        json_obj = json.loads(json_str)
    except json.JSONDecodeError:
        try:
            # Fallback for single-quoted, Python-literal style objects.
            json_obj = ast.literal_eval(json_str)
        except (ValueError, SyntaxError, TypeError):
            return -1  # Error parsing JSON

    if not isinstance(json_obj, dict):
        # e.g. a set literal such as {'a', 'b'} — not an answer object.
        return -1

    return json_obj.get('answer', 'No answer found')

In [15]:
def mistral_extract_output(text):
  """Return the part of a Mistral completion that follows the last instruction-closing marker.

  The endpoint can echo the prompt, so everything up to and including the final "[/INST]" is
  dropped. The variants "[\\INST]" (backslash) and "[//INST]" are handled the same way, applied
  in that order. Text containing none of the markers is returned unchanged.
  """
  # The backslash is escaped explicitly: a bare "\I" is an invalid escape sequence.
  closing_markers = ("[/INST]", "[\\INST]", "[//INST]")
  response = text
  for marker in closing_markers:
    response = response.split(marker)[-1]
  return response

In [48]:
# Prepare df for training run
# Add empty result columns to `df`: `y_pred` for the predicted risk category and
# `reasoning` for the concatenated model outputs.
# NOTE(review): the scoring cell below re-reads `df` from the partial-results CSV, so this
# initialisation only matters for a first run that starts from the freshly loaded data — confirm.
df['y_pred'] = np.nan
df['reasoning'] = ""

In [16]:
# TRAINING LOOP
# Scores each remaining company with a three-step prompt chain:
#   1) summarise the sector-specific guidance (retrieval-augmented),
#   2) aggregate the per-metric risk categories into a risk profile (retrieval-augmented),
#   3) restate the result as a {'reasoning', 'answer'} object that is parsed into `y_pred`.
# Progress is resumed from, and periodically written back to, the partial-results CSV.

PARTIAL_RESULTS_PATH = "/content/drive/Shareddrives/ENGS Final Project/Models/Output/RAW_PARTIAL_RESULTS_rag_v6_mistral_UPDATED.csv"
FINAL_RESULTS_PATH = "/content/drive/Shareddrives/ENGS Final Project/Models/Output/RAW_RESULTS_rag_v6_mistral_UPDATED.csv"
RESUME_FROM_INDEX = 509        # rows below this index were scored in an earlier session
SAVE_EVERY_N_ROWS = 30         # checkpoint frequency
THROTTLE_AFTER_INDEX = 530     # only start pausing past this row
THROTTLE_EVERY_N_ROWS = 75     # pause once per this many rows ...
THROTTLE_SLEEP_SECONDS = 1800  # ... for this long, to stay under the API limit


def _escape_template_braces(text):
  """Double '{' and '}' so the text is treated as literal content by from_template."""
  return str(text).replace("{", "{{").replace("}", "}}")


model = HuggingFaceHub(
    # repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 7000,
        "top_k": 30,
        "temperature": 0.05,
        "repetition_penalty": 1.03,
    },
)

df = pd.read_csv(PARTIAL_RESULTS_PATH)

# STEP 1 chain does not depend on the row, so build it once.
step1_template = """<s> [INST]Use the following context to help you accomplish the task:""" + """
  {context}
  TASK: {step1}[/INST]"""
prompt1 = ChatPromptTemplate.from_template(step1_template)
chain1 = (
    {"context": retriever, "step1": RunnablePassthrough()}
    | prompt1
    | model
    | StrOutputParser()
)

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc='Scoring each row in df'):
  if index < RESUME_FROM_INDEX:
    continue

  # Periodically save output. index=False: the file is re-read with pd.read_csv on resume,
  # and writing the index would add another "Unnamed: 0" column on every cycle.
  if index % SAVE_EVERY_N_ROWS == 0:
    df.to_csv(PARTIAL_RESULTS_PATH, index=False)

  # Periodically sleep so we don't reach API limit
  if index > THROTTLE_AFTER_INDEX and (index + 1) % THROTTLE_EVERY_N_ROWS == 0:
    time.sleep(THROTTLE_SLEEP_SECONDS)

  # STEP 1: SUMMARIZE SECTOR-SPECIFIC GUIDANCE
  step1_output = mistral_extract_output(chain1.invoke(create_prompt1(row)))

  # STEP 2: AGGREGATE RATINGS

  # Clean dictionary string of risk categories: drop the outer braces, one metric per line
  risk_categories = str(row['Risk_Categories']).strip("{}").replace(", ", "\n")

  # Everything spliced into a template string below is brace-escaped first. Model output
  # (and data values) may contain '{' or '}', which from_template would otherwise read as
  # template variables and fail on.
  safe_company = _escape_template_braces(row['company_name'])
  safe_volatility = _escape_template_braces(row['volatility'])
  safe_risk_categories = _escape_template_braces(risk_categories)
  safe_step1_output = _escape_template_braces(step1_output)

  step2_template = f"<s> [INST]Use the following context to help you determine the financial risk profile for {safe_company}:" + """
  {context}

  Use the following informatino to help you determine the financial risk profile:
  1) The risk categories for each of the metrics based on the""" + f""" {safe_volatility} volatility table: {safe_risk_categories}.
  2) Summary the industry-specific guidance:""" + safe_step1_output + """

  Take a deep breath and think step by step. YOUR NEXT TASK: {step2}[/INST]"""
  prompt2 = ChatPromptTemplate.from_template(step2_template)
  chain2 = (
      {"context": retriever, "step2": RunnablePassthrough()}
      | prompt2
      | model
      | StrOutputParser()
  )
  step2_output = mistral_extract_output(chain2.invoke(create_prompt2(row)))
  safe_step2_output = _escape_template_braces(step2_output)

  # STEP 3: OUTPUT RESULT (no retrieval; only the earlier steps' outputs are provided)
  step3_template = f"""<s> [INST] Use the following information to help you determine the financial risk profile:
  1) risk categories for each of the metrics based on the {safe_volatility} volatility table: {safe_risk_categories}.
  2) Summary of the industry-specific guidance: {safe_step1_output}
  3) Risk profile assignment reasoning: {safe_step2_output}

  Here is your final task:""" + "{step3} [/INST]"
  prompt3 = ChatPromptTemplate.from_template(step3_template)
  final_chain = (
      {"step3":RunnablePassthrough()}
      | prompt3
      | model
      | StrOutputParser()
  )
  final_output = mistral_extract_output(final_chain.invoke(json_final_answer_prompt(row)))

  # extract_json_and_answer returns -1 / -2 when no answer object could be parsed.
  final_answer = extract_json_and_answer(final_output)
  df.at[index, 'y_pred'] = final_answer
  # Store the raw (unescaped) outputs for later inspection.
  df.at[index, 'reasoning'] = "# STEP 1 OUTPUT:\n" + step1_output + "\n # STEP 2 OUTPUT:\n" + step2_output + "\n # FINAL ANSWER OUTPUT:\n" + final_output

print("finished!")
df.to_csv(FINAL_RESULTS_PATH, index=False)

  warn_deprecated(
Scoring each row in df: 100%|██████████| 947/947 [5:08:46<00:00, 19.56s/it]


finished!


In [20]:
# Distribution of predicted values; parse failures from extract_json_and_answer show up as -1 / -2.
df['y_pred'].value_counts()

946

In [None]:
from IPython.display import Markdown, display
# Render the stored step-by-step reasoning for one row (position 3) as markdown.
display(Markdown(df['reasoning'].iloc[3]))