In [230]:
import json
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pandas import json_normalize
from datetime import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from cfuzzyset import cFuzzySet as FuzzySet

In [231]:
df = pd.read_csv("ares_operations.csv")

In [232]:
def clean_text_2 (row):
    no_punc = re.sub(r'[^\w\s]', '', row)
    lower = no_punc.lower()
    no_space = re.sub(r'\s+', '', lower)
    return no_space

clean_text_2 ('Hello 45!')

'hello45'

In [233]:
def average_score_2 (row, terms):
    # find average fuzzy score among all variations of one row title
    standard_row = clean_text_2 (row)
    total_score = 0
    for search_string in terms:
        score = fuzz.ratio(standard_row, clean_text_2(search_string))
        total_score += score
    if len(terms) != 0:
        average_score = total_score / len(terms) 
    else:
        average_score = 0
    return average_score 

def top_score (row, terms):
    scores = []
    standard_row = clean_text_2 (row)
    for search_string in terms:
        score = fuzz.ratio (standard_row, clean_text_2(search_string))
        scores.append(score)
    return max(scores)

def return_best_match_2 (text, type_list):
    # return row title of best match
    text = clean_text_2(text)
    highest_average = 0
    winner = pd.NA
    for header in type_list:
        score = average_score_2(text, header['search_strings'])
        if score > highest_average:
            highest_average = score
            winner = header['row_title']
    for header in type_list:
        for search_string in header['search_strings']:
            if search_string == text or search_string in text:
                winner = header['row_title']
                
    return winner

def return_highest_score_match (text, type_list):
    text = clean_text_2(text)
    highest_average = 0
    winner = pd.NA
    for header in type_list:
        score = top_score(text, header['search_strings'])
        if score > highest_average:
            highest_average = score
            winner = header['row_title']
    for header in type_list:
        for search_string in header['search_strings']:
            if search_string == text or search_string in text:
                winner = header['row_title']
    return winner

In [234]:
operations_list = [
    {
        'row_title': 'EXPENSES',
        'search_strings': [], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Interest expense',
        'search_strings': ['interest and credit facility fees', 
                           'interest and other financing fees', 
                           'interest and other debt expenses', 
                           'interest', 
                           'interest and other debt financing expenses', 
                           'interest expense'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Base management fees',
        'search_strings': ['base management fees', 
                           'base management fee (Note 2)', 
                           'management fees'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Incentive fees',
        'search_strings': ['capital gains incentive fees', 
                           'incentive management fees', 
                           'incentive fees', 
                           'incentive fee', 
                           'income incentive fee', 
                           'subordinated income incentive fee', 
                           'income based incentive fee'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Administrative fees',
        'search_strings': ['administrative fees', 
                           'administrative service fee', 
                           'administrative services expenses'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'General and administrative expenses',
        'search_strings': ['other general and administrative', 
                           'general and administrative expenses', 
                           'general and administrative', 
                           'other general and administrative expenses'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total expenses',
        'search_strings': ['total expenses', 
                           'total operating expenses'], 
        'value1': None,
        'value2': None
    },
]

# Create a DataFrame from the operations_list
desired_columns = ['row_title', 'value1', 'value2']
op_df = pd.DataFrame([{key: d[key] for key in desired_columns} for d in operations_list])

print(op_df)

                             row_title value1 value2
0                             EXPENSES   None   None
1                     Interest expense   None   None
2                 Base management fees   None   None
3                       Incentive fees   None   None
4                  Administrative fees   None   None
5  General and administrative expenses   None   None
6                       Total expenses   None   None


In [235]:
interest_income_terms = ['interest income']
dividend_income_terms = ['dividend income']
pik_income_terms = ['payment in kind interest income']
other_income_terms = ['other income', 'fee income']
total_income_terms = ['total income']
ifd_income_terms = ['interest fee and dividend income']

non_affiliated_terms = ['non-control, non-affiliate',
                        'non-control, non-affiliated',
                        'non-controlled, non-affiliate',
                        'non-controlled, non-affiliated']

affiliated_terms = ['non-control, affiliate',
                    'non-control, affiliated',
                    'non-controlled, affiliate',
                    'non-controlled, affiliated']

control_terms = ['control, affiliate',
                 'control, affiliated',
                 'controlled, affiliate',
                 'controlled, affiliated']

investments_matrix = [
    {
        'affiliation': 'NA',
        'affiliation_terms': 'NA',
        'income_type': 'Investment income',
        'income_type_terms': ['investment income'],
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'None',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'Interest',
        'income_type_terms': interest_income_terms,
        'value1': None,
        'value2': None
    },
    {   'affiliation': 'None',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'PIK',
        'income_type_terms': pik_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'None',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'Dividend',
        'income_type_terms': dividend_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'None',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'Other',
        'income_type_terms': other_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'None',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'IFD',
        'income_type_terms': ifd_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'None',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'Total',
        'income_type_terms': total_income_terms,
        'value1': None,
        'value2': None
    },



    {
        'affiliation': 'Affiliated',
        'affiliation_terms': affiliated_terms,
        'income_type': 'Interest',
        'income_type_terms': interest_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Affiliated',
        'affiliation_terms': affiliated_terms,
        'income_type': 'PIK',
        'income_type_terms': pik_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Affiliated',
        'affiliation_terms': affiliated_terms,
        'income_type': 'Dividend',
        'income_type_terms': dividend_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Affiliated',
        'affiliation_terms': affiliated_terms,
        'income_type': 'Other',
        'income_type_terms': other_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Affiliated',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'IFD',
        'income_type_terms': ifd_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Affiliated',
        'affiliation_terms': affiliated_terms,
        'income_type': 'Total',
        'income_type_terms': total_income_terms,
        'value1': None,
        'value2': None
    },



    {
        'affiliation': 'Controlled',
        'affiliation_terms': control_terms,
        'income_type': 'Interest',
        'income_type_terms': interest_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Controlled',
        'affiliation_terms': control_terms,
        'income_type': 'PIK',
        'income_type_terms': pik_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Controlled',
        'affiliation_terms': control_terms,
        'income_type': 'Dividend',
        'income_type_terms': dividend_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Controlled',
        'affiliation_terms': control_terms,
        'income_type': 'Other',
        'income_type_terms': other_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Controlled',
        'affiliation_terms': non_affiliated_terms,
        'income_type': 'IFD',
        'income_type_terms': ifd_income_terms,
        'value1': None,
        'value2': None
    },
    {
        'affiliation': 'Controlled',
        'affiliation_terms': control_terms,
        'income_type': 'Total',
        'income_type_terms': total_income_terms,
        'value1': None,
        'value2': None
    }
]

In [236]:
def best_match_subheading (subheading, matrix):
    best_match = None
    best_match_score = 0
    for row in matrix:
        affl_avg_score = top_score (subheading, row['affiliation_terms'])
        if affl_avg_score > best_match_score:
            best_match, best_match_type = row['affiliation'], 'affiliation'
            best_match_score = affl_avg_score
        for term in row['affiliation_terms']:
            if term == subheading:
                return row['affiliation'], 'affiliation'
            
        inv_avg_score = top_score (subheading, row['income_type_terms'])
        if inv_avg_score > best_match_score:
            best_match, best_match_type = row['income_type'], 'income_type'
            best_match_score = inv_avg_score
        for term in row['income_type_terms']:
            if term == subheading:
                return row['income_type'], 'income_type'
    return best_match, best_match_type

    # Filter list of dictionaries to only include rows with the specified affiliation`
    # filtered_list = [d for d in investments_matrix if d['affiliation'] == affiliation]`

In [237]:
x, y = best_match_subheading ("dividend income", investments_matrix)

In [238]:
best_match_subheading("INVESTMENT INCOME:", investments_matrix)

('Investment income', 'income_type')

In [239]:
def filter_from_subheadings (df):
    subheading_match = ''
    for i in range(len(df)):
        row = df.iloc[i]
        if pd.isnull(row[1]) and pd.isnull(row[2]):
            subheading_match, match_type = best_match_subheading(row[0], investments_matrix)
            return [d for d in investments_matrix if d[match_type] == subheading_match]
            # return match_type
            # return row[0]

In [240]:
filter_from_subheadings (df_investments[1:])

[{'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, non-affiliated',
   'non-controlled, non-affiliate',
   'non-controlled, non-affiliated'],
  'income_type': 'Interest',
  'income_type_terms': ['interest income'],
  'value1': None,
  'value2': None},
 {'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, non-affiliated',
   'non-controlled, non-affiliate',
   'non-controlled, non-affiliated'],
  'income_type': 'PIK',
  'income_type_terms': ['payment in kind interest income'],
  'value1': None,
  'value2': None},
 {'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, non-affiliated',
   'non-controlled, non-affiliate',
   'non-controlled, non-affiliated'],
  'income_type': 'Dividend',
  'income_type_terms': ['dividend income'],
  'value1': None,
  'value2': None},
 {'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, no

In [241]:
def return_best_match_2 (text, type_list):
    # return row title of best match
    text = clean_text_2(text)
    highest_average = 0
    winner = pd.NA
    for header in type_list:
        score = average_score_2(text, header['search_strings'])
        if score > highest_average:
            highest_average = score
            winner = header['row_title']
    for header in type_list:
        for search_string in header['search_strings']:
            if search_string == text or search_string in text:
                winner = header['row_title']
                
    return winner

In [242]:
df_investments

Unnamed: 0.1,Unnamed: 0,Three Months Ended 2023,Three Months Ended 2022
0,INVESTMENT INCOME:,,
1,From non-controlled/non-affiliate company inve...,,
2,Interest income (excluding payment-in-kind (“P...,367000000.0,242000000.0
3,PIK interest income,37000000.0,27000000.0
4,Capital structuring service fees,10000000.0,26000000.0
5,Dividend income,57000000.0,41000000.0
6,Other income,16000000.0,10000000.0
7,Total investment income from non-controlled/no...,487000000.0,346000000.0
8,From non-controlled affiliate company investme...,,
9,Interest income (excluding PIK interest income),3000000.0,1000000.0


In [243]:
best_match_subheading("From controlled affiliate company investments:", investments_matrix)

('Affiliated', 'affiliation')

In [244]:
def show_investment (df, matrix):
    subheading_match, subheading_type = '', ''
    for i in range(len(df)):
        row = df.iloc[i]
        if pd.isnull(row[1]) and pd.isnull(row[2]):
            subheading_match, subheading_type = best_match_subheading(row[0], matrix)
            print('subheading: ' + subheading_match + ', ' + subheading_type)

In [245]:
show_investment (df_investments, investments_matrix)

subheading: Investment income, income_type
subheading: None, affiliation
subheading: Affiliated, affiliation
subheading: Affiliated, affiliation


In [246]:
def get_investment (df, matrix):
    subheading_match, subheading_type = '', ''
    for i in range(len(df)):
        row = df.iloc[i]
        if pd.isnull(row[1]) and pd.isnull(row[2]):
            subheading_match, subheading_type = best_match_subheading(row[0], matrix)
            continue
        dicts_to_search = [d for d in matrix if d[subheading_type] == subheading_match]
        text_match, text_type = best_match_subheading(row[0], dicts_to_search)
        print(text_match, text_type)
        for item in matrix:
              if item[text_type] == text_match and item[subheading_type] == subheading_match:
                # print("found")
                # print(item)
                item['value1'] = row[1]
                item['value2'] = row[2]

In [247]:
print(best_match_subheading(df_investments.iloc[1][0], investments_matrix))
dicts = [d for d in investments_matrix if d['affiliation'] == 'None']
print(dicts)
print(best_match_subheading(df_investments.iloc[2][0], dicts))
one = [d for d in investments_matrix if d['affiliation'] == 'None' and d['income_type'] == 'PIK']
print(one)
print(df_investments.iloc[2][1])


('None', 'affiliation')
[{'affiliation': 'None', 'affiliation_terms': ['non-control, non-affiliate', 'non-control, non-affiliated', 'non-controlled, non-affiliate', 'non-controlled, non-affiliated'], 'income_type': 'Interest', 'income_type_terms': ['interest income'], 'value1': None, 'value2': None}, {'affiliation': 'None', 'affiliation_terms': ['non-control, non-affiliate', 'non-control, non-affiliated', 'non-controlled, non-affiliate', 'non-controlled, non-affiliated'], 'income_type': 'PIK', 'income_type_terms': ['payment in kind interest income'], 'value1': None, 'value2': None}, {'affiliation': 'None', 'affiliation_terms': ['non-control, non-affiliate', 'non-control, non-affiliated', 'non-controlled, non-affiliate', 'non-controlled, non-affiliated'], 'income_type': 'Dividend', 'income_type_terms': ['dividend income'], 'value1': None, 'value2': None}, {'affiliation': 'None', 'affiliation_terms': ['non-control, non-affiliate', 'non-control, non-affiliated', 'non-controlled, non-affil

In [248]:
get_investment (df_investments, investments_matrix)
investments_matrix

PIK income_type
Interest income_type
Interest income_type
Dividend income_type
Other income_type
None affiliation
PIK income_type
Interest income_type
Affiliated affiliation
PIK income_type
Interest income_type
Interest income_type
Dividend income_type
Other income_type
Affiliated affiliation
Total income_type


[{'affiliation': 'NA',
  'affiliation_terms': 'NA',
  'income_type': 'Investment income',
  'income_type_terms': ['investment income'],
  'value1': None,
  'value2': None},
 {'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, non-affiliated',
   'non-controlled, non-affiliate',
   'non-controlled, non-affiliated'],
  'income_type': 'Interest',
  'income_type_terms': ['interest income'],
  'value1': '487000000.0',
  'value2': 346000000.0},
 {'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, non-affiliated',
   'non-controlled, non-affiliate',
   'non-controlled, non-affiliated'],
  'income_type': 'PIK',
  'income_type_terms': ['payment in kind interest income'],
  'value1': '487000000.0',
  'value2': 346000000.0},
 {'affiliation': 'None',
  'affiliation_terms': ['non-control, non-affiliate',
   'non-control, non-affiliated',
   'non-controlled, non-affiliate',
   'non-controlled, non-affiliated'],
  '

In [249]:
# Filter list of dictionaries to only the dictionaries where the affiliation is 'Affiliated'
# filtered_list = [d for d in investments_matrix if d['affiliation'] == 'Affiliated']

In [250]:
investment_income_list = [
    {
        'row_title': 'Investment income',
        'search_strings': [], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled, non-affiliated investments: Interest income',
        'search_strings': ['from non controlled non affiliate company investments interest income', 
                           'from non controlled non affiliated investments interest income', 
                           'non control non affiliate investments interest income', 
                           'investment income from non controlled non affiliated investments interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled, non-affiliated investments: Dividend income',
        'search_strings': ['from non controlled non affiliate company investments dividend income', 
                           'from non controlled non affiliated investments dividend income', 
                           'non control non affiliate investments dividend income', 
                           'investment income from non controlled  non affiliated investments dividend income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled, non-affiliated investments: Payment-in-kind income',
        'search_strings': ['from non controlled non affiliate company investments payment in kind interest income', 
                           'from non controlled non affiliated investments payment in kind interest income', 
                           'non control non affiliate investments payment in kind interest income', 
                           'investment income from non controlled non affiliated investments payment in kind interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled, non-affiliated investments: Other income',
        'search_strings': ['from non controlled non affiliate company investments other income', 
                           'from non controlled non affiliated investments other income', 
                           'non control non affiliate investments other income', 
                           'investment income from non controlled  non affiliated investments other income', 
                           'from non controlled non affiliated investments fee income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled, non-affiliated investments: total interest, fee, and dividend income',
        'search_strings': ['interest fee and dividend income non control non affiliate investments'],
        'value1' : None,
        'value2' : None
    },
    {
        'row_title': 'Controlled affiliated investments: Interest income',
        'search_strings': ['from controlled affiliate company investments interest income', 
                           'from controlled affiliated investments  interest income', 
                           'control investments interest income', 
                           'investment income from controlled affiliated investments interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Controlled affiliated investments: Dividend income',
        'search_strings': ['from controlled affiliate company investments dividend income', 
                           'from controlled affiliated investments dividend income', 
                           'control investments dividend income', 
                           'investment income from controlled affiliated investments dividend income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Controlled affiliated investments: Payment-in-kind income',
        'search_strings': ['from controlled affiliate company investments payment in kind interest income', 
                           'from controlled affiliated investments payment in kind interest income', 
                           'control investments payment in kind interest income', 
                           'investment income from controlled affiliated investments payment in kind interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Controlled affiliated investments: Other income',
        'search_strings': ['from controlled affiliate company investments other income', 
                           'from controlled affiliated investments fee income', 
                           'from controlled affiliated investments other income', 
                           'control investments other income', 
                           'investment income from controlled affiliated investments other income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Controlled affiliated investments: total interest, fee, and dividend income',
        'search_strings': ['interest fee and dividend income control investments'],
        'value1' : None,
        'value2' : None
    },
    {
        'row_title': 'Non-controlled affiliated investments: Interest income',
        'search_strings': ['from non controlled affiliate company investments interest income', 
                           'from non controlled affiliated investments interest income', 
                           'affiliate investments interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled affiliated investments: Dividend income',
        'search_strings': ['from non controlled affiliate company investments dividend income', 
                           'from non controlled affiliated investments dividend income', 
                           'affiliate investments dividend income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled affiliated investments: Payment-in-kind income',
        'search_strings': ['from non controlled affiliate company investments payment in kind interest income', 
                           'from non controlled affiliated investments payment in kind interest income', 
                           'affiliate investments payment in kind interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled affiliated investments: Other income',
        'search_strings': ['from non controlled affiliate company investments other income', 
                           'fee income', 
                           'from non controlled affiliated investments other income', 
                           'affiliate investments other income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Non-controlled affiliated investments: total interest, fee, and dividend income',
        'search_strings': ['interest fee and dividend income affiliated investments'],
        'value1' : None,
        'value2' : None
    },
    {
        'row_title': 'Total interest income',
        'search_strings': ['investment income interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total PIK income',
        'search_strings': ['investment income payment in kind interest income'],
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total dividend income',
        'search_strings': ['investment income dividend income'],
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total fee income',
        'search_strings': ['investment income fee income'],
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total investment income',
        'search_strings': ['total investment income'], 
        'value1': None,
        'value2': None
    },
]

type_of_income_list = [
    {
        'row_title': 'Total interest income',
        'search_strings': ['interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total dividend income',
        'search_strings': ['dividend income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total payment-in-kind income',
        'search_strings': ['payment in kind interest income'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total other income',
        'search_strings': ['fee income'], 
        'value1': None,
        'value2': None
    },
]

type_of_investment_list = [
    {
        'row_title': 'Total non-controlled, non-affiliated investments',
        'search_strings': ['non control non affiliate investments'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total controlled affiliated investments',
        'search_strings': ['control investments'], 
        'value1': None,
        'value2': None
    },
    {
        'row_title': 'Total non-controlled affiliated investments',
        'search_strings': ['affiliate investments'], 
        'value1': None,
        'value2': None
    },
]

# Create a DataFrame from the operations_list
desired_columns = ['row_title', 'value1', 'value2']
inv_df = pd.DataFrame([{key: d[key] for key in desired_columns} for d in investment_income_list])

print(inv_df)

                                            row_title value1 value2
0                                   Investment income   None   None
1   Non-controlled, non-affiliated investments: In...   None   None
2   Non-controlled, non-affiliated investments: Di...   None   None
3   Non-controlled, non-affiliated investments: Pa...   None   None
4   Non-controlled, non-affiliated investments: Ot...   None   None
5   Non-controlled, non-affiliated investments: to...   None   None
6   Controlled affiliated investments: Interest in...   None   None
7   Controlled affiliated investments: Dividend in...   None   None
8   Controlled affiliated investments: Payment-in-...   None   None
9     Controlled affiliated investments: Other income   None   None
10  Controlled affiliated investments: total inter...   None   None
11  Non-controlled affiliated investments: Interes...   None   None
12  Non-controlled affiliated investments: Dividen...   None   None
13  Non-controlled affiliated investments: Payme

In [251]:
df = pd.read_csv("ares_operations.csv")

# print(clean_text_2(df.iloc[:, 0][0]))

def extract_investment_rows (df, start_cell_str, end_cell_str):
    try:
        start_row = df[df.iloc[:, 0].apply(clean_text_2) == start_cell_str].index[0]
        end_row = df[df.iloc[:, 0].apply(clean_text_2) == end_cell_str].index[0]
    except IndexError:
        print("Start or end cell string not found in the DataFrame.")
        return
    
    portion = df.iloc[start_row : end_row + 1, :]
    return portion

In [252]:
df_investments = extract_investment_rows(df, 'investmentincome', 'totalinvestmentincome')

In [253]:
subcategory_list = [
    {
        'category': 'No affiliation',
        'search_terms': ['non control non affiliate', 
                        'non control non affiliated', 
                        'non controlled non affiliate', 
                        'non controlled non affiliated',
                        'non control unaffiliated',
                        'non control unaffiliate']
    },
    {
        'category': 'Non-controlled affiliate',
        'search_terms': ['non control affiliate',
                        'non control affiliated',
                        'non controlled affiliate',
                        'non controlled affiliated']
                        # 'affiliate',
                        # 'affiliated']
    },
    {
        'category': 'Controlled affiliate',
        'search_terms': ['control affiliate',
                         'control affiliated',
                         'controlled affiliate',
                         'controlled affiliated']
    },
    {
        'category': 'Interest income',
        'search_terms': ['interest income']
    },
    {
        'category': 'Dividend income',
        'search_terms': ['dividend income']
    },
    {
        'category': 'PIK income',
        'search_terms': ['payment in kind']
    }
]




In [254]:
def get_investment_income (df):
    subheading = ''
    for i in range(len(df)):
        row = df.iloc[i]
        if pd.isnull(row[1]) and pd.isnull(row[2]):
            subheading = row[0]
            continue
        header = clean_text_2(subheading + row[0])
        best_match = return_best_match_2(header, investment_income_list)
        print(best_match)
        for item in investment_income_list:
                    if item['row_title'] == best_match:
                        item['value1'] = row[1]
                        item['value2'] = row[2]

In [255]:
get_investment_income (df_investments)
investment_income_list
# return_best_match_2(header, investment_income_list)
# investment_income_list

Non-controlled, non-affiliated investments: Payment-in-kind income
Non-controlled, non-affiliated investments: Interest income
Non-controlled, non-affiliated investments: Interest income
Non-controlled, non-affiliated investments: Dividend income
Non-controlled, non-affiliated investments: Other income
Non-controlled, non-affiliated investments: Payment-in-kind income
Non-controlled affiliated investments: Payment-in-kind income
Non-controlled affiliated investments: Interest income
Non-controlled, non-affiliated investments: Payment-in-kind income
Non-controlled affiliated investments: Payment-in-kind income
Non-controlled affiliated investments: Interest income
Non-controlled affiliated investments: Interest income
Non-controlled affiliated investments: Dividend income
Controlled affiliated investments: Other income
Non-controlled, non-affiliated investments: Payment-in-kind income
Non-controlled affiliated investments: Interest income


[{'row_title': 'Investment income',
  'search_strings': [],
  'value1': None,
  'value2': None},
 {'row_title': 'Non-controlled, non-affiliated investments: Interest income',
  'search_strings': ['from non controlled non affiliate company investments interest income',
   'from non controlled non affiliated investments interest income',
   'non control non affiliate investments interest income',
   'investment income from non controlled non affiliated investments interest income'],
  'value1': '10000000.0',
  'value2': 26000000.0},
 {'row_title': 'Non-controlled, non-affiliated investments: Dividend income',
  'search_strings': ['from non controlled non affiliate company investments dividend income',
   'from non controlled non affiliated investments dividend income',
   'non control non affiliate investments dividend income',
   'investment income from non controlled  non affiliated investments dividend income'],
  'value1': '57000000.0',
  'value2': 41000000.0},
 {'row_title': 'Non-co