# Analyzing CTCAC tax credits project applications from previous years 
Context: 2025 4% allocations. A model trained on round 1 data was used to predict round 2 (I did this manually), but it achieved only 60% accuracy (a terrible result).  
Testing on round 2 data shows that a model trained only on round 1 data is insufficient.   
Possible solutions:
- Get more training data from previous years (get at least 1000 data points if possible)
- Include per-round fund allocation data in the training set 



In [3]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Remove pandas' cap on displayed columns so wide frames print every column.
pd.set_option('display.max_columns', None)  # None = no limit

In [None]:
def _load_excel(path, **read_kwargs):
    """Read an Excel workbook and record its origin.

    The path is stored under ``attrs['file_name']`` of the returned
    DataFrame so it can be reported later without repeating the literal.
    Extra keyword arguments are forwarded to ``pd.read_excel``.
    """
    frame = pd.read_excel(path, **read_kwargs)
    frame.attrs['file_name'] = path
    return frame


# Applicant lists take their column names from the second sheet row (header=1).
_APPLICANT_KW = {"header": 1, "index_col": None}

# --- 2025 ---
R1_2025_applicant = _load_excel("../data/external/2025-R1-ApplicantList.xlsx", **_APPLICANT_KW)
award_2025_R1 = _load_excel("../data/external/2025-R1-AwardList.xlsx")

# --- 2024 ---
R1_2024_applicant = _load_excel("../data/external/2024-R1-ApplicantList.xlsx", **_APPLICANT_KW)
R2_2024_applicant = _load_excel("../data/external/2024-R2-ApplicantList.xlsx", **_APPLICANT_KW)
award_2024 = _load_excel("../data/external/2024-Financing-data.xlsx", index_col=None)

# --- 2023 ---
R1_2023_applicant = _load_excel("../data/external/2023-R1-ApplicantList.xlsx", **_APPLICANT_KW)
R2_2023_applicant = _load_excel("../data/external/2023-R2-ApplicantList.xlsx", **_APPLICANT_KW)
R3_2023_applicant = _load_excel("../data/external/2023-R3-ApplicantList.xlsx", **_APPLICANT_KW)
award_2023 = _load_excel("../data/external/2023-Financing-data.xlsx", index_col=None)

# (DataFrame, display name) pairs, in load order, used when printing summaries.
_frames_by_name = {
    "R1_2025_applicant": R1_2025_applicant,
    "award_2025_R1": award_2025_R1,
    "R1_2024_applicant": R1_2024_applicant,
    "R2_2024_applicant": R2_2024_applicant,
    "award_2024": award_2024,
    "R1_2023_applicant": R1_2023_applicant,
    "R2_2023_applicant": R2_2023_applicant,
    "R3_2023_applicant": R3_2023_applicant,
    "award_2023": award_2023,
}
dataframes = [(frame, label) for label, frame in _frames_by_name.items()]

# Print the source file, column names and summary statistics of each DataFrame.
for df, df_name in dataframes:
    print(f"DataFrame: {df_name}")
    print(f"File Name: {df.attrs['file_name']}")
    print("Columns:", df.columns.tolist())
    # include="all" summarises numeric and non-numeric columns alike; an empty
    # string is not a valid dtype selector for describe().
    print("info:", df.describe(include="all"))
    print("-" * 50)

DataFrame: R1_2025_applicant
File Name: ../data/external/2025-R1-ApplicantList.xlsx
Columns: ['APPLICATION NUMBER', 'PROJECT NAME', 'CONSTRUCTION TYPE', 'HOUSING TYPE', 'CITY', 'COUNTY', 'TOTAL UNITS', 'LOW INCOME UNITS', 'MARKET RATE UNITS', 'UNITS FOR HOMELESS', 'HOMELESS %', 'AVERAGE TARGETED AFFORDABILITY', 'TOTAL PROJECT COSTS', 'BOND REQUEST', 'ANNUAL FEDERAL CREDIT REQUEST', 'STATE CREDIT REQUEST', 'CDLAC POOL', 'NEW CONSTRUCTION SET ASIDE', 'BIPOC PRE-QUALIFIED', 'CTCAC REGION', 'CDLAC REGION', 'CDLAC TOTAL POINTS SCORE', 'PRESERVATION AND OTHER REHAB. PROJECT PRIORITIES (20 PTS)', 'NEW CONSTRUCTION DENSITY & LOCAL INCENTIVES (10 PTS)', 'EXCEEDING MINIMUM INCOME RESTRICTIONS (20 PTS)', 'EXCEEDING MINIMUM RENT RESTRICTIONS (10 PTS)', 'GP & MGMT. CO. EXPERIENCE (10 PTS)', 'HOUSING NEEDS (10 PTS)', 'LEVERAGED SOFT RESOURCES (8 PTS)', 'READINESS TO PROCEED (10 PTS)', 'AFFIRMATIVELY FURTHERING FAIR HOUSING (10 PTS)', 'SERVICE AMENITIES (10 PTS)', 'COST CONTAINMENT (12 PTS)', 'SITE A

In [19]:
import re 
# Function to find columns by partial name match (case-insensitive)
def find_columns(df, pattern):
    """
    Find columns in a DataFrame whose label matches a regular expression.

    The search is case-insensitive and unanchored (``re.search``), so 'id'
    matches any label that contains "id". Labels that are not strings
    (e.g. numeric headers) are matched against their ``str()`` form rather
    than raising ``TypeError``.

    pattern: str, a regular expression, e.g., 'id', 'name', 'status'
    Returns: list of matching column labels, in DataFrame column order
    """
    # Compile once instead of re-resolving the pattern for every column.
    regex = re.compile(pattern, re.IGNORECASE)
    return [col for col in df.columns if regex.search(str(col))]

# Function to summarize columns with describe()
def summarize_columns(df, df_name, file_name, patterns):
    """
    Print a summary of the text-like columns matching the given patterns.

    Columns are looked up per pattern with ``find_columns``. Every matched
    column gets a header line; columns with object, string or categorical
    dtype additionally get their ``describe()`` and ``value_counts()``
    output. Columns of any other dtype are listed by name only.

    df: DataFrame to inspect
    df_name: label printed in the header
    file_name: source file name printed in the header
    patterns: list of strings to match column names (e.g., ['id', 'name'])
    Returns: None (everything is printed to stdout)
    """
    print(f"DataFrame: {df_name}")
    print(f"File Name: {file_name}")

    for pattern in patterns:
        matched_columns = find_columns(df, pattern)
        if not matched_columns:
            print(f"No columns found matching pattern '{pattern}'")
            continue

        print(f"\nPattern: {pattern}")
        for col in matched_columns:
            print(f"\nColumn: {col}")
            dtype = df[col].dtype
            # Use pandas' dtype predicates rather than comparing against
            # dtype-name strings, so string extension dtypes are summarised
            # as well as plain object and categorical columns.
            is_text_like = (
                isinstance(dtype, pd.CategoricalDtype)
                or pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
            )
            if is_text_like:
                print(df[col].describe())
                print(df[col].value_counts())
            print("-" * 50)
    print("=" * 50)

# Regular-expression fragments used to pick columns by name; edit to suit the data.
patterns = ['construction']

# Summarise the matching columns of every loaded DataFrame.
for df, df_name in dataframes:
    summarize_columns(
        df=df,
        df_name=df_name,
        file_name=df.attrs['file_name'],
        patterns=patterns,
    )

DataFrame: R1_2025_applicant
File Name: ../data/external/2025-R1-ApplicantList.xlsx

Pattern: construction

Column: CONSTRUCTION TYPE
count                   91
unique                   3
top       New Construction
freq                    64
Name: CONSTRUCTION TYPE, dtype: object
CONSTRUCTION TYPE
New Construction          64
Acq and Rehabilitation    26
Adaptive Reuse             1
Name: count, dtype: int64
--------------------------------------------------

Column: NEW CONSTRUCTION SET ASIDE
count                    27
unique                    2
top       Homeless, ELI/VLI
freq                     15
Name: NEW CONSTRUCTION SET ASIDE, dtype: object
NEW CONSTRUCTION SET ASIDE
Homeless, ELI/VLI    15
ELI/VLI              12
Name: count, dtype: int64
--------------------------------------------------

Column: NEW CONSTRUCTION DENSITY & LOCAL INCENTIVES (10 PTS)
--------------------------------------------------
DataFrame: award_2025_R1
File Name: ../data/external/2025-R1-AwardList.xlsx


## Issue
The columns and their names all differ slightly from one file to the next. How do I merge them into a single file? What about future Excel files? Is it possible to set up a pipeline that deals with all this variability? 

In [41]:
import re

# Column labels to discard: those starting with "GP", those containing " PTS",
# and those starting with "Points,". Compiled once, outside the loop.
_DROP_PATTERNS = tuple(
    re.compile(expr) for expr in (r'GP.*', r'(.* PTS)', r'Points,.*')
)

# Trimmed frames are collected here; df_list itself is left untouched so cells
# that still rely on the full set of columns keep working.
df_list_trimmed = []
for df in df_list:
    print(df.shape)
    # str(col) keeps non-string labels (e.g. numeric headers) from raising.
    cols_to_drop = [
        col for col in df.columns
        if any(rx.match(str(col)) for rx in _DROP_PATTERNS)
    ]
    print(cols_to_drop)
    df = df.drop(columns=cols_to_drop)
    # drop() returns a new frame; rebinding the loop variable alone would lose
    # it on the next iteration, so keep each result.
    df_list_trimmed.append(df)
    print("new df shape: ", df.shape)
    print(df.columns)
    print("-----------------\n")

(161, 45)
['PRESERVATION AND OTHER REHAB. PROJECT PRIORITIES (20 PTS)', 'NEW CONSTRUCTION DENSITY & LOCAL INCENTIVES (10 PTS)', 'EXCEEDING MINIMUM INCOME RESTRICTIONS (20 PTS)', 'EXCEEDING MINIMUM RENT RESTRICTIONS (10 PTS)', 'GP & MGMT. CO. EXPERIENCE (10 PTS)', 'HOUSING NEEDS (10 PTS)', 'LEVERAGED SOFT RESOURCES (8 PTS)', 'READINESS TO PROCEED (10 PTS)', 'AFFIRMATIVELY FURTHERING FAIR HOUSING (10 PTS)', 'SERVICE AMENITIES (10 PTS)', 'COST CONTAINMENT (12 PTS)', 'SITE AMENITIES (10 PTS)', 'GP 1 COMPANY NAME', 'GP 1 CONTACT NAME', 'GP 1 PARENT COMPANY', 'GP 2 COMPANY NAME', 'GP 2 CONTACT NAME', 'GP 2 PARENT COMPANY', 'GP 3 COMPANY NAME', 'GP 3 CONTACT NAME', 'GP 3 PARENT COMPANY']
new df shape:  (161, 24)
Index(['APPLICATION NUMBER', 'PROJECT NAME', 'CONSTRUCTION TYPE',
       'HOUSING TYPE', 'CITY', 'COUNTY', 'TOTAL UNITS', 'LOW INCOME UNITS',
       'MARKET RATE UNITS', 'UNITS FOR HOMELESS INDIVIDUALS',
       'AVERAGE AFFORDABILTY (TARGETED AMI)', 'TOTAL PROJECT COST',
       'CONST