In [3]:
import pandas as pd
import os
import re

# Excel workbook that holds the property details, and the worksheet inside it
# that is passed to pd.read_excel further down.
file_path = 'Property & Alias info.xlsx'
sheet_name = 'PROPERTY INFO'

try:
    # Load the property sheet and index each row's contact details by the
    # value in its PROPERTY column
    property_data = pd.read_excel(file_path, sheet_name=sheet_name)
    detail_columns = ('PID', 'PHONE', 'ADDRESS', 'WEBSITE')
    property_dict = {}
    for _, record in property_data.iterrows():
        property_dict[record['PROPERTY']] = {
            column: record[column] for column in detail_columns
        }

    # Folder that holds the email CSV files
    email_data_folder = 'email_data'

    # Emails containing any of these terms are skipped
    terms_to_avoid = ['microsoft.com']

    def normalize_property_name(name):
        """Return a lowercase, keyword-free form of a property name.

        Everything from the first ' at ' onward is discarded, the remainder
        is lowercased, the word 'apartments' is removed, and runs of
        whitespace are collapsed to single spaces.

        Args:
            name: Property name string.

        Returns:
            The normalized name. It may be an empty string when nothing is
            left after removing the keyword.
        """
        base = name.split(' at ')[0].lower()
        # The text is already lowercase here, so only the lowercase keyword
        # can occur in it.
        base = base.replace('apartments', '')
        # Removing a keyword from the middle of a name leaves two adjacent
        # spaces; collapse them (and trim the ends) so the result can still
        # be found as a substring of ordinary text.
        return ' '.join(base.split())

    def content_contains_term(content, terms):
        """Report whether at least one entry of ``terms`` occurs in ``content``."""
        for candidate in terms:
            if candidate in content:
                return True
        return False

    def match_phone_in_content(content, phone):
        """Check whether a property's phone number appears in the email content.

        Args:
            content: Email text to search; callers pass it lowercased.
            phone: Phone value from the property sheet. May be a string, a
                number, or a missing value (None/NaN).

        Returns:
            True if the phone, rendered as text, is a non-empty substring of
            ``content``; False otherwise.
        """
        if pd.isnull(phone):
            return False
        # A spreadsheet column can be loaded as numbers rather than text, and
        # ``x in some_str`` raises TypeError unless x is a str. Whole-number
        # floats are converted to int first so they do not render as
        # '5551234567.0'.
        if isinstance(phone, float) and phone.is_integer():
            phone = int(phone)
        # Lowercase to agree with the lowercased content; strip stray padding.
        phone_text = str(phone).strip().lower()
        # An empty string is a substring of everything, so it must not count
        # as a match.
        return bool(phone_text) and phone_text in content

    def match_address_in_content(content, address):
        """Check whether an address's street and zipcode both occur in the content.

        The address is split on commas; the first non-empty part is treated
        as the street and the last non-empty part as the zipcode, both
        lowercased. Missing addresses and addresses with fewer than two
        non-empty parts never match.
        """
        if not pd.notnull(address):
            return False

        pieces = []
        for chunk in address.split(','):
            cleaned = chunk.strip()
            if cleaned:
                pieces.append(cleaned.lower())

        if len(pieces) < 2:
            return False

        first_piece, last_piece = pieces[0], pieces[-1]
        if first_piece not in content:
            return False
        return last_piece in content

    def extract_domain_from_url(url):
        """Extract the lowercase domain from a URL.

        An optional 'http://' or 'https://' scheme and an optional 'www.'
        prefix are skipped regardless of letter case, surrounding whitespace
        is ignored, and the domain ends at the first '/', '?' or '#'.

        Args:
            url: URL value; may be missing (None/NaN).

        Returns:
            The lowercased domain, or None when ``url`` is missing or no
            domain text can be found in it.
        """
        if pd.isnull(url):
            return None
        # IGNORECASE: without it an upper-case scheme such as 'HTTPS://' is
        # not recognised and 'HTTPS:' itself is captured as the domain.
        # '?' and '#' are excluded so a query string or fragment directly
        # after the host is not returned as part of the domain.
        match = re.search(
            r'(?:https?://)?(?:www\.)?([^/?#]+)',
            str(url).strip(),
            flags=re.IGNORECASE,
        )
        return match.group(1).lower() if match else None

    def match_website_in_content(content, website):
        """Report whether the domain of ``website`` occurs in ``content``.

        Returns False when no domain can be extracted from ``website``.
        """
        site_domain = extract_domain_from_url(website)
        if site_domain is None:
            return False
        return site_domain in content

    # Normalize each property name once up front rather than once per
    # (email, property) pair. Non-string names (e.g. blank cells) cannot be
    # normalized, and a name that normalizes to '' would be a substring of
    # every email, so both are left out of name matching. Such properties can
    # still be matched by phone, address, or website in the fallback below.
    name_matchers = []
    for property_name, property_info in property_dict.items():
        if not isinstance(property_name, str):
            continue
        normalized_name = normalize_property_name(property_name)
        if normalized_name:
            name_matchers.append((normalized_name, property_info['PID']))

    # Matched rows of each file; concatenated once after the loop instead of
    # re-copying the growing result for every file.
    matched_frames = []

    # Process each email CSV file in the folder. os.listdir returns entries
    # in arbitrary order, so sort them to make the output row order (and the
    # IDs derived from it) reproducible.
    for filename in sorted(os.listdir(email_data_folder)):
        if not filename.endswith('.csv'):
            continue
        email_file_path = os.path.join(email_data_folder, filename)

        try:
            # Load the email data; the column starts as None so that rows
            # left unmatched can be filtered out afterwards.
            email_data = pd.read_csv(email_file_path)
            email_data['Property ID'] = None

            for index, row in email_data.iterrows():
                # Skip missing fields so a NaN cell does not inject the
                # literal text 'nan' into the searchable content.
                email_content = ' '.join(
                    str(row[column])
                    for column in ('Email ID', 'Subject', 'Body')
                    if pd.notnull(row[column])
                ).lower()

                if content_contains_term(email_content, terms_to_avoid):
                    continue  # Skip emails containing an ignored term

                # First choice: the first property whose normalized name
                # appears in the email.
                matched_pid = next(
                    (pid for name, pid in name_matchers if name in email_content),
                    None,
                )

                # Fallback: the first property whose phone, address, or
                # website appears in the email.
                if pd.isnull(matched_pid):
                    matched_pid = next(
                        (
                            info['PID']
                            for info in property_dict.values()
                            if match_phone_in_content(email_content, info['PHONE'])
                            or match_address_in_content(email_content, info['ADDRESS'])
                            or match_website_in_content(email_content, info['WEBSITE'])
                        ),
                        None,
                    )

                if pd.notnull(matched_pid):
                    email_data.at[index, 'Property ID'] = matched_pid

            # Keep only the emails that received a Property ID
            matched_frames.append(email_data[email_data['Property ID'].notna()])

        except Exception as e:
            # Best-effort per file: report the problem and carry on with the
            # remaining files.
            print(f"Error processing {filename}: {e}")

    # pd.concat raises on an empty list, so fall back to an empty DataFrame
    # when no file was processed successfully. ignore_index avoids repeated
    # row labels, since every file's rows are labelled from 0.
    matched_emails = (
        pd.concat(matched_frames, ignore_index=True)
        if matched_frames
        else pd.DataFrame()
    )
    
    # Number the matched rows consecutively, starting at 1
    row_count = len(matched_emails)
    matched_emails['ID'] = range(1, row_count + 1)

    # Write the numbered matches to a single CSV file, without the row index
    output_file = 'aggregated_email_data.csv'
    matched_emails.to_csv(output_file, index=False)
    print(f"Aggregated email data has been saved to {output_file}.")

except Exception as e:
    print(f"Error loading property data: {e}")


Aggregated email data has been saved to aggregated_email_data.csv.
