In [6]:
#Version 2.1
import pandas as pd
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from nltk.corpus import stopwords
from metaphone import doublemetaphone

# Load NLTK stop words
stop_words = set(stopwords.words('english'))

# Load the property details and alias details.
# Both sheets live in the same workbook, so it is parsed once; passing a list
# of sheet names makes read_excel return a dict of DataFrames keyed by sheet.
_workbook_sheets = pd.read_excel(
    'Property & Alias Info.xlsx',
    sheet_name=['PROPERTY INFO', 'ALIAS INFO'],
)
property_data = _workbook_sheets['PROPERTY INFO']
alias_data = _workbook_sheets['ALIAS INFO']

# Load the text message and voicemail logs
text_message_logs = pd.read_csv('Text_Logs(2024-08-19-2024-09-27)-20240927.csv')
voicemail_logs = pd.read_csv('Voicemails(2024-08-19-2024-09-27)-20240927.csv')

# Normalize phone numbers: remove country code (1) and non-digit characters
def normalize_phone_number(phone_number):
    """Normalize a phone number to a string containing only its digits.

    Args:
        phone_number: The raw value (string, int, float, or a missing value
            such as None/NaN).

    Returns:
        The digits of the number as a string, with a leading country code
        '1' dropped when the number has exactly 11 digits. Missing values
        yield an empty string.
    """
    if pd.isna(phone_number):  # Missing values have no digits to keep
        return ''

    # A whole-number float (e.g. 5551234567.0) would stringify with a
    # trailing ".0"; stripping non-digits would then leave a spurious extra
    # "0" at the end of the number. Convert to int first.
    if isinstance(phone_number, float) and phone_number.is_integer():
        phone_number = int(phone_number)

    digits = re.sub(r'\D', '', str(phone_number))  # Keep digits only
    if len(digits) == 11 and digits.startswith('1'):  # Drop country code '1'
        digits = digits[1:]
    return digits

# Normalize phone numbers in text and voicemail logs
text_message_logs['ReceiverPhoneNumber'] = text_message_logs['to_phone'].apply(normalize_phone_number)
text_message_logs['SenderPhoneNumber'] = text_message_logs['from_phone'].apply(normalize_phone_number)
voicemail_logs['SenderPhoneNumber'] = voicemail_logs['external_number'].apply(normalize_phone_number)
voicemail_logs['ReceiverPhoneNumber'] = voicemail_logs['internal_number'].apply(normalize_phone_number)

# Normalize phone numbers in property data for comparison
property_data['PHONE'] = property_data['PHONE'].apply(normalize_phone_number)
alias_data['PHONE NUMBER'] = alias_data['PHONE NUMBER'].apply(normalize_phone_number)

# Create property and alias dictionaries for quick lookup.
# Missing phone numbers normalize to '' -- those entries are skipped so that a
# log row whose own phone number is missing (also '') cannot "match" them.
# When a phone number appears on several rows, the last row wins.
property_dict = {
    phone: pid
    for phone, pid in zip(property_data['PHONE'], property_data['PID'])
    if phone
}
alias_dict = {
    phone: alias_id
    for phone, alias_id in zip(alias_data['PHONE NUMBER'], alias_data['#'])
    if phone
}

# Define a list of common keywords to remove
keywords_to_remove = ['apartments', 'ranch', 'residences', 'villas', 'homes', 'condos', 'estates']

def preprocess_property_name(property_name):
    """Return a lower-cased property name with generic keywords removed.

    Args:
        property_name: The raw property name; any non-string value
            (None, NaN, numbers) is treated as missing.

    Returns:
        The lower-cased name with every whole-word occurrence of the entries
        in ``keywords_to_remove`` deleted and whitespace collapsed to single
        spaces, or '' when the input is not a string.
    """
    if not isinstance(property_name, str):
        return ''  # Missing or non-text names have nothing to clean

    cleaned = property_name.lower()
    for keyword in keywords_to_remove:
        # \b keeps the removal to whole words (e.g. "ranch" but not
        # "ranchero"); re.escape guards against regex metacharacters.
        cleaned = re.sub(rf'\b{re.escape(keyword)}\b', '', cleaned, flags=re.IGNORECASE)

    # Removing a keyword from the middle of a name leaves two adjacent
    # spaces; split/join collapses them and trims both ends.
    return ' '.join(cleaned.split())

    
def fuzzy_match_property_name(text, property_data, threshold=75):
    """Return the PID of the property whose cleaned name best matches ``text``.

    Args:
        text: Free text to search for a property name. Non-string or blank
            values never match.
        property_data: DataFrame with 'PROPERTY' and 'PID' columns. A
            'CLEANED PROPERTY NAME' column is added to it on first use and
            reused on later calls.
        threshold: Minimum token-set-ratio score (0-100) for a match to be
            accepted.

    Returns:
        The 'PID' of the first row whose cleaned name equals the best match,
        or None when nothing reaches ``threshold``.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    # This function is called once per log row; the cleaned names only depend
    # on the property table, so build them once instead of on every call.
    # NOTE: if 'PROPERTY' is edited afterwards, drop this column to refresh it.
    if 'CLEANED PROPERTY NAME' not in property_data.columns:
        property_data['CLEANED PROPERTY NAME'] = property_data['PROPERTY'].apply(preprocess_property_name)
    cleaned_names = property_data['CLEANED PROPERTY NAME']

    # Perform fuzzy matching with cleaned property names
    best_match = process.extractOne(text, cleaned_names, scorer=fuzz.token_set_ratio)
    if not best_match or best_match[1] < threshold:
        return None

    # best_match[0] is the matched cleaned name, best_match[1] its score.
    return property_data.loc[cleaned_names == best_match[0], 'PID'].values[0]


def phonetic_matching(word, property_data, top_n=3):
    """Score every property by phonetic equality with ``word``.

    The primary Double Metaphone code of ``word`` is compared with the primary
    code of each cleaned property name; the score is 100 when the codes are
    equal and 0 otherwise.

    Args:
        word: A single word taken from the message text.
        property_data: DataFrame with 'PROPERTY' and 'PID' columns.
        top_n: Maximum number of entries to return.

    Returns:
        Up to ``top_n`` tuples ``(pid, score, matched_words)`` sorted by score
        in descending order. Entries with score 0 are included, so callers
        should filter on the score.
    """
    matches = []
    phonetic_word = doublemetaphone(word)[0]  # Primary phonetic encoding

    for _, property_row in property_data.iterrows():
        cleaned_name = preprocess_property_name(property_row['PROPERTY'])
        property_pid = property_row['PID']

        # Primary phonetic encoding of the cleaned property name
        property_phonetic = doublemetaphone(cleaned_name)[0]

        # An empty code carries no phonetic information. Without this guard a
        # word with an empty code would "perfectly" match every property whose
        # cleaned name also has an empty code (presumably e.g. rows with a
        # missing name, which clean to '').
        if phonetic_word and phonetic_word == property_phonetic:
            matches.append((property_pid, 100, [word]))  # Perfect match
        else:
            matches.append((property_pid, 0, []))  # No similarity

    # Sort matches based on score and get the top N matches
    matches.sort(key=lambda match: match[1], reverse=True)
    return matches[:top_n]

def enhanced_fuzzy_matching(text, property_data, top_n=3):
    """Find the top N candidate properties mentioned in ``text``.

    A property is a candidate when at least one non-stop-word from the text
    occurs (as a substring) in its cleaned name; candidates are ranked by the
    token-set-ratio between the text and the cleaned name. When there are no
    candidates at all, each word is tried against ``phonetic_matching``
    instead.

    Args:
        text: The message or transcription text. Non-string values yield no
            matches.
        property_data: DataFrame with 'PROPERTY' and 'PID' columns.
        top_n: Maximum number of matches to return.

    Returns:
        Three parallel lists ``(property_ids, scores, matched_words)`` ordered
        by descending score; all three are empty when nothing matched.
    """
    if not isinstance(text, str):
        return [], [], []  # Nothing to match against

    # Tokenize the text and remove stop words
    words = [word for word in re.findall(r'\w+', text) if word.lower() not in stop_words]
    lowered_words = [(word, word.lower()) for word in words]

    matches = []
    for _, property_row in property_data.iterrows():
        cleaned_name = preprocess_property_name(property_row['PROPERTY'])
        cleaned_lower = cleaned_name.lower()

        # Words from the text that appear in the property name
        matched_words = [word for word, lowered in lowered_words if lowered in cleaned_lower]
        if not matched_words:
            continue

        # Only score rows that are actually kept
        fuzzy_score = fuzz.token_set_ratio(text, cleaned_name)
        matches.append((property_row['PID'], fuzzy_score, matched_words))

    # If no matches were found, try phonetic matching
    if not matches:
        # Keyed by PID so that a property hit by several words (or by the same
        # word repeated in the text) is reported once instead of filling the
        # top-N list with duplicates.
        phonetic_hits = {}
        for word in words:
            for property_pid, phonetic_score, hit_words in phonetic_matching(word, property_data, top_n):
                if phonetic_score <= 0:  # Zero-score entries are non-matches
                    continue
                if property_pid in phonetic_hits:
                    best_score, seen_words = phonetic_hits[property_pid]
                    seen_words.extend(w for w in hit_words if w not in seen_words)
                    phonetic_hits[property_pid] = (max(best_score, phonetic_score), seen_words)
                else:
                    phonetic_hits[property_pid] = (phonetic_score, list(hit_words))
        matches = [(pid, score, hit_words) for pid, (score, hit_words) in phonetic_hits.items()]

    # Sort matches based on score (stable for ties) and keep the top N
    matches.sort(key=lambda match: match[1], reverse=True)
    top_matches = matches[:top_n]

    return ([match[0] for match in top_matches],  # Property IDs
            [match[1] for match in top_matches],  # Scores
            [match[2] for match in top_matches])  # Matched Words




def map_text_property_and_alias(logs, property_dict, alias_dict, property_data, text_column):
    """Fill 'Property ID' and 'Alias ID' on ``logs`` in place.

    For each row, the property is looked up by the sender's phone number and,
    failing that, by fuzzy-matching a property name in the message text. The
    alias is looked up by the receiver's phone number.

    Args:
        logs: DataFrame with 'SenderPhoneNumber', 'ReceiverPhoneNumber' and
            ``text_column`` columns. Modified in place.
        property_dict: Mapping of normalized phone number -> property ID.
        alias_dict: Mapping of normalized phone number -> alias ID.
        property_data: Property table passed to ``fuzzy_match_property_name``.
        text_column: Name of the column holding the message text.
    """
    # Make sure both result columns exist even when no row matches; otherwise
    # later code selecting these columns fails with a KeyError.
    for column in ('Property ID', 'Alias ID'):
        if column not in logs.columns:
            logs[column] = None

    for idx, row in logs.iterrows():
        # Check if Sender's phone matches any property
        sender_phone = row['SenderPhoneNumber']
        if sender_phone in property_dict:
            logs.at[idx, 'Property ID'] = property_dict[sender_phone]
        else:
            # Fuzzy match the property name from the text content
            property_id = fuzzy_match_property_name(row[text_column], property_data)
            # Compare with None explicitly: a property ID of 0 is falsy but valid
            if property_id is not None:
                logs.at[idx, 'Property ID'] = property_id

        # Check if Receiver's phone matches any alias
        receiver_phone = row['ReceiverPhoneNumber']
        if receiver_phone in alias_dict:
            logs.at[idx, 'Alias ID'] = alias_dict[receiver_phone]


           
def map_voicemail_property(voicemail_logs, property_dict, alias_dict, property_data, text_column):
    """Fill property and alias information on ``voicemail_logs`` in place.

    For each row, the property is looked up by the sender's phone number. When
    that fails and the row has no property yet, the candidate properties found
    by ``enhanced_fuzzy_matching`` are recorded in 'Top Property Matches',
    'Match Scores' and 'Matched Words' ('Property ID' itself is left unset).
    The alias is looked up by the receiver's phone number.

    Args:
        voicemail_logs: DataFrame with 'SenderPhoneNumber',
            'ReceiverPhoneNumber' and ``text_column`` columns. Modified in
            place.
        property_dict: Mapping of normalized phone number -> property ID.
        alias_dict: Mapping of normalized phone number -> alias ID.
        property_data: Property table passed to ``enhanced_fuzzy_matching``.
        text_column: Name of the column holding the transcription text.
    """
    # New columns to store the results
    voicemail_logs['Top Property Matches'] = None
    voicemail_logs['Match Scores'] = None
    voicemail_logs['Matched Words'] = None

    # Make sure the ID columns exist: 'Property ID' is read below, and both
    # must be present afterwards even when no row matches.
    for column in ('Property ID', 'Alias ID'):
        if column not in voicemail_logs.columns:
            voicemail_logs[column] = None

    for idx, row in voicemail_logs.iterrows():
        # 1. Phone Number Matching
        if row['SenderPhoneNumber'] in property_dict:
            voicemail_logs.at[idx, 'Property ID'] = property_dict[row['SenderPhoneNumber']]

        elif pd.isna(row['Property ID']):  # Continue only if no property is set yet
            # 2. Combined Matching
            top_matches, match_scores, matched_words = enhanced_fuzzy_matching(row[text_column], property_data)
            if top_matches:  # Check if there are any top matches
                voicemail_logs.at[idx, 'Top Property Matches'] = top_matches
                voicemail_logs.at[idx, 'Match Scores'] = match_scores
                voicemail_logs.at[idx, 'Matched Words'] = matched_words

        # 3. Alias Matching via Phone Number
        if row['ReceiverPhoneNumber'] in alias_dict:
            voicemail_logs.at[idx, 'Alias ID'] = alias_dict[row['ReceiverPhoneNumber']]

    print("Logs have been successfully updated with Property ID and Alias ID using fuzzy matching.")




# Initialize the ID columns up front. 'Alias ID' is exported below, so it must
# exist even when no receiver phone number matches an alias; otherwise
# to_csv(columns=...) raises a KeyError.
text_message_logs['Property ID'] = None
text_message_logs['Alias ID'] = None
voicemail_logs['Property ID'] = None
voicemail_logs['Alias ID'] = None

# Apply fuzzy matching to the logs
map_text_property_and_alias(text_message_logs, property_dict, alias_dict, property_data, 'encrypted_aes_text')
map_voicemail_property(voicemail_logs, property_dict, alias_dict, property_data, 'transcription_text')


# Specify the columns to include for updated text message logs
text_message_columns = [
    'date', 'message_id', 'target_id', 'encrypted_aes_text',
    'ReceiverPhoneNumber', 'SenderPhoneNumber', 'Property ID', 'Alias ID'
]

# Specify the columns to include for updated voicemail logs
voicemail_columns = [
    'date', 'call_id', 'target_id', 'recording_url',
    'transcription_text', 'ReceiverPhoneNumber', 'SenderPhoneNumber',
    'Property ID', 'Top Property Matches', 'Match Scores', 'Matched Words', 'Alias ID'
]

# Write to CSV with only the selected columns for text message logs
text_message_logs.to_csv('updated_text_message_logs.csv', columns=text_message_columns, index=False)

# Write to CSV with only the selected columns for voicemail logs
voicemail_logs.to_csv('updated_voicemail_logs.csv', columns=voicemail_columns, index=False)

print("Logs have been successfully updated with selected columns.")


Logs have been successfully updated with Property ID and Alias ID using fuzzy matching.
Logs have been successfully updated with selected columns.


Optional -- Code for Extracting Phone Numbers

In [None]:
# Function to extract phone number from text
# Pattern pieces, in order:
#   (?<!\d) / (?!\d)    - the match may not be part of a longer digit run, so
#                         an 11-digit number is never cut down to its first
#                         10 digits and long IDs are not mistaken for phones
#   (?:\+?1[-.\s]?)?    - optional country code 1
#   \(?\d{3}\)?         - area code, optionally in parentheses
#   [-.\s]? separators  - optional '-', '.' or whitespace between groups
_PHONE_PATTERN = re.compile(
    r'(?<!\d)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)'
)


def extract_phone_number(text):
    """Extract the first phone number found in ``text``.

    Recognizes 10-digit numbers written as xxx-xxx-xxxx, xxx.xxx.xxxx,
    xxx xxx xxxx, (xxx) xxx-xxxx or xxxxxxxxxx, optionally preceded by the
    country code 1.

    Args:
        text: The text to search. Non-string values yield None.

    Returns:
        The number as returned by ``normalize_phone_number``, or None when no
        phone number is found.
    """
    if not isinstance(text, str):  # Ensure the input is a string
        return None
    match = _PHONE_PATTERN.search(text)
    if match is None:
        return None
    return normalize_phone_number(match.group())

# Pull a follow-up phone number (if any) out of each message / transcription
text_message_logs['FollowUp Phone Number'] = text_message_logs['encrypted_aes_text'].apply(extract_phone_number)
voicemail_logs['FollowUp Phone Number'] = voicemail_logs['transcription_text'].apply(extract_phone_number)

# Stack the text message and voicemail logs so both feed the same lookup
combined_logs = pd.concat([text_message_logs, voicemail_logs])

# Map each follow-up phone number to the Property ID of the row it came from.
# Rows missing either value are skipped; when a phone number occurs more than
# once, the Property ID of its last occurrence is kept.
phone_property_dict = {
    followup_phone: property_id
    for followup_phone, property_id in zip(
        combined_logs['FollowUp Phone Number'], combined_logs['Property ID']
    )
    if pd.notna(followup_phone) and pd.notna(property_id)
}


# Function to check if any sequence of 9 digits in a number matches another number
def contains_9_digit_match(phone1, phone2, window=9):
    """Check whether ``phone1`` contains any run of consecutive digits of ``phone2``.

    Args:
        phone1: The phone number string to search in.
        phone2: The phone number string whose substrings are searched for.
        window: Length of the consecutive run that must match (default 9).

    Returns:
        True if any ``window``-character substring of ``phone2`` occurs in
        ``phone1``. False otherwise, including when either value is not a
        string (None, NaN, ...) or ``phone2`` is shorter than ``window``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    # Missing values coming out of pandas may be None or a float NaN; neither
    # can be sliced or searched, so treat every non-string as "no match".
    if not isinstance(phone1, str) or not isinstance(phone2, str):
        return False

    # range() is empty when phone2 is shorter than the window
    return any(
        phone2[start:start + window] in phone1
        for start in range(len(phone2) - window + 1)
    )

# Function to update logs with matched Property IDs based on the 9-digit comparison
def update_empty_property_ids(logs, phone_property_dict):
    """Fill empty 'Property ID' cells in ``logs`` from follow-up phone numbers.

    For every row whose 'Property ID' is missing or '', the row's
    'FollowUp Phone Number' is compared with each phone number in
    ``phone_property_dict`` using ``contains_9_digit_match``; the Property ID
    of the first matching entry (in dictionary order) is assigned.

    Args:
        logs: DataFrame with 'Property ID' and 'FollowUp Phone Number'
            columns. Modified in place.
        phone_property_dict: Mapping of phone number -> Property ID.
    """
    for idx, row in logs.iterrows():
        property_id = row['Property ID']

        # Only rows with an empty Property ID need filling
        if not (pd.isna(property_id) or property_id == ''):
            continue

        # A missing follow-up number can be None or NaN depending on how the
        # column was built; only a non-empty string can be compared.
        followup_phone = row['FollowUp Phone Number']
        if not isinstance(followup_phone, str) or not followup_phone:
            continue

        # Compare the followup phone number with all phone numbers in the dictionary
        for dict_phone, dict_property_id in phone_property_dict.items():
            if dict_phone is not None and contains_9_digit_match(followup_phone, dict_phone):
                # Assign the matched Property ID and stop at the first match
                logs.at[idx, 'Property ID'] = dict_property_id
                break




# Fill in still-empty Property IDs from follow-up phone numbers, first for the
# text message logs and then for the voicemail logs
for _log_frame in (text_message_logs, voicemail_logs):
    update_empty_property_ids(_log_frame, phone_property_dict)