In [1]:
import pandas as pd 
# Property IDs: keep only the 'PID' column of the 'PROPERTY INFO' sheet and
# expose it under the name used by the follow-up data ('Property ID').
property_info = (
    pd.read_excel('Property & Alias Info.xlsx', sheet_name='PROPERTY INFO')[['PID']]
    .rename(columns={'PID': 'Property ID'})
)

# Alias records: the '#' column of the 'ALIAS INFO' sheet becomes 'Alias ID'.
alias_info = (
    pd.read_excel('Property & Alias Info.xlsx', sheet_name='ALIAS INFO')
    .rename(columns={'#': 'Alias ID'})
)

# Every distinct Property ID paired with every distinct Alias ID
# (a cross join, i.e. the full Cartesian product of the two ID sets).
property_alias_combinations = pd.merge(
    left=pd.DataFrame({'Property ID': property_info['Property ID'].unique()}),
    right=pd.DataFrame({'Alias ID': alias_info['Alias ID'].unique()}),
    how='cross',
)

# Follow-up records produced by an earlier step of the workflow.
merged_data = pd.read_csv('Merged_Follow_Up_Data.csv')

# The outer join keeps every (Property ID, Alias ID) pair from the grid and
# every follow-up row, whether or not it has a counterpart on the other side.
combined_df = pd.merge(
    property_alias_combinations,
    merged_data,
    on=['Alias ID', 'Property ID'],
    how='outer',
)

combined_df.to_csv('Combined Data.csv', index=False)

Task 5

In [4]:
import re

def contains_keywords(content, keywords):
    """Return True if ``content`` contains any keyword as a whole word or phrase.

    Each keyword is escaped with ``re.escape`` so it is matched as literal
    text, the alternatives are wrapped in ``\\b`` word boundaries, and the
    search is case-insensitive.

    Args:
        content: Text to search. Any value that is not a ``str`` (for example
            ``None`` or a float NaN from a missing cell) is treated as
            containing no keywords.
        keywords: Iterable of literal keyword or phrase strings. Empty strings
            are ignored.

    Returns:
        bool: True when at least one keyword occurs in ``content``.
    """
    if not isinstance(content, str):
        return False

    escaped = [re.escape(keyword) for keyword in keywords if keyword]
    if not escaped:
        # With nothing to alternate over, the pattern would be r'\b(?:)\b',
        # which matches at any word boundary and so reports a hit for every
        # text that contains a word. No keywords means no match.
        return False

    pattern = r'\b(?:' + '|'.join(escaped) + r')\b'
    return re.search(pattern, content, re.IGNORECASE) is not None

# Phrase lists and patterns are constant, so they are built once rather than
# once per classified row.
_TOUR_CONFIRMATION_KEYWORDS = (
    'tour confirmation',
    'confirmed your tour',
    'tour is confirmed',
    'tour is booked',
    'appointment tour',
    'appointment is confirmed',
    'tour reservation',
    'tour has been confirmed',
    'confirmation for your tour',
    'your visit',
)
_BOOKING_LINK_KEYWORDS = (
    'booking',
    'schedule',
    'book',
    'tour',
    'reservation',
    'reserve',
)
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated, which previously merged 'book a tour' and
# 'reserve a tour' into one phrase that could never match.
_REQUEST_TOUR_KEYWORDS = (
    'booking a tour',
    'schedule a tour',
    'book a tour',
    'reserve a tour',
    'request a tour',
    'arrange a tour',
)
_PICTURE_FILETYPES = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
_URL_PATTERN = r'(http[s]?://\S+)'  # Regex to identify URLs
# Output columns, in the order they are written to the row.
_CLASSIFICATION_COLUMNS = (
    'Tour Confirmation',
    'Booking Link',
    'Requests Tour Booking',
    'Contains Pictures',
    'Personalized or Generalized',
)


def _mentions_alias(content, alias_id):
    """Return True if any word of the alias name for ``alias_id`` occurs in ``content``.

    The name is looked up in the module-level ``alias_info`` frame (columns
    'Alias ID' and 'ALIAS NAME'). ``content`` is expected to be lower-cased
    already. A missing alias ID, an unknown alias ID, or a missing alias name
    all yield False.
    """
    if pd.isna(alias_id):
        return False

    alias_names = alias_info.loc[alias_info['Alias ID'] == alias_id, 'ALIAS NAME']
    if alias_names.empty:
        return False

    alias_name = alias_names.iloc[0]
    if not isinstance(alias_name, str):
        # A blank 'ALIAS NAME' cell is read as NaN (a float), which has no .lower().
        return False

    # The connector word "or" is dropped so that it alone cannot make a
    # message count as personalized.
    # NOTE(review): this is a plain substring test, so a short name part can
    # match inside a longer word; confirm whether whole-word matching is wanted.
    name_parts = [word for word in alias_name.lower().split() if word != 'or']
    return any(part in content for part in name_parts)


def classify_follow_up(row):
    """Add the follow-up classification columns to one row and return it.

    Reads 'Summary of content', 'Attachments' and 'Alias ID' from ``row`` and
    writes these columns:

    * 'Tour Confirmation' - 'Yes' if the summary contains a confirmation phrase.
    * 'Booking Link' - 'Yes' if the summary contains both a booking-related
      keyword and an http(s) URL.
    * 'Requests Tour Booking' - 'Yes' if the summary contains a phrase such as
      'book a tour' or 'reserve a tour'.
    * 'Contains Pictures' - 'Yes' if the attachments text contains an image
      file extension.
    * 'Personalized or Generalized' - 'Personalized' if any word of the row's
      alias name appears in the summary, otherwise 'Generalized'.

    When the summary is missing or blank, all five columns are set to None.

    Args:
        row: A mapping-like row (for example the Series that
            ``DataFrame.apply(axis=1)`` passes in). It is modified in place.

    Returns:
        The same ``row`` object with the classification columns set.

    Raises:
        KeyError: If ``row`` lacks one of the three input columns.
    """
    summary = row['Summary of content']
    if pd.isna(summary) or str(summary).strip() == '':
        # Nothing to classify: leave every result column empty.
        for column in _CLASSIFICATION_COLUMNS:
            row[column] = None
        return row

    # str() keeps non-text cells (e.g. a numeric summary or attachment count)
    # from crashing on .lower().
    content = str(summary).lower()
    attachments_value = row['Attachments']
    attachments = '' if pd.isna(attachments_value) else str(attachments_value).lower()

    row['Tour Confirmation'] = (
        'Yes' if contains_keywords(content, _TOUR_CONFIRMATION_KEYWORDS) else 'No'
    )

    # A booking link needs both a booking-related word and a URL in the text.
    has_url = re.search(_URL_PATTERN, content) is not None
    row['Booking Link'] = (
        'Yes' if has_url and contains_keywords(content, _BOOKING_LINK_KEYWORDS) else 'No'
    )

    row['Requests Tour Booking'] = (
        'Yes' if contains_keywords(content, _REQUEST_TOUR_KEYWORDS) else 'No'
    )

    row['Contains Pictures'] = (
        'Yes' if any(ext in attachments for ext in _PICTURE_FILETYPES) else 'No'
    )

    row['Personalized or Generalized'] = (
        'Personalized' if _mentions_alias(content, row['Alias ID']) else 'Generalized'
    )

    return row

# Classify every row of the combined property/alias follow-up table.
classified_follow_ups = combined_df.apply(classify_follow_up, axis=1)

# Give each classified row a sequential identifier starting at 1.
classified_follow_ups['ID'] = range(1, len(classified_follow_ups) + 1)

# Write the classified follow-up data to disk.
classified_follow_ups.to_csv('Classified_Follow_Up_Data_V2.csv', index=False)

print("Classified follow-up data has been generated successfully.")


Classified follow-up data has been generated successfully.
