<a href="https://colab.research.google.com/github/jessica2907/Property-Data-Preprocessing/blob/main/new_features_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install spaCy and the small English pipeline (en_core_web_sm).
# The leading "!" hands each line to the shell from the notebook cell.
# NOTE: the download step's own output advises restarting the runtime if the
# freshly installed package's dependencies fail to load.
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy
import pandas as pd
import re

# Pre-trained English pipeline; its named-entity output is used further down
nlp = spacy.load("en_core_web_sm")

# Where the listings are read from and where the enriched copy is written
input_path = '/content/drive/MyDrive/Original CSV/data_cleaned_with_description.csv'
output_path = '/content/drive/MyDrive/Text_Blob/data_cleaned_with_new_features.csv'

# Read the CSV and add the four feature columns, each defaulting to ''
df = pd.read_csv(input_path).assign(
    views='',
    condition='',
    parking_type='',
    accessibility_features='',
)

# Function to extract relevant features using spaCy and regular expressions
def _first_keyword(text, keywords):
    """Return the first entry of *keywords* that occurs in *text*, else ''.

    Matching is case-insensitive and anchored at the start of a word, so
    'ramp' matches "ramp" and "ramps" but not "cramped", and 'accessible'
    does not match "inaccessible". Keywords are escaped, so they are always
    treated as literal text rather than as regex syntax.
    """
    for keyword in keywords:
        if re.search(r'\b' + re.escape(keyword), text, re.IGNORECASE):
            return keyword
    return ''


def extract_features(description):
    """Extract view, condition, parking and accessibility hints from a text.

    Args:
        description: Free-text property description. Anything that is not a
            non-blank string (None, NaN from an empty CSV cell, '') is
            treated as "no description".

    Returns:
        A 4-tuple ``(views, condition, parking_type, accessibility_features)``
        of strings. Each element is '' when nothing was found. ``views``,
        ``parking_type`` and ``accessibility_features`` hold the first keyword
        (in list order) found in the text; ``condition`` holds the text of the
        first entity spaCy labels as CARDINAL.
    """
    # Guard first: spaCy and re.search both raise on non-string input, and a
    # blank string cannot yield any feature anyway.
    if not isinstance(description, str) or not description.strip():
        return '', '', '', ''

    # Common view types (sea, lake, city, mountain, etc.); first hit wins
    views = _first_keyword(
        description,
        ['sea view', 'lake view', 'mountain view', 'city view', 'ocean view', 'garden view'],
    )

    # Process the description text with spaCy
    doc = nlp(description)

    # Cardinal numbers might indicate property age or condition; keep the
    # first one spaCy reports.
    condition = next(
        (ent.text for ent in doc.ents if ent.label_ == 'CARDINAL'),
        '',
    )

    # Parking type extraction (words related to parking)
    parking_type = _first_keyword(
        description,
        ['garage', 'carport', 'parking space', 'covered parking', 'driveway'],
    )

    # Accessibility features (common keywords or phrases)
    accessibility_features = _first_keyword(
        description,
        ['wheelchair', 'accessible', 'ramp', 'elevator', 'no stairs'],
    )

    return views, condition, parking_type, accessibility_features

# Fill the new columns row by row. Only non-blank string descriptions are
# processed: pandas loads empty CSV cells as NaN (a float), which is truthy,
# so a plain truthiness test would pass it on to spaCy, which rejects
# non-string input.
for index, description in df['naturalDescription'].items():
    if not isinstance(description, str) or not description.strip():
        continue  # nothing to extract; the feature columns keep their '' values

    views, condition, parking_type, accessibility_features = extract_features(description)
    df.at[index, 'views'] = views
    df.at[index, 'condition'] = condition
    df.at[index, 'parking_type'] = parking_type
    df.at[index, 'accessibility_features'] = accessibility_features

# Save the updated dataframe to a new CSV file (without the index column)
df.to_csv(output_path, index=False)

print(f"Output saved to {output_path}")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 66.3 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Output saved to /content/drive/MyDrive/Text_Blob/data_cleaned_with_new_features.csv
