## Importing libraries

In [3]:
import pandas as pd
import numpy as np

from pandas import NA

import re #

## Loading the dataset

In [4]:
condo = pd.read_csv("[nov 26] dot_condo.csv")

In [5]:
condo.shape

(1250, 51)

## Data cleaning

 - Check for NA values.
     - Perform text analysis to possibly obtain NA values from listing description.
 - Drop irrelevant columns.

In [7]:
# List of source DataFrames
dataframes = [condo]
df_names = ["condominium"]

# Collect NA counts in each DataFrame
na_counts_data = []
for df, name in zip(dataframes, df_names):
    # Replace "na" text with NaN if needed
    df.replace("na", pd.NA, inplace=True)
    
    # Count NA values in each column
    na_counts = df.isna().sum()
    
    # Append results with the source DataFrame name
    for column, count in na_counts.items():
        na_counts_data.append([name, column, count])

# Create the summary DataFrame
na_counts_df = pd.DataFrame(na_counts_data, columns=["Listing Type", "Feature", "NA Count"])

na_counts_df

Unnamed: 0,Listing Type,Feature,NA Count
0,condominium,title,0
1,condominium,price,0
2,condominium,price_per_sqm,11
3,condominium,location,1250
4,condominium,description,0
5,condominium,bedrooms,218
6,condominium,bathrooms,9
7,condominium,floor_area,9
8,condominium,floor_number,388
9,condominium,property_type,0


In [8]:
# Remove duplicate rows and irrelevant columns
condo = condo.replace({pd.NA: NA, np.nan: NA, 'NaN': NA})
condo = condo.drop_duplicates()
condo = condo.drop(columns = ["location", "agency_name"])

# First, replace NA values in bedrooms with 0
condo['bedrooms'] = condo['bedrooms'].fillna(0)

# Drop rows where longitude OR latitude is NA
condo = condo.dropna(subset=['longitude', 'latitude'])

In [9]:
condo.shape

(1242, 49)

In [10]:
def extract_floor_info(description):
    # Initialize variables to store extracted values
    floor_number = None
    floor_area = None
    bathrooms = None
    
    # Convert to lowercase for easier matching
    desc = str(description).lower()
    
    # Extract floor number
    floor_patterns = [
        r'(\d+)(?:st|nd|rd|th)\s+floor',
        r'(?:on\s+)?(?:the\s+)?(\d+)(?:st|nd|rd|th)?(?:\s+)?(?:floor|flr)',
        r'floor\s*-\s*(\d+)',
        r'level\s*-?\s*(\d+)'
    ]
    
    for pattern in floor_patterns:
        match = re.search(pattern, desc)
        if match:
            floor_number = int(match.group(1))
            break
    
    # Extract floor area
    area_patterns = [
        r'(\d+(?:\.\d+)?)\s*(?:sqm|sq\.?\s*m|square\s*meters?)',
        r'floor\s*area\s*:?\s*(\d+(?:\.\d+)?)',
        r'area\s*:?\s*(\d+(?:\.\d+)?)\s*sqm',
        r'(\d+(?:\.\d+)?)\s*m2'
    ]
    
    for pattern in area_patterns:
        match = re.search(pattern, desc)
        if match:
            floor_area = float(match.group(1))
            break
    
    # Extract number of bathrooms
    bathroom_patterns = [
        r'(\d+)\s*(?:bathroom|bath|t&b|toilet|tb|t & b)',
        r'(\d+)\s*(?:cr|comfort room)',
        r'bath(?:room)?s?\s*:?\s*(\d+)'
    ]
    
    for pattern in bathroom_patterns:
        match = re.search(pattern, desc)
        if match:
            bathrooms = float(match.group(1))
            break
            
    return pd.Series({
        'floor_number': floor_number,
        'floor_area': floor_area,
        'bathrooms': bathrooms
    })

def fill_missing_values(df):
    # Create mask for rows with missing values
    floor_number_mask = df['floor_number'].isna()
    floor_area_mask = df['floor_area'].isna()
    bathrooms_mask = df['bathrooms'].isna()
    
    # Extract information from descriptions where values are missing
    extracted_info = df[floor_number_mask | floor_area_mask | bathrooms_mask]['description'].apply(extract_floor_info)
    
    # Update only the missing values with extracted information
    df.loc[floor_number_mask, 'floor_number'] = extracted_info[floor_number_mask]['floor_number']
    df.loc[floor_area_mask, 'floor_area'] = extracted_info[floor_area_mask]['floor_area']
    df.loc[bathrooms_mask, 'bathrooms'] = extracted_info[bathrooms_mask]['bathrooms']
    
    return df

In [11]:
condo = fill_missing_values(condo)

  df.loc[floor_number_mask, 'floor_number'] = extracted_info[floor_number_mask]['floor_number']
  df.loc[floor_area_mask, 'floor_area'] = extracted_info[floor_area_mask]['floor_area']
  df.loc[bathrooms_mask, 'bathrooms'] = extracted_info[bathrooms_mask]['bathrooms']


In [12]:
# List of source DataFrames
dataframes = [condo]
df_names = ["condominium"]

# Collect NA counts in each DataFrame
na_counts_data = []
for df, name in zip(dataframes, df_names):
    # Replace "na" text with NaN if needed
    df.replace("na", pd.NA, inplace=True)
    
    # Count NA values in each column
    na_counts = df.isna().sum()
    
    # Append results with the source DataFrame name
    for column, count in na_counts.items():
        na_counts_data.append([name, column, count])

# Create the summary DataFrame
na_counts_df = pd.DataFrame(na_counts_data, columns=["Listing Type", "Feature", "NA Count"])

na_counts_df

Unnamed: 0,Listing Type,Feature,NA Count
0,condominium,title,0
1,condominium,price,0
2,condominium,price_per_sqm,11
3,condominium,description,0
4,condominium,bedrooms,0
5,condominium,bathrooms,9
6,condominium,floor_area,8
7,condominium,floor_number,361
8,condominium,property_type,0
9,condominium,listing_url,0
