In [3]:
import re
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Load the raw property listings; the path is relative to the notebook's working directory.
data = pd.read_csv('properties.csv')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,title,location,spec,price,description,features
0,Newly Built 4 Bedroom Terraced Duplex,Ikate Lekki Lagos,4 Beds | 4 Baths | 5 Toilets,"20,000,000/year",Newly built 4 bedroom terrace duplex \n ...,"['Street Lights', 'Big Compound', '24 Hours Se..."
1,3 Bedroom Apartment,Oniru Victoria Island Lagos,3 Beds | 3 Baths | 4 Toilets,"20,000,000/year",CW07369 \n This spacious 3-bedroom apar...,
2,2 Bedroom Apartment,Lekki Phase 1 Lekki Lagos,2 Beds | 2 Baths | 3 Toilets,"13,000,000/year",CW07377 This 2-bedroom apartment delivers a si...,
3,3 Bedroom Apartment,Ikoyi Lagos,3 Beds | 3 Baths | 4 Toilets,"20,000,000/year",CW07192 \n \n 3-BEDROOM APARTME...,
4,3 Bedroom Apartment With Bq,Old Ikoyi Ikoyi Lagos,3 Beds | 3 Baths | 4 Toilets,"30,000,000/year",CW07241 \n \n 3 BEDROOM APARTME...,


### Data Cleaning ###

In [6]:
# Work on a copy so the frame held in `data` is not modified by the cleaning steps.
df = data.copy()

# Check for missing values
def check_missing(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        'Missing Count' and 'Missing Percentage' (rounded to 2 decimals).
        Columns without nulls are left out.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': null_counts / len(df) * 100,
    })
    # Keep only the columns that actually contain missing data.
    has_missing = summary['Missing Count'] > 0
    return summary.loc[has_missing].round(2)

# Display the per-column missing-value summary (only columns that have nulls are listed).
check_missing(df)

Unnamed: 0,Missing Count,Missing Percentage
spec,3131,22.04
features,9711,68.36


In [12]:
def clean_price(price):
    """Convert a raw price such as ``'20,000,000/year'`` to a float.

    The first numeric token in the text is parsed: commas used as thousands
    separators are removed and a decimal part, if present, is preserved.
    Anything after that token (``/year``, ``/sqm``, ...) is ignored, so the
    billing period is NOT encoded in the result.

    Parameters
    ----------
    price : Any
        Raw price value; it is converted to ``str`` before parsing.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the value is null or contains
        no digits.
    """
    if pd.isnull(price):
        return np.nan

    # Stripping every non-digit would also strip the decimal point
    # ('1,500.50' -> 150050.0), so match one well-formed number instead.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', str(price))
    if match is None:
        return np.nan

    return float(match.group(0).replace(',', ''))

def clean_spec(spec):
    """Parse a spec string like ``'4 Beds | 4 Baths | 5 Toilets'``.

    The string is split on ``'|'``; for each segment the first run of digits
    is taken as the count and assigned to the first matching keyword
    ('bed', then 'bath', then 'toilet').

    Parameters
    ----------
    spec : Any
        Raw spec value; it is converted to ``str`` before parsing.

    Returns
    -------
    dict
        Keys 'bedrooms', 'bathrooms' and 'toilets'; each value is an ``int``
        or ``np.nan`` when that count was not found (or ``spec`` is null).
    """
    counts = dict.fromkeys(('bedrooms', 'bathrooms', 'toilets'), np.nan)
    if pd.isnull(spec):
        return counts

    # Checked in order; the first keyword found in a segment wins.
    keyword_to_field = (
        ('bed', 'bedrooms'),
        ('bath', 'bathrooms'),
        ('toilet', 'toilets'),
    )

    for segment in str(spec).split('|'):
        text = segment.strip().lower()
        found = re.search(r'(\d+)', text)
        if found is None:
            continue
        for keyword, field in keyword_to_field:
            if keyword in text:
                counts[field] = int(found.group(1))
                break

    return counts

def clean_location(location):
    """Return the location without its last word, in title case.

    For example ``'Ikate Lekki Lagos'`` becomes ``'Ikate Lekki'``. A value
    with fewer than two words is returned unchanged apart from stripping and
    title-casing.

    Parameters
    ----------
    location : Any
        Raw location value; it is converted to ``str`` before processing.

    Returns
    -------
    str or float
        The title-cased area, or ``np.nan`` when ``location`` is null.
    """
    if pd.isnull(location):
        return np.nan

    stripped = str(location).strip()
    words = stripped.split()
    # Only drop the trailing word when something is left over afterwards.
    kept = stripped if len(words) < 2 else ' '.join(words[:-1])
    return kept.title()

def extract_from_description(description):
    """Derive 0/1 amenity flags from a free-text listing description.

    Matching is case-insensitive substring search per flag. The furnished
    flag is the exception: negated forms ("unfurnished", "un-furnished",
    "non furnished", "not furnished") are removed before the search, because
    a plain substring test would flag an unfurnished listing as furnished.

    Parameters
    ----------
    description : Any
        Raw description; it is converted to ``str`` before matching.

    Returns
    -------
    dict
        Keys 'has_pool', 'has_gym', 'has_parking', 'has_bq', 'has_elevator',
        'is_newly_built', 'is_furnished', 'is_serviced'; values are 0 or 1.
        All flags are 0 when ``description`` is null.
    """
    features = {
        'has_pool': 0,
        'has_gym': 0,
        'has_parking': 0,
        'has_bq': 0,
        'has_elevator': 0,
        'is_newly_built': 0,
        'is_furnished': 0,
        'is_serviced': 0
    }

    if pd.isnull(description):
        return features

    desc = str(description).lower()

    # A description offering both furnished and unfurnished units still
    # counts as furnished, since only the negated occurrences are removed.
    desc_without_negated = re.sub(r'\b(?:un|non|not)[\s-]*furnished', ' ', desc)

    keyword_map = {
        'has_pool': ('pool', 'swimming'),
        'has_gym': ('gym', 'fitness'),
        'has_parking': ('parking', 'garage'),
        'has_bq': ('bq', 'boys quarter'),
        'has_elevator': ('elevator', 'lift'),
        'is_newly_built': ('newly', 'brand new'),
        'is_furnished': ('furnished',),
        'is_serviced': ('serviced', 'service charge'),
    }

    for flag, keywords in keyword_map.items():
        haystack = desc_without_negated if flag == 'is_furnished' else desc
        if any(keyword in haystack for keyword in keywords):
            features[flag] = 1

    return features

def extract_from_features(features):
    """Derive 0/1 amenity flags from the listing's features value.

    The value is stringified and lower-cased, then searched per flag. Most
    flags use plain substring matching. Air conditioning is matched as
    "air condition..." (with optional space or hyphen) or as the standalone
    token "ac" / "a/c"; a bare substring test for "ac" would also fire on
    words such as "spacious", "access" or "terrace".

    Parameters
    ----------
    features : Any
        Raw features value (e.g. the string form of a list of feature
        names); it is converted to ``str`` before matching.

    Returns
    -------
    dict
        Keys 'has_pool', 'has_gym', 'has_parking', 'has_security',
        'has_generator', 'has_ac', 'has_wifi'; values are 0 or 1. All flags
        are 0 when ``features`` is null.
    """
    feature_dict = {
        'has_pool': 0,
        'has_gym': 0,
        'has_parking': 0,
        'has_security': 0,
        'has_generator': 0,
        'has_ac': 0,
        'has_wifi': 0
    }

    if pd.isnull(features):
        return feature_dict

    features_str = str(features).lower()

    substring_keywords = {
        'has_pool': ('pool', 'swimming'),
        'has_gym': ('gym', 'fitness'),
        'has_parking': ('parking', 'garage'),
        'has_security': ('security',),
        'has_generator': ('generator', 'power'),
        'has_wifi': ('wifi', 'internet'),
    }
    for flag, keywords in substring_keywords.items():
        if any(keyword in features_str for keyword in keywords):
            feature_dict[flag] = 1

    # Word boundaries keep "ac" from matching inside longer words.
    if re.search(r'air[\s-]?condition|\ba/?c\b', features_str):
        feature_dict['has_ac'] = 1

    return feature_dict

def get_property_type(title):
    """Classify a listing title into a coarse property type.

    Checks run in a fixed order and the first match wins: Duplex, Terrace,
    Semi-Detached, Detached, Maisonette, Townhouse, Apartment (also "flat"),
    Villa, Office, Commercial. So "Terraced Duplex" is a 'Duplex'.

    "Semi-Detached" requires "semi" to be directly followed (allowing spaces
    or hyphens) by "detach". A bare "semi" is not enough, otherwise
    "Semi Furnished 2 Bedroom Apartment" would be labelled Semi-Detached.

    Parameters
    ----------
    title : Any
        Listing title; it is converted to ``str`` and lower-cased.

    Returns
    -------
    str
        One of the labels above, or 'Other' when nothing matches or the
        title is null.
    """
    if pd.isnull(title):
        return 'Other'

    text = str(title).lower()

    if 'duplex' in text:
        return 'Duplex'
    if 'terrace' in text:
        return 'Terrace'
    # Must be tested before the plain 'detached' check below.
    if re.search(r'semi[\s-]*detach', text):
        return 'Semi-Detached'
    if 'detached' in text:
        return 'Detached'

    remaining_types = (
        (('maisonette',), 'Maisonette'),
        (('townhouse',), 'Townhouse'),
        (('apartment', 'flat'), 'Apartment'),
        (('villa',), 'Villa'),
        (('office',), 'Office'),
        (('commercial',), 'Commercial'),
    )
    for keywords, label in remaining_types:
        if any(keyword in text for keyword in keywords):
            return label

    return 'Other'


In [13]:
# Apply the cleaning functions to the dataframe.
# Every derived column is assigned by name (no pd.concat), so re-running this
# cell overwrites the columns instead of appending duplicates.

df['price_clean'] = df['price'].apply(clean_price)

# Build the frame from the list of dicts in one go; expanding each row with
# `.apply(pd.Series)` gives the same columns but is typically much slower.
spec_df = pd.DataFrame(df['spec'].apply(clean_spec).tolist(), index=df.index)
df['bedrooms'] = spec_df['bedrooms']
df['bathrooms'] = spec_df['bathrooms']
df['toilets'] = spec_df['toilets']

df['area'] = df['location'].apply(clean_location)

# Flags derived from the free-text description.
desc_features = pd.DataFrame(
    df['description'].apply(extract_from_description).tolist(), index=df.index
)
for col in desc_features.columns:
    df[col] = desc_features[col]

# Flags derived from the features value. A flag present in both sources is
# set when either source reports it; flags unique to this source are added.
features_extracted = pd.DataFrame(
    df['features'].apply(extract_from_features).tolist(), index=df.index
)
for col in features_extracted.columns:
    if col in desc_features.columns:
        # Merge from the freshly computed frames (not from df) so a re-run
        # yields the same result.
        df[col] = desc_features[col] | features_extracted[col]
    else:
        df[col] = features_extracted[col]

df['property_type'] = df['title'].apply(get_property_type)

df.head(20)

Unnamed: 0,title,location,spec,price,description,features,cleaned_price,price_clean,bedrooms,bathrooms,...,has_bq,has_elevator,is_newly_built,is_furnished,is_serviced,has_security,has_generator,has_ac,has_wifi,property_type
0,Newly Built 4 Bedroom Terraced Duplex,Ikate Lekki Lagos,4 Beds | 4 Baths | 5 Toilets,"20,000,000/year",Newly built 4 bedroom terrace duplex \n ...,"['Street Lights', 'Big Compound', '24 Hours Se...",20000000.0,20000000.0,4.0,4.0,...,0,0,1,0,0,1,0,1,1,Duplex
1,3 Bedroom Apartment,Oniru Victoria Island Lagos,3 Beds | 3 Baths | 4 Toilets,"20,000,000/year",CW07369 \n This spacious 3-bedroom apar...,,20000000.0,20000000.0,3.0,3.0,...,1,0,0,0,0,0,0,0,0,Apartment
2,2 Bedroom Apartment,Lekki Phase 1 Lekki Lagos,2 Beds | 2 Baths | 3 Toilets,"13,000,000/year",CW07377 This 2-bedroom apartment delivers a si...,,13000000.0,13000000.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,Apartment
3,3 Bedroom Apartment,Ikoyi Lagos,3 Beds | 3 Baths | 4 Toilets,"20,000,000/year",CW07192 \n \n 3-BEDROOM APARTME...,,20000000.0,20000000.0,3.0,3.0,...,0,0,1,0,1,0,0,0,0,Apartment
4,3 Bedroom Apartment With Bq,Old Ikoyi Ikoyi Lagos,3 Beds | 3 Baths | 4 Toilets,"30,000,000/year",CW07241 \n \n 3 BEDROOM APARTME...,,30000000.0,30000000.0,3.0,3.0,...,1,1,0,0,1,0,0,0,0,Apartment
5,Furnished 2 Bedroom Apartment,Oniru Victoria Island Lagos,2 Beds | 2 Baths | 3 Toilets,"30,000,000/year",CW07199 \n \n 2 BED FURNISHED A...,,30000000.0,30000000.0,2.0,2.0,...,0,0,0,1,1,0,0,0,0,Apartment
6,3 Bedroom Maisonette,Banana Island Ikoyi Lagos,3 Beds | 3 Baths | 4 Toilets,"35,000,000/year",CW07233 \n \n 3 BED MAISONETTE ...,,35000000.0,35000000.0,3.0,3.0,...,1,0,0,0,1,0,0,0,0,Maisonette
7,Multiple Office,Old Ikoyi Ikoyi Lagos,,"220,000/sqm",CW07201 \n \n Multiple Office| ...,,220000.0,220000.0,,,...,0,0,0,0,0,0,0,0,0,Office
8,Commercial Property,Victoria Island Lagos,,"100,000,000/year",CW07283 \n \n COMMERCIAL PROPER...,,100000000.0,100000000.0,,,...,0,0,0,0,0,0,0,0,0,Commercial
9,Commercial Property,Victoria Island Lagos,,"80,000,000/year",CW07368 \n \n Commercial Proper...,,80000000.0,80000000.0,,,...,0,0,1,0,0,0,0,0,0,Commercial


In [15]:
# Drop the raw / intermediate columns and save the cleaned data.
columns_to_drop = ['spec', 'location', 'description', 'features', 'price', 'cleaned_price']
# 'cleaned_price' is not created by any cell visible in this notebook section,
# so a strict drop raises KeyError on a fresh Restart & Run All.
# errors='ignore' skips names that are absent, and reassigning (instead of
# inplace=True) lets this cell be re-run without failing.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.to_csv('properties_cleaned.csv', index=False)