In [1]:
import numpy as np
import pandas as pd

In [2]:
data_file = "data/project-dataset-final.xlsx"
df_listings = pd.read_excel(data_file, index_col=None)
df_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web-scraper-order      2858 non-null   object 
 1   web-scraper-start-url  2858 non-null   object 
 2   pagination             2751 non-null   object 
 3   listing-title          2858 non-null   object 
 4   listing-description    2858 non-null   object 
 5   listing-housing-type   2858 non-null   object 
 6   listing-features       2858 non-null   object 
 7   listing-notices        2858 non-null   object 
 8   listing-id             2858 non-null   object 
 9   listing-link           2858 non-null   object 
 10  listing-link-href      2858 non-null   object 
 11  listing-availability   1534 non-null   object 
 12  listing-posting-date   2858 non-null   object 
 13  listing-address        2098 non-null   object 
 14  listing-map-latitude   2857 non-null   float64
 15  list

In [3]:
############################################### Creating a Data Quality Report ###############################################
## Get the data types of each column of the dataset
data_types = pd.DataFrame(
    df_listings.dtypes,
    columns=['Data Type']
)
data_types

Unnamed: 0,Data Type
web-scraper-order,object
web-scraper-start-url,object
pagination,object
listing-title,object
listing-description,object
listing-housing-type,object
listing-features,object
listing-notices,object
listing-id,object
listing-link,object


In [4]:
## Get the total values in the each column
total_values = pd.DataFrame(
    df_listings.count(),
    columns=['Total Values']
)
total_values

Unnamed: 0,Total Values
web-scraper-order,2858
web-scraper-start-url,2858
pagination,2751
listing-title,2858
listing-description,2858
listing-housing-type,2858
listing-features,2858
listing-notices,2858
listing-id,2858
listing-link,2858


In [5]:
## Check total missing values in each column of the dataset
missing_data = pd.DataFrame(
    df_listings.isnull().sum(),
    columns=['Missing Values']
)
missing_data

Unnamed: 0,Missing Values
web-scraper-order,0
web-scraper-start-url,0
pagination,107
listing-title,0
listing-description,0
listing-housing-type,0
listing-features,0
listing-notices,0
listing-id,0
listing-link,0


In [6]:
## Check total unique values in each column of the dataset
unique_values = pd.DataFrame(
    df_listings.nunique(),
    columns=['Unique Values']
)
unique_values

Unnamed: 0,Unique Values
web-scraper-order,2858
web-scraper-start-url,1
pagination,24
listing-title,2404
listing-description,2324
listing-housing-type,35
listing-features,643
listing-notices,2
listing-id,2858
listing-link,1680


In [7]:
dq_report = data_types.join(total_values).join(missing_data).join(unique_values)
dq_report

Unnamed: 0,Data Type,Total Values,Missing Values,Unique Values
web-scraper-order,object,2858,0,2858
web-scraper-start-url,object,2858,0,1
pagination,object,2751,107,24
listing-title,object,2858,0,2404
listing-description,object,2858,0,2324
listing-housing-type,object,2858,0,35
listing-features,object,2858,0,643
listing-notices,object,2858,0,2
listing-id,object,2858,0,2858
listing-link,object,2858,0,1680


In [8]:
# Get the listing-description fields from the data and Clean it
# Removes the Prefix - "QR Code Link to This Post"
# Removes line breaks and Extra Tabs and spaces and replaces them with semicolon ";"

import re

# Creates regex patterns for newline, tabs and semicolon
regex_newline = re.compile(r'\n', flags=re.IGNORECASE)
regex_tab = re.compile(r'(\s{2,})', flags=re.IGNORECASE)
regex_semicolon = re.compile(r';+', flags=re.IGNORECASE)

# Apply the regex(s) to the "listing-description" field and generate new field "description"
series_description = df_listings['listing-description'].str.replace(regex_newline, ';', regex=True)

series_description = series_description.str.replace(regex_tab, '', regex=True)
series_description = series_description.str.replace(regex_semicolon, ';', regex=True)
regex_qr_code = re.compile(r'QR Code Link to This Post;', flags=re.IGNORECASE)
series_description = series_description.str.replace(regex_qr_code, '', regex=True)
series_description = series_description.str.strip()
series_description = series_description.dropna(how='all', axis=0)
df_description = pd.DataFrame()
df_description['description'] = series_description
# df_description.to_excel("data/description.xlsx", index=False)
df_description.info()
df_description

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2858 entries, 0 to 2857
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  2858 non-null   object
dtypes: object(1)
memory usage: 44.7+ KB


Unnamed: 0,description
0,Apartment home available July 1st on Chicago’s...
1,Call Apartment Guys at 773-549-3474 and ask fo...
2,Contact me to learn more about an innovative L...
3,"PROPERTY INFO;ID: 232839975Rent: $3,500 / Mont..."
4,"4535 N Paulina Unit #3D Chicago, IL 60640;Brig..."
...,...
2853,Interested in this property?;Click on: Reply t...
2854,"We are conveniently located near Edens, Metra ..."
2855,Interested in this property?;Click on: Reply t...
2856,"PROPERTY INFO;ID: 237708192Rent: $3,147 / Mont..."


In [9]:
# Get the listing-title fields from the data and Clean it
# Removes Rent Amount, BedRoom, Footage, etc. from beginning and creates individual fields
# listing-rent, listing-room-type and listing-footage
# Note that when it says "0BR" on listing-housing-type, the title will not have the count of bedrooms

df_title = pd.DataFrame(df_listings['listing-title'])
# listing_title.info()
# listing_title


# Creates regex patterns for Rent amount and create dataframe listing-rent
regex_rent = re.compile(r'(\$\d+\,*\d+\s)', flags=re.IGNORECASE)

# Apply the regex(s) to the "listing-title" field and generate new field "description"
df_rent = df_title['listing-title'].str.extract(regex_rent, expand=True)
df_rent.columns = ['rent']
df_rent.info()
df_rent['rent'] = df_rent['rent'].str.replace(r'\$|\,', '', regex=True)
df_rent

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rent    2857 non-null   object
dtypes: object(1)
memory usage: 22.5+ KB


Unnamed: 0,rent
0,1795
1,1650
2,2860
3,3500
4,1225
...,...
2853,3685
2854,1400
2855,2250
2856,3147


In [10]:
# Creates regex patterns for Room Type and create dataframe listing-room-type
# It checks for either 1 or more Bed Room / Studio type
regex_room_type = re.compile(r'(\d*br\s|studio|\d+\s*bed)', flags=re.IGNORECASE)

# Apply the regex(s) to the "listing-title" field and generate new field "description"
df_room_type = df_listings['listing-title'].str.extract(regex_room_type, expand=True)
df_room_type.columns = ['type']
df_room_type.info()
df_room_type['type'] = df_room_type['type'].str.replace('br', ' Bed Room')
df_room_type['type'] = df_room_type['type'].str.replace(r'(?i)studio', 'STUDIO')
df_room_type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    2768 non-null   object
dtypes: object(1)
memory usage: 22.5+ KB


  df_room_type['type'] = df_room_type['type'].str.replace(r'(?i)studio', 'STUDIO')


Unnamed: 0,type
0,1 Bed Room
1,1 Bed Room
2,4 Bed Room
3,2 Bed Room
4,1 Bed Room
...,...
2853,2 Bed Room
2854,1 Bed Room
2855,1 Bed Room
2856,3 Bed Room


In [11]:
# Creates regex patterns for footage and create dataframe listing-footage
regex_footage = re.compile(r'(\d+\s*ft)', flags=re.IGNORECASE)

# Apply the regex(s) to the "listing-title" field and generate new field "description"
df_footage = df_listings['listing-title'].str.extract(regex_footage, expand=True)
df_footage.columns = ['footage']
df_footage.info()
df_footage
# df = df_footage.join(df_rent).join(df_room_type)
# df
# df_footage.isna().sum()
# df_footage.count()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   footage  1034 non-null   object
dtypes: object(1)
memory usage: 22.5+ KB


Unnamed: 0,footage
0,
1,
2,2213ft
3,
4,
...,...
2853,
2854,
2855,
2856,


In [12]:
# df_rent.info()
df = df_rent.join(df_room_type).join(df_footage).join(df_description)
df

Unnamed: 0,rent,type,footage,description
0,1795,1 Bed Room,,Apartment home available July 1st on Chicago’s...
1,1650,1 Bed Room,,Call Apartment Guys at 773-549-3474 and ask fo...
2,2860,4 Bed Room,2213ft,Contact me to learn more about an innovative L...
3,3500,2 Bed Room,,"PROPERTY INFO;ID: 232839975Rent: $3,500 / Mont..."
4,1225,1 Bed Room,,"4535 N Paulina Unit #3D Chicago, IL 60640;Brig..."
...,...,...,...,...
2853,3685,2 Bed Room,,Interested in this property?;Click on: Reply t...
2854,1400,1 Bed Room,,"We are conveniently located near Edens, Metra ..."
2855,2250,1 Bed Room,,Interested in this property?;Click on: Reply t...
2856,3147,3 Bed Room,,"PROPERTY INFO;ID: 237708192Rent: $3,147 / Mont..."


In [13]:
# Get the listing-features fields from the data and Clean it
# Separate the various features of the listing into individual columns

df_features = pd.DataFrame(df_listings['listing-features'])
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 1 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   listing-features  2858 non-null   object
dtypes: object(1)
memory usage: 22.5+ KB


In [103]:
# Creates REGEX patterns for various features

# LISTING TYPE
regex_listing_type = re.compile(r'(house|apartment|duplex|flat|condo|townhouse|loft)|(?:open house)', flags=re.IGNORECASE)

# CATs Allowed
regex_cats_allowed = re.compile(r'(cats are OK)', flags=re.IGNORECASE)

# Dogs Allowed
regex_dogs_allowed = re.compile(r'(dogs are OK)', flags=re.IGNORECASE)

# LAUNDRY TYPE
regex_laundry = re.compile(r'(w/d in unit|w/d hookups|laundry in bldg|no laundry on site|laundry on site)', flags=re.IGNORECASE)

# GARAGE
regex_parking = re.compile(r'(attached garage|detached garage|no parking|off-street parking|street parking|carport|valet parking)', flags=re.IGNORECASE)

# RENT PERIOD
regex_rent_period = re.compile(r'(monthly|weekly)', flags=re.IGNORECASE)

# SMOKING
regex_smoking = re.compile(r'(\w+.smoking)', flags=re.IGNORECASE)

# APPLICATION FEES
regex_app_fees = re.compile(r'(\$\d+)', flags=re.IGNORECASE)


df_listing_type = df_features['listing-features'].str.extract(regex_listing_type)
df_listing_type.columns = ['Listing Type']

df_cats = pd.DataFrame(df_features['listing-features'].str.contains('cats are OK', regex=True, flags=re.IGNORECASE, ))
df_cats.columns=['Cats Allowed']

df_dogs = pd.DataFrame(df_features['listing-features'].str.contains('dogs are OK', regex=True, flags=re.IGNORECASE, ))
df_dogs.columns=['Dogs Allowed']


df_laundry = df_features['listing-features'].str.extract(regex_laundry, expand=True)
df_laundry.columns = ['Laundry Type']

df_parking = df_features['listing-features'].str.extract(regex_parking, expand=True)
df_parking.columns = ['Parking Type']

df_rent_period = df_features['listing-features'].str.extract(regex_rent_period, expand=True)
df_rent_period.columns = ['Rent Period']

df_smoking = pd.DataFrame(df_features['listing-features'].str.contains(regex_smoking, regex=True))
df_smoking.columns = ['No Smoking']

df_app_fees = df_features['listing-features'].str.extract(regex_app_fees, expand=True)
df_app_fees.columns = ['Application Fees']

df_app_fees

  df_smoking = pd.DataFrame(df_features['listing-features'].str.contains(regex_smoking, regex=True))


Unnamed: 0,Application Fees
0,
1,
2,$90
3,
4,
...,...
2853,
2854,
2855,
2856,


In [106]:
# np.where(df_app_fees[0].isna() == True)[0]

In [107]:
# Get the listing-housing-type fields from the data and Clean it
# Separate Number of BedRooms and Bathrooms of the listing into individual columns

df_housing_type = pd.DataFrame(df_listings['listing-housing-type'])
df_housing_type

Unnamed: 0,listing-housing-type
0,1BR / 1Ba
1,1BR / 1Ba
2,4BR / 2.5Ba
3,2BR / 2Ba
4,1BR / 1Ba
...,...
2853,2BR / 2Ba
2854,1BR / 1Ba
2855,1BR / 1Ba
2856,3BR / 2Ba


In [108]:
# BED ROOMS
regex_bedrooms = re.compile(r'(\d+(?=br))', flags=re.IGNORECASE)

# BATH
regex_baths = re.compile(r'(\d*\.*\d*(?=ba))', flags=re.IGNORECASE)

df_bedrooms = df_housing_type['listing-housing-type'].str.extract(regex_bedrooms, expand=True)
df_baths = df_housing_type['listing-housing-type'].str.extract(regex_baths, expand=True)


df_bedrooms.columns = ['Total Bed Rooms']
df_baths.columns = ['Total Baths']
df_housing_type = df_bedrooms.join(df_baths)
df_housing_type

Unnamed: 0,Total Bed Rooms,Total Baths
0,1,1
1,1,1
2,4,2.5
3,2,2
4,1,1
...,...,...
2853,2,2
2854,1,1
2855,1,1
2856,3,2
