# Data Cleaning

In [1]:
import pandas as pd

In [2]:
# Load the listings CSV that this notebook cleans.
raw_listings_path = '../Data/Locations_Data_Added.csv'
df = pd.read_csv(raw_listings_path)

#### Let's start by checking for listings without titles.

In [3]:
# Rows whose title is the 'Title Not Found' placeholder.
has_placeholder_title = df['Title'] == 'Title Not Found'
no_title_df = df[has_placeholder_title]

In [4]:
# Inspect the rows whose title is the 'Title Not Found' placeholder.
no_title_df

Unnamed: 0,Title,Price,Bedrooms,Square Feet,Full Address,monthly,apartment,cats are OK - purrr,dogs are OK - wooof,laundry on site,...,house,w/d hookups,latitude,longitude,nearest_budget_grocery_store_distance,nearest_budget_grocery_store,nearest_midTier_grocery_store_distance,nearest_midTier_grocery_store,nearest_premium_grocery_store_distance,nearest_premium_grocery_store
0,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
413,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
547,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
552,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
567,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2079,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
2170,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
2210,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
2235,Title Not Found,Price Not Found,Bedrooms Info Not Found,Square Feet Not Found,None listed,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,


#### Occasionally there are listings without titles, perhaps uploaded in error. 

If that is the case, we will remove them here.

In [5]:
# Remove every listing whose title is the 'Title Not Found' placeholder.
is_placeholder_title = df['Title'] == 'Title Not Found'
placeholder_labels = df.loc[is_placeholder_title].index
df = df.drop(index=placeholder_labels)

In [6]:
# Preview the first rows of the frame after the placeholder-title rows were dropped.
df.head()

Unnamed: 0,Title,Price,Bedrooms,Square Feet,Full Address,monthly,apartment,cats are OK - purrr,dogs are OK - wooof,laundry on site,...,house,w/d hookups,latitude,longitude,nearest_budget_grocery_store_distance,nearest_budget_grocery_store,nearest_midTier_grocery_store_distance,nearest_midTier_grocery_store,nearest_premium_grocery_store_distance,nearest_premium_grocery_store
1,1 Bedroom in Marina Del Rey -Quartz Counters -...,"$3,295",1br,750,"415 Washington Boulevard, Venice, CA 90292",1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,33.981495,-118.463117,1.212094,"Costco Bakery - 13463 Washington Blvd, Marina ...",1.155451,"Ralphs Fresh Fare - 4311 Lincoln Blvd, Marina ...",0.569974,"Erewhon - 585 Venice Blvd., Venice"
2,1 Bedroom 1 BA in West L.A. | Hardwood Style F...,"$2,250",1br,700,None listed,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,,,,,,,,
3,"Lease TODAY, Save BIG! One Month FREE Rent Offer!","$2,700",1br,590,"11411 Rochester Avenue, Los Angeles, CA 90025",1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,34.049005,-118.454049,0.869316,"Smart & Final - 12210 Santa Monica Blvd W, Los...",0.796004,"Ralphs Fresh Fare - 12057 Wilshire Blvd, Los A...",0.825743,"Whole Foods Market - 11737 San Vicente Blvd, L..."
4,1 Bedroom in the Heart of Venice* Plank Floors...,"$2,895",1br,750,"237 Fourth Avenue, Venice, CA 90291",1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,33.99809,-118.475972,0.69602,"Smart & Final Extra! - 604 Lincoln Blvd, Venice",0.843992,"Ralphs - 910 Lincoln Blvd, Venice",0.408345,"Whole Foods Market - 225 Lincoln Blvd, Venice"
5,"SPECIALS, Rooftop Sky Deck, Brand New 1+1 Bren...","$3,438",1br,711,"11916 West Pico Boulevard, Los Angeles, CA 90064",1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,34.029804,-118.448669,0.790421,"Smart & Final Extra! - 11221 W Pico Blvd, Los ...",0.267695,"Trader Joe's - 11755 W Olympic Blvd, Los Angeles",0.794601,"Whole Foods Market - 11666 National Blvd, Los ..."


In [7]:
# Strip the currency formatting ("$3,295" -> "3295") and parse the prices as numbers.
# regex=False makes '$' a literal character: as a regular expression '$' is the
# end-of-string anchor, and the default value of `regex` differs between pandas versions.
# errors='coerce' turns any value that still cannot be parsed into NaN instead of raising.
price_text = df['Price'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
df['Price'] = pd.to_numeric(price_text, errors='coerce')

In [8]:
# Show every column when displaying wide frames instead of eliding the middle with "...".
pd.set_option('display.max_columns', None)

# Map the raw column headers to snake_case identifiers.
# Columns not listed here (e.g. 'monthly', 'apartment', 'carport') keep their names,
# and keys that are absent from the frame are ignored by DataFrame.rename by default.
# NOTE(review): 'Bedrooms' is not in this mapping and stays capitalised -- confirm
# that downstream code expects it that way.
column_renames = {
    'Title': 'title',
    'Price': 'price',
    'Square Feet': 'square_feet',
    'Full Address': 'full_address',
    'cats are OK - purrr': 'cats_allowed',
    'dogs are OK - wooof': 'dogs_allowed',
    'laundry on site': 'laundry_on_site',
    'air conditioning': 'air_conditioning',
    'off-street parking': 'off_street_parking',
    'EV charging': 'EV_charging',
    'w/d in unit': 'washer_dryer_in_unit',
    'no smoking': 'no_smoking',
    'attached garage': 'attached_garage',
    'detached garage': 'detached_garage',
    'laundry in bldg': 'laundry_in_bldg',
    'Fee Needed To Apply': 'fee_needed_to_apply',
    'wheelchair accessible': 'wheelchair_accessible',
    'no parking': 'no_parking',
    'street parking': 'street_parking',
    'no laundry on site': 'no_laundry_on_site',
    'w/d hookups': 'washer_dryer_hookups',
}

# Reassign rather than using inplace=True so the cell reads as "new frame from old frame".
df = df.rename(columns=column_renames)

#### Let's check the data types.

In [9]:
# List each column's dtype to see which columns still need converting.
df.dtypes

title                                      object
price                                       int64
Bedrooms                                   object
square_feet                                object
full_address                               object
monthly                                   float64
apartment                                 float64
cats_allowed                              float64
dogs_allowed                              float64
laundry_on_site                           float64
air_conditioning                          float64
off_street_parking                        float64
EV_charging                               float64
washer_dryer_in_unit                      float64
carport                                   float64
no_smoking                                float64
attached_garage                           float64
detached_garage                           float64
laundry_in_bldg                           float64
fee_needed_to_apply                       float64


#### We need to change the one-hot encoded values to integers to use them in the model.

Let's write a function that does this. We'll define the columns to convert outside of the function. That way, in the future if there are more boolean float types, we can simply add them in here and the function will take care of it. 

In [10]:
# Names of the 0.0/1.0 flag columns that should be cast to int.
# Add any new flag column here and the conversion step will pick it up.
boolean_float_columns = [
    'monthly',
    'apartment',
    'cats_allowed',
    'dogs_allowed',
    'laundry_on_site',
    'air_conditioning',
    'off_street_parking',
    'EV_charging',
    'washer_dryer_in_unit',
    'carport',
    'no_smoking',
    'attached_garage',
    'detached_garage',
    'laundry_in_bldg',
    'fee_needed_to_apply',
    'wheelchair_accessible',
    'no_parking',
    'furnished',
    'street_parking',
    'no_laundry_on_site',
    'house',
    'washer_dryer_hookups',
]

In [11]:
def convert_float_booleans_to_int(df: pd.DataFrame, columns_to_convert) -> None:
    """Cast the named columns of ``df`` to ``int``, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their integer-cast values.
    columns_to_convert : iterable of str
        Names of the columns to cast; each must exist in ``df``.

    Returns
    -------
    None
        The frame passed in is mutated; nothing is returned.
    """
    for name in columns_to_convert:
        # astype(int) raises if the column holds NaN or other non-finite values.
        as_int = df[name].astype(int)
        df[name] = as_int

In [12]:
# Cast the columns named in boolean_float_columns to int; df is modified in place.
convert_float_booleans_to_int(df, boolean_float_columns)

#### Check for duplicates.

In [13]:
# Row count before removing duplicates.
len(df)

2327

In [14]:
# Keep only the first occurrence of each fully identical row.
repeated_rows = df.duplicated(keep='first')
df = df[~repeated_rows]

In [15]:
# Row count after removing duplicates.
len(df)

1917

In [16]:
# Do not truncate long cell contents when frames are displayed.
pd.options.display.max_colwidth = None

In [17]:
# Write the cleaned listings to disk; index=False leaves the row index out of the file.
cleaned_output_path = '../Data/Cleaned_Data_03282024.csv'
df.to_csv(cleaned_output_path, index=False)