## Final step -- search data for keywords

In [1]:
import pandas as pd
import string

In [2]:
# Load the home listings from the JSON file sitting next to this notebook.
lennar = pd.read_json('./lennar-data.json')

In [3]:
# (rows, columns) of the frame as loaded, before any de-duplication.
lennar.shape

(11365, 185)

In [4]:
# Preview a single row to see the column layout (listing fields, Data_Dump, feature_* columns).
lennar.head(1)

Unnamed: 0,Availability,Price,Beds,Baths,Sqft,Address,Community,URL,Data_Dump,feature_0,...,feature_166,feature_167,feature_168,feature_169,feature_170,feature_171,feature_172,feature_173,feature_174,feature_175
0,Move-in ready,"$374,990",4.0,3.5,2179,"6417 Mentor Place, Raleigh, NC",Bradley in 5401 North,https://www.lennar.com/new-homes/north-carolin...,"{'Kitchen': ['36"" cabinetry with crown molding...","36"" cabinetry with crown molding and nickel ha...",...,,,,,,,,,,


In [5]:
# Columns that must all match for two rows to be treated as the same home.
# NOTE(review): 'URL' is part of the key, so rows that differ only by URL are
# kept as separate homes -- confirm this is intended.
listing_key_columns = [
    'Availability',
    'Price',
    'Beds',
    'Baths',
    'Sqft',
    'Address',
    'Community',
    'URL',
]

# Keep the first occurrence of each home and renumber the rows 0..n-1.
clean = lennar.drop_duplicates(
    subset=listing_key_columns,
    keep='first',
    ignore_index=True,
)

In [6]:
# (rows, columns) after dropping duplicate listings.
clean.shape

(9716, 185)

After removing duplicate homes (nearby cities sometimes list the same homes under different URLs -- e.g., Charlotte appears under both SC and NC for some reason), we see there are **9716 homes in the dataset**.

In [22]:
import re

In [58]:
# Peek at the tokenised feature text of one home (the first row from position 500 on).

# Translation table that deletes all punctuation except the dash.
punct_except_dash = str.maketrans('', '', string.punctuation.replace('-', ''))

# Characters we do not want in the text at all.
unwanted_chars = ('\n', '\r', '®', '™', '¹', '”')

for dump in clean['Data_Dump'][500:]:
    cleaned_words = []

    for feature_list in dump.values():
        for feature in feature_list:
            text = feature.lower().strip()

            # get rid of characters we don't want
            for ch in unwanted_chars:
                text = text.replace(ch, '')

            # remove punc other than dash, then split into words
            cleaned_words.extend(text.translate(punct_except_dash).split())

    print(cleaned_words)
    break  # one home is enough for a preview

['engineered', 'hardwood', 'flooring', 'in', 'kitchen', 'dining', 'room', 'and', 'foyer', 'stainless', 'steel', 'single-bowl', 'undermount', 'sink', 'with', 'stainless', 'steel', 'pull-out', 'faucet', 'elegant', 'square', 'at-panel', 'cabinetry', 'with', 'crown', 'molding', 'granite', 'or', 'quartz', 'countertops', 'with', 'flat', 'polish', 'edge', 'and', 'stainless', 'steel', 'undermount', 'sink', '3x6', 'subway', 'tile', 'backsplash', 'installed', 'in', 'a', 'brick', 'pattern', 'pull', 'out', 'stainless', 'steel', 'faucet', 'frigidaire', 'gallery', '30', 'stainless', 'steel', 'and', 'black', 'freestanding', 'smoothtopquick-clean', 'range', 'with', 'auto', 'shutoff', 'frigidaire', 'gallery', 'stainless', 'steel', 'and', 'black', 'over-the-range', 'microwave', 'ovenwith', 'venting', 'system', 'frigidaire', 'gallery', 'stainless', 'steel', 'and', 'black', '4-cycle', 'energy', 'stardishwasher', 'with', 'push', 'button', 'controls', 'frigidaire', 'garbage', 'disposal', '36', 'and', '42', 

In [80]:
def search_homes(keyword):
    """
    Search every home's Data_Dump feature text for a keyword.

    Each feature line is lower-cased, stripped of surrounding whitespace,
    line breaks and the symbols ® ™ ¹ ”, and then stripped of all
    punctuation except the dash. A home counts as a hit when the keyword
    equals one of its whitespace-separated words. For every hit, each of
    that home's cleaned lines that contains the keyword (substring match)
    is tallied.

    Side effect:
        Writes the summary to ./search-results/<keyword>.csv, creating the
        directory first if it does not exist.

    Args:
        keyword: single word to search for; matched case-insensitively.

    Returns:
        One-row DataFrame with columns
            n_homes: # of homes in `clean` where the keyword is present as a word
            data: list of (cleaned line, count) pairs, most frequent first
            sum_data_vals: sum of all the counts in `data`
    """
    # Local import so this cell stays runnable without touching the imports cell.
    import os

    keyword = keyword.lower()
    vals = dict()
    home_count = 0

    # Loop-invariant: build the "remove punc other than dash" table once.
    strip_punct = str.maketrans('', '', string.punctuation.replace('-', ''))

    for d in clean.Data_Dump:
        lines = [item.lower() for sublist in d.values() for item in sublist]

        # get rid of characters we don't want
        cleaned_lines = [l.strip().replace('\n', '').replace('\r', '').replace('®', '').
                         replace('™', '').replace('¹', '').replace('”', '') for l in lines]

        # remove punc other than dash
        ultra_cleaned_lines = [cl.translate(strip_punct) for cl in cleaned_lines]

        cleaned_words = [i for l in ultra_cleaned_lines for i in l.split()]

        # Home-level filter is an exact word match ...
        if keyword in cleaned_words:
            home_count += 1

            # ... while the per-line tally is a substring match within that home.
            for ucl in ultra_cleaned_lines:
                if keyword in ucl:
                    vals[ucl] = vals.get(ucl, 0) + 1

    df = pd.DataFrame([[home_count, sorted(vals.items(), key=lambda x: x[1], reverse=True),
                        sum(vals.values())]],
                      columns=['n_homes', 'data', 'sum_data_vals'])

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('./search-results', exist_ok=True)
    df.to_csv(f'./search-results/{keyword}.csv', index=False)
    return df

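Okay, so here's the function. Try keywords such as 'usb', 'wifi', 'motion', etc. Each call returns a one-row summary DataFrame and also writes it to `./search-results/<keyword>.csv`.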

In [88]:
# Run the search for one keyword; this also writes ./search-results/dimmers.csv.
df = search_homes('dimmers')

In [89]:
# The (cleaned line, count) pairs from the single summary row, most frequent first.
df['data'][0]

[('lutron caseta wireless in-wall dimmers', 8)]

In [60]:
# keyword = 'camera'
# vals = dict()
# home_count = 0
# index = 0

# for d in clean.Data_Dump:
#     index += 1
#     lines = [item.lower() for sublist in d.values() for item in sublist]
#     cleaned_lines = [l.strip().translate(str.maketrans('', '', string.punctuation)).replace(
#         '®', '').replace('\n', '').replace('\r', '') for l in lines]
#     cleaned_words = [i for l in cleaned_lines for i in l.split()]
    
#     if keyword in cleaned_words:
#         home_count += 1
        
#         for cl in cleaned_lines:
#             if keyword in cl:
#                 vals[cl] = vals.get(cl, 0) + 1

In [61]:
# i = 0
# for d in clean.Data_Dump:
#     i+=1
#     lines = [item.lower() for sublist in d.values() for item in sublist]
#     words = [i for l in lines for i in l.split()]
#     if 'camera' in words:
#         print(words)