## Final step -- search data for keywords

In [27]:
import os
import string

import pandas as pd

In [38]:
# Load the Lennar home-listing data from JSON (the file name suggests a June 2023 snapshot).
lennar = pd.read_json('./lennar-data-06-2023.json')

In [39]:
# (rows, columns) of the raw frame, before any de-duplication.
lennar.shape

(10081, 180)

In [40]:
# Show a single row to inspect the column layout.
lennar.head(1)

Unnamed: 0,Availability,Price,Beds,Baths,Sqft,Address,Community,URL,Data_Dump,feature_0,...,feature_161,feature_162,feature_163,feature_164,feature_165,feature_166,feature_167,feature_168,feature_169,feature_170
0,Move-in ready,"$347,815",3.0,3.5,1744.0,"6235 Truxton Lane, Raleigh, NC",Manteo II in 5401 North,https://www.lennar.com/new-homes/north-carolin...,"{'Kitchen': ['36"" cabinetry with crown molding...","36"" cabinetry with crown molding and nickel ha...",...,,,,,,,,,,


In [41]:
# Two rows are treated as the same listing when all eight identifying columns
# match; the first occurrence is kept and the index is renumbered from 0.
# NOTE(review): 'URL' is part of the key, so a home that appears under two
# different URLs would NOT be collapsed here -- confirm this is intended.
clean = lennar.drop_duplicates(
    subset=[
        'Availability', 'Price', 'Beds', 'Baths',
        'Sqft', 'Address', 'Community', 'URL',
    ],
    keep='first',
    ignore_index=True,
)

In [42]:
# (rows, columns) after dropping duplicate listings.
clean.shape

(8859, 180)

After removing duplicate homes (some neighboring cities list the same homes under different URLs -- e.g., Charlotte appears under both SC and NC for some reason), we see there are **8859 homes in the dataset**

In [43]:
# import re

In [44]:
# for d in clean['Data_Dump'][500:]:
#     lines = [item.lower() for sublist in d.values() for item in sublist]
    
#     # get rid of characters we don't want
#     cleaned_lines = [l.strip().replace('\n', '').replace('\r', '').replace('®', '').
#                      replace('™','').replace('¹', '').replace('”','') for l in lines]
    
#     # remove punc other than dash
#     ultra_cleaned_lines = [cl.translate(str.maketrans('', '', string.punctuation.replace('-',''))) 
#                            for cl in cleaned_lines]
    
#     cleaned_words = [i for l in ultra_cleaned_lines for i in l.split()]
    
#     print(cleaned_words)
#     break

In [45]:
def search_homes(keyword, data_dumps=None, out_dir='./search-results-06-2023'):
    """
    Search every home's feature text for a keyword and tally the matching lines.

    A home is counted when the keyword appears as a whole whitespace-separated
    word in its cleaned feature text. For each counted home, every cleaned
    line that contains the keyword (substring match) is tallied.

    Cleaning lower-cases each line, strips surrounding whitespace, removes
    newlines and the characters '®', '™', '¹', '”', and then removes all
    ASCII punctuation except the dash.

    Args:
        keyword: term to search for; matching is case-insensitive.
        data_dumps: optional iterable of per-home dicts mapping a section name
            (e.g. 'Kitchen') to a list of feature strings. Defaults to the
            notebook-level ``clean.Data_Dump``.
        out_dir: directory that receives ``<keyword>.csv``; it is created if
            missing. Pass None to skip writing the file.

    Returns:
        One-row DataFrame with the columns:
            n_homes: # of homes where the keyword is present
            data: list of (cleaned line, count) tuples, highest count first
            sum_data_vals: sum of all the counts in ``data``
    """
    keyword = keyword.lower()
    if data_dumps is None:
        data_dumps = clean.Data_Dump

    # Loop-invariant: translation table that removes punctuation other than dash.
    strip_punc = str.maketrans('', '', string.punctuation.replace('-', ''))
    # Characters we don't want, removed before the punctuation pass.
    unwanted_chars = ('\n', '\r', '®', '™', '¹', '”')

    vals = dict()
    home_count = 0

    for d in data_dumps:
        # A row without a feature dict (e.g. None / NaN) has nothing to search.
        if not isinstance(d, dict):
            continue

        ultra_cleaned_lines = []
        for sublist in d.values():
            for item in sublist:
                line = item.lower().strip()
                for ch in unwanted_chars:
                    line = line.replace(ch, '')
                ultra_cleaned_lines.append(line.translate(strip_punc))

        # The home only counts when the keyword is a complete word somewhere.
        if not any(keyword in ucl.split() for ucl in ultra_cleaned_lines):
            continue
        home_count += 1

        # Within a counted home, tally every line mentioning the keyword
        # (substring match, so 'wifi' also picks up e.g. 'wifi-enabled').
        for ucl in ultra_cleaned_lines:
            if keyword in ucl:
                vals[ucl] = vals.get(ucl, 0) + 1

    ranked = sorted(vals.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame([[home_count, ranked, sum(vals.values())]],
                      columns=['n_homes', 'data', 'sum_data_vals'])

    if out_dir is not None:
        # to_csv does not create missing directories, so make sure it exists.
        os.makedirs(out_dir, exist_ok=True)
        df.to_csv(os.path.join(out_dir, f'{keyword}.csv'), index=False)
    return df

Here is the search function. Try keywords such as 'usb', 'wifi', or 'motion'; each call also saves its result as a CSV named after the keyword.

In [60]:
# Run the keyword search; besides returning the summary frame, this also
# writes it to ./search-results-06-2023/wifi.csv.
df = search_homes('wifi')

In [61]:
# List of (cleaned line, count) pairs, sorted with the most frequent line first.
df['data'][0]

[('wifi heat mapping engineering', 929),
 ('schlage encode smart wifi deadbolt', 649),
 ('span stylecolor 242424 background-color ffffffschlage encodetrade smart wifi deadboltspan',
  245),
 ('schlage encode   smart wifi deadbolt', 168),
 ('schlage smart wifi deadbolt', 108),
 ('span stylebackground-color ffffff color 242424schlage smart wifi deadboltspan',
  72),
 ('eero pro 6 - mesh wifi system', 53),
 ('wifi heat mapping engineering no wifi dead spots', 34),
 ('span stylebackground-color ffffff color 242424schlagetrade smart wifi deadboltspan',
  28),
 ('insulated wifi enabled garage doors with 1 belt drive openerand 2 remotes',
  24),
 ('smart wifi deadbolt', 20),
 ('schlage encodetrade smart wifi deadbolt', 17),
 ('an eero pro 6 - mesh wifi system helps keep residents connected', 12),
 ('honeywell lyric round wifi thermostat', 12),
 ('ge monogramreg interior dishwasher with hidden controls led lighting and wifi connect',
  12),
 ('honeywell home t6 pro wifi – smart thermostat', 9)

In [60]:
# keyword = 'camera'
# vals = dict()
# home_count = 0
# index = 0

# for d in clean.Data_Dump:
#     index += 1
#     lines = [item.lower() for sublist in d.values() for item in sublist]
#     cleaned_lines = [l.strip().translate(str.maketrans('', '', string.punctuation)).replace(
#         '®', '').replace('\n', '').replace('\r', '') for l in lines]
#     cleaned_words = [i for l in cleaned_lines for i in l.split()]
    
#     if keyword in cleaned_words:
#         home_count += 1
        
#         for cl in cleaned_lines:
#             if keyword in cl:
#                 vals[cl] = vals.get(cl, 0) + 1

In [61]:
# i = 0
# for d in clean.Data_Dump:
#     i+=1
#     lines = [item.lower() for sublist in d.values() for item in sublist]
#     words = [i for l in lines for i in l.split()]
#     if 'camera' in words:
#         print(words)