## Final step -- search data for keywords

In [27]:
import pandas as pd
import string

In [38]:
lennar = pd.read_json('./lennar-data-06-2023.json')

In [39]:
lennar.shape

(10081, 180)

In [40]:
lennar.head(1)

Unnamed: 0,Availability,Price,Beds,Baths,Sqft,Address,Community,URL,Data_Dump,feature_0,...,feature_161,feature_162,feature_163,feature_164,feature_165,feature_166,feature_167,feature_168,feature_169,feature_170
0,Move-in ready,"$347,815",3.0,3.5,1744.0,"6235 Truxton Lane, Raleigh, NC",Manteo II in 5401 North,https://www.lennar.com/new-homes/north-carolin...,"{'Kitchen': ['36"" cabinetry with crown molding...","36"" cabinetry with crown molding and nickel ha...",...,,,,,,,,,,


In [41]:
clean = lennar.drop_duplicates(subset=['Availability', 'Price', 'Beds', 'Baths', 'Sqft', 'Address', 'Community', 'URL'],
                    keep='first',
                    ignore_index=True)

In [42]:
clean.shape

(8859, 180)

After removing duplicate homes (from the fact that some cities that are close together list the same homes under different urls -- ex Charlotte is under SC and NC for some reason), we see there are **9716 homes in the dataset**

In [43]:
# import re

In [44]:
# for d in clean['Data_Dump'][500:]:
#     lines = [item.lower() for sublist in d.values() for item in sublist]
    
#     # get rid of characters we don't want
#     cleaned_lines = [l.strip().replace('\n', '').replace('\r', '').replace('®', '').
#                      replace('™','').replace('¹', '').replace('”','') for l in lines]
    
#     # remove punc other than dash
#     ultra_cleaned_lines = [cl.translate(str.maketrans('', '', string.punctuation.replace('-',''))) 
#                            for cl in cleaned_lines]
    
#     cleaned_words = [i for l in ultra_cleaned_lines for i in l.split()]
    
#     print(cleaned_words)
#     break

In [69]:
def search_homes(date, keyword, save=False):
    """
    Method that searches through the Data_dump dict structure
    
    Returns:
        vals: dict where keys include keyword and values are counts
        home_count: # of unique homes where the keyword is present
    """
    lennar = pd.read_json(f'./lennar-data-{date}.json')
    clean = lennar.drop_duplicates(subset=['Availability', 'Price', 'Beds', 'Baths', 'Sqft', 'Address', 'Community', 'URL'],
                    keep='first',
                    ignore_index=True)
    
    keyword = keyword.lower()
    vals = dict()
    home_count = 0
    index = 0
    
    for d in clean.Data_Dump:
        index += 1
        lines = [item.lower() for sublist in d.values() for item in sublist]
    
        # get rid of characters we don't want
        cleaned_lines = [l.strip().replace('\n', '').replace('\r', '').replace('®', '').
                         replace('™','').replace('¹', '').replace('”','') for l in lines]

        # remove punc other than dash
        ultra_cleaned_lines = [cl.translate(str.maketrans('', '', string.punctuation.replace('-',''))) 
                               for cl in cleaned_lines]

        cleaned_words = [i for l in ultra_cleaned_lines for i in l.split()]
        
        if keyword in cleaned_words:
            home_count += 1
            
            for ucl in ultra_cleaned_lines:
                if keyword in ucl:
                    vals[ucl] = vals.get(ucl, 0) + 1
    
    df = pd.DataFrame([[home_count, sorted(vals.items(), key=lambda x: x[1], reverse=True), 
                        sum(vals.values())]], 
                      columns = ['n_homes', 'data', 'sum_data_vals'])
    
    if save:
        df.to_csv(f'./search-results-{date}/{keyword}.csv', index=False)
    return pd.DataFrame(df['data'][0], columns=['description', 'count'])

Okay so here's the function. Try 'usb', 'wifi', 'motion' etc

In [70]:
june_df = search_homes('06-2023', 'usb')
oct_df = search_homes('10-2022', 'usb')

In [78]:
june_df.head()

Unnamed: 0,description,count
0,usb outlets in owners suite bedroom and kitchen,359
1,power hub in kitchen with 4 usb ports and 2 ou...,307
2,usb outlets in kitchen,228
3,usb charging port,185
4,usb charger outlet,147


In [79]:
oct_df.head()

Unnamed: 0,description,count
0,usb outlets in owners suite bedroom and kitchen,1180
1,usb charging port,224
2,power hub in kitchen with 4 usb ports and 2 ou...,208
3,usb outlets in owners suite bedroom,155
4,usb outlets in kitchen,122


In [94]:
def percentage_change(col1,col2):
    return round(((col2 - col1) / col1) * 100, 2)

In [98]:
df = june_df.merge(oct_df, how='outer', on='description', suffixes=('_06-23','_10-22'))

In [99]:
df.fillna(0, inplace=True)
sorted_indices = (df["count_06-23"] + df["count_10-22"]).sort_values(ascending=False).index
df = df.loc[sorted_indices, :]
df['pct-change'] = percentage_change(df['count_10-22'],df['count_06-23'])    

In [100]:
df

Unnamed: 0,description,count_06-23,count_10-22,pct-change
0,usb outlets in owners suite bedroom and kitchen,359.0,1180.0,-69.58
1,power hub in kitchen with 4 usb ports and 2 ou...,307.0,208.0,47.6
3,usb charging port,185.0,224.0,-17.41
2,usb outlets in kitchen,228.0,122.0,86.89
6,usb outlets in owners suite bedroom,60.0,155.0,-61.29
5,usb outlets select locations,120.0,68.0,76.47
4,usb charger outlet,147.0,0.0,inf
7,usb outlets in all bedrooms and kitchen,40.0,97.0,-58.76
17,combo plug and usb charger in kitchen and owne...,14.0,51.0,-72.55
11,combo plug and usb charger in kitchen and owne...,27.0,37.0,-27.03


In [83]:
df.to_csv('./analysis/usb-description-differences'

Unnamed: 0,description,count_06-23,count_10-22
0,usb outlets in owners suite bedroom and kitchen,359.0,1180.0
1,power hub in kitchen with 4 usb ports and 2 ou...,307.0,208.0
2,usb outlets in kitchen,228.0,122.0
3,usb charging port,185.0,224.0
4,usb charger outlet,147.0,
5,usb outlets select locations,120.0,68.0
6,usb outlets in owners suite bedroom,60.0,155.0
7,usb outlets in all bedrooms and kitchen,40.0,97.0
8,usb charger in kitchen and ownerrsquos suite,40.0,
9,usb outlets in master secondary bedrooms and k...,39.0,5.0
