In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from thefuzz import fuzz
from thefuzz import process

# Clean up data from details page dataset

In [3]:
df_d=pd.read_json('bizbuysell.detail.lines.json',lines=True)
df_d['id']=df_d.s_id
df_d['src']='details'
df_d['url']=df_d['s_url']
df_d['title']=df_d['s_name']
df_d['desc']=df_d['s_desc']
df_d['local']=df_d['s_local']
df_d['region']=df_d['s_region']
df_d['location']=df_d['p_location']
df_d['title_loc']=df_d['title'].str.extract(r"in (.+) - BizBuySell")
df_d['categories']=df_d['s_breadcrumbs']
df_d['details']=df_d['p_details_text']
df_d['financials']=df_d['p_financials_text']
df_d['price']=df_d['s_price']
df_d['similar']=df_d['s_similar']

In [4]:
df_dd=df_d[['id','src','url','title',
            'desc','local','region','location','title_loc','categories','similar',
            'details','financials','price']]

In [5]:
df_dd.iloc[0]

id                                                    2067319.0
src                                                     details
url           https://www.bizbuysell.com/Business-Real-Estat...
title         Popular Marina in Central New Jersey in Middle...
desc          It is fully owned by a man and his wife since ...
local                                                 Middlesex
region                                               New Jersey
location                                   Middlesex County, NJ
title_loc     Central New Jersey in Middlesex County, New Je...
categories    [Real Estate For Sale, New Jersey, Marinas and...
similar                             [1864705, 2071534, 2057087]
details       Location:\nMiddlesex County, NJ\nType:\nOther ...
financials    Asking Price:\n$2,500,000\n                   ...
price                                                 2500000.0
Name: 0, dtype: object

# Clean up data from listings dataset

In [6]:
def proc_fin(row):
    result=''
    if pd.isna(row['asking_price'])==False:
        result+=f'Asking Price:\n{row["asking_price"]}\n'
    if pd.isna(row['cash_flow'])==False:
        result+=f'Cash Flow:\n{row["cash_flow"]}\n'        
    return result

In [7]:
df_l=pd.read_json('bizbuysell.list.lines.json',lines=True)
df_l['id']=df_l.s_id
df_l['src']='listings'
df_l['url']='https://www.bizbuysell.com'+df_l['s_url']
df_l['title']=df_l['s_name']
df_l['desc']=df_l['s_desc']
df_l['local']=df_l['s_local']
df_l['region']=df_l['s_region']
df_l['location']=df_l['loc']
df_l['title_loc']=df_l['title'].str.extract(r"in (.+) - BizBuySell")
df_l['categories']=df_l['s_breadcrumbs']
df_l['details']=''
df_l['financials']=df_l.apply(proc_fin,axis=1)
df_l['price']=df_d['s_price'].replace('$','').replace(',','')
df_l['similar']=[list() for x in range(len(df_l.index))]

In [8]:
df_ll=df_l[['id','src','url','title',
            'desc','local','region','location','title_loc','categories', 'similar',
            'details','financials','price']]

In [9]:
df_ll.iloc[0]

id                                                      1972353
src                                                    listings
url           https://www.bizbuysell.com/Business-Opportunit...
title                    Turnkey Jewelry Store in the Caribbean
desc          Almost 33 years ago, the founders of The Natur...
local                                                      None
region                                            Christiansted
location                                          Christiansted
title_loc                                                   NaN
categories    [Businesses For Sale, Travel Businesses For Sa...
similar                                                      []
details                                                        
financials                          Asking Price:\n$2,000,000\n
price                                                 2500000.0
Name: 0, dtype: object

# Combine datasets

In [84]:
df=pd.concat([df_dd,df_ll])

In [85]:
df.shape

(302163, 14)

# Enrich

In [86]:
df['franchise']=df.title.str.contains("franchise", case=False)
df['lease']=df.title.str.contains("lease", case=False)
df['auction']=df.title.str.contains("auction", case= False)
df['rent']=df.title.str.contains("rent", case= False)

In [87]:
df['_location']=df['location'].fillna(value='')
df['nationwide']=df._location.str.contains("Available Nationwide", case=False)
df['nationwide'].fillna(value=False, inplace=True)
df['multiple_locations']=df._location.str.contains("Available in Multiple Locations", case=False)
df['multiple_locations'].fillna(value=False, inplace=True)
df['relocatable']=df._location.str.contains("Relocatable", case=False)
df['relocatable'].fillna(value=False, inplace=True)

In [88]:
def proc_loc(row):
    result=None
    if pd.isna(row['location'])==False:
        l=str(row['location']).lower().strip()
        result= l
    elif pd.isna(row['local'])==False and pd.isna(row['region'])==False:
        l=str(row['local']).lower().strip()
        r=str(row['region']).lower().strip()
        result= f"{l}, {r}"
    elif pd.isna(row['title_loc'])==False:
        l=str(row['title_loc']).lower().strip()
        result= l
    if result !=None:
        for r in ['(relocatable)','available in']:
            result=result.replace(r,'')
        result=result.strip()
    return result
df['proc_loc'] = df.apply(proc_loc, axis=1)

# Get rid of franchise, lease, auction, rent

In [89]:
#df=df.loc[(df.franchise==False)&(df.lease==False)&(df.acution==False)&(df.rent==False)]
df=df.loc[(df.lease==False)&(df.auction==False)&(df.rent==False)]

In [90]:
df.shape

(294623, 23)

# Get rid of nationwide and multiple location listings

In [91]:
df=df.loc[(df.nationwide==False)&(df.multiple_locations==False)]

In [92]:
df.shape

(287470, 23)

# Get rid of duplicates by id

In [93]:
df=df.drop_duplicates(['id'])

In [94]:
df.shape

(38104, 23)

# Clean up null data

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38104 entries, 0 to 263471
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  38104 non-null  float64
 1   src                 38104 non-null  object 
 2   url                 38104 non-null  object 
 3   title               38104 non-null  object 
 4   desc                38104 non-null  object 
 5   local               33717 non-null  object 
 6   region              34014 non-null  object 
 7   location            37980 non-null  object 
 8   title_loc           37062 non-null  object 
 9   categories          38104 non-null  object 
 10  similar             38104 non-null  object 
 11  details             37952 non-null  object 
 12  financials          38104 non-null  object 
 13  price               37775 non-null  float64
 14  franchise           38104 non-null  object 
 15  lease               38104 non-null  object 
 16  auction 

In [96]:
df=df.loc[(df.price.isna()==False) & (df.proc_loc.isna()==False)]
df['details'].fillna(value='', inplace=True)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37773 entries, 0 to 37743
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  37773 non-null  float64
 1   src                 37773 non-null  object 
 2   url                 37773 non-null  object 
 3   title               37773 non-null  object 
 4   desc                37773 non-null  object 
 5   local               33439 non-null  object 
 6   region              33690 non-null  object 
 7   location            37713 non-null  object 
 8   title_loc           37060 non-null  object 
 9   categories          37773 non-null  object 
 10  similar             37773 non-null  object 
 11  details             37773 non-null  object 
 12  financials          37773 non-null  object 
 13  price               37773 non-null  float64
 14  franchise           37773 non-null  object 
 15  lease               37773 non-null  object 
 16  auction  

# Load location data

In [98]:
df_loc=pd.read_pickle('../census/codes/locations.pkl')
df_loc.to_excel('locations.xlsx')

In [99]:
df_loc.head()

Unnamed: 0,FEATURE_ID,location,H3_15
1859,44784,arizona,8f29b6d357ae255
6298,83350,arkansas,8f265b41b7485a4
10369,165344,alabama,8f44ec7996a29b5
12114,201738,colorado,8f268cda8071b71
13216,213160,connecticut,8f2a14b9892e0e0


In [107]:
df_loc=df_loc.loc[df_loc['location'].str.len()>2]
loc=list(df_loc['location'].unique())
loc.sort(key = len,reverse=True)

#loc

df_f=df[:20]

# df_f["fuzz_loc"] = df_f["proc_loc"].apply(
#   lambda x: process.extract(x, loc, scorer=fuzz.ratio)
# )

df_f["fuzz_loc"] = df_f["proc_loc"].apply(
  lambda x: process.extract(x, loc,scorer=fuzz.partial_token_sort_ratio)
)

df_f

Unnamed: 0,id,src,url,title,desc,local,region,location,title_loc,categories,...,franchise,lease,auction,rent,_location,nationwide,multiple_locations,relocatable,proc_loc,fuzz_loc
0,2067319.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Popular Marina in Central New Jersey in Middle...,It is fully owned by a man and his wife since ...,Middlesex,New Jersey,"Middlesex County, NJ","Central New Jersey in Middlesex County, New Je...","[Real Estate For Sale, New Jersey, Marinas and...",...,False,False,False,False,"Middlesex County, NJ",False,False,False,"middlesex county, nj","[(middlesex county, nj, 100), (middlesex, nj, ..."
1,1990890.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"High Exposure in Mays Landing, New Jersey - Bi...",AMAZING OPPORTUNITY! Priced to sell and ready ...,Mays Landing,New Jersey,"Mays Landing, NJ (Atlantic County)","Mays Landing, New Jersey","[Real Estate For Sale, New Jersey, Other, Mays...",...,False,False,False,False,"Mays Landing, NJ (Atlantic County)",False,False,False,"mays landing, nj (atlantic county)","[(atlantic county, nj, 89), (harding county, n..."
3,2039720.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Duplex, Short term or Long term in Pinellas Co...",Rare Largo/Seminole area duplex centrally loca...,Pinellas,Florida,"Pinellas County, FL","Pinellas County, Florida","[Real Estate For Sale, Florida, Other, Pinella...",...,False,False,False,False,"Pinellas County, FL",False,False,False,"pinellas county, fl","[(pinellas county, fl, 100), (pinellas, fl, 10..."
4,1576680.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Major Price Reduction Rest/Tavern/ 13,000sq. f...",Price reduced to $850. 000 MAKE AN OFFER!!!!!!...,Baltimore,Maryland,"Baltimore, MD (Baltimore City County)","Baltimore, Maryland","[Real Estate For Sale, Maryland, Bars, Pubs an...",...,False,False,False,False,"Baltimore, MD (Baltimore City County)",False,False,False,"baltimore, md (baltimore city county)","[(baltimore, md, 83), (gem county, id, 82), (c..."
5,2087638.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Business & Real Estate in Casco, Michigan - Bi...",• The Business and the Real Estate are For Sal...,Casco,Michigan,"Casco, MI (Saint Clair County)","Casco, Michigan","[Real Estate For Sale, Michigan, Convenience S...",...,False,False,False,False,"Casco, MI (Saint Clair County)",False,False,False,"casco, mi (saint clair county)","[(saint clair county, mi, 100), (saint clair c..."
6,2086340.0,details,https://www.bizbuysell.com/Business-Real-Estat...,High Grossing Atlanta Liquor Store in Fulton C...,High grossing Atlanta liquor store with limite...,Fulton,Georgia,"Fulton County, GA","Fulton County, Georgia","[Real Estate For Sale, Georgia, Liquor Stores,...",...,False,False,False,False,"Fulton County, GA",False,False,False,"fulton county, ga","[(fulton county, ga, 100), (fulton, ga, 100), ..."
7,2051958.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Turnkey Restaurant Near Lake Wister State Park...,COMMERCIAL REAL ESTATE FOR SALE WELCOME TO 409...,Wister,Oklahoma,"Wister, OK (LeFlore County)","Wister, Oklahoma","[Real Estate For Sale, Oklahoma, Diners, Wister]",...,False,False,False,False,"Wister, OK (LeFlore County)",False,False,False,"wister, ok (leflore county)","[(leflore county, ms, 88), (love, ok, 86), (le..."
8,1861169.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Well Established Sports Bar & Restaurant w/ Re...,Grand Opening after closing for few years.Owne...,Quantico,Virginia,"Quantico, VA (Prince William County)","Quantico, Virginia","[Real Estate For Sale, Virginia, American Rest...",...,False,False,False,False,"Quantico, VA (Prince William County)",False,False,False,"quantico, va (prince william county)","[(price county, wi, 87), (king william, va, 80..."
9,1975813.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Auto-Repair shop and community shopping center...,Over 30 years Community Auto Repair Mechanic s...,Fairfax,Virginia,"Fairfax County, VA","Fairfax County, Virginia","[Real Estate For Sale, Virginia, Auto Repair a...",...,False,False,False,False,"Fairfax County, VA",False,False,False,"fairfax county, va","[(fairfax county, va, 100), (fairfax, va, 100)..."
10,1975814.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Auto-Repair shop and community shopping center...,Over 30 years Community Auto Repair Mechanic s...,Fairfax,Virginia,"Fairfax County, VA","Fairfax County, Virginia","[Real Estate For Sale, Virginia, Auto Repair a...",...,False,False,False,False,"Fairfax County, VA",False,False,False,"fairfax county, va","[(fairfax county, va, 100), (fairfax, va, 100)..."


# Final dataframe

In [26]:
df=df[['id','src','url','title',
            'desc','location','relocatable','categories','similar',
            'details','financials','price']]

In [27]:
df.head()

Unnamed: 0,id,src,url,title,desc,location,relocatable,categories,similar,details,financials,price
0,2067319.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Popular Marina in Central New Jersey in Middle...,It is fully owned by a man and his wife since ...,"Middlesex County, NJ",False,"[Real Estate For Sale, New Jersey, Marinas and...","[1864705, 2071534, 2057087]","Location:\nMiddlesex County, NJ\nType:\nOther ...","Asking Price:\n$2,500,000\n ...",2500000.0
1,1990890.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"High Exposure in Mays Landing, New Jersey - Bi...",AMAZING OPPORTUNITY! Priced to sell and ready ...,"Mays Landing, NJ (Atlantic County)",False,"[Real Estate For Sale, New Jersey, Other, Mays...","[2075819, 1864705, 2071534]","Location:\nMays Landing, NJ\nType:\nOffice\nBu...","Asking Price:\n$225,000\n ...",225000.0
3,2039720.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Duplex, Short term or Long term in Pinellas Co...",Rare Largo/Seminole area duplex centrally loca...,"Pinellas County, FL",False,"[Real Estate For Sale, Florida, Other, Pinella...","[2035549, 2067510, 2054271]","Location:\nPinellas County, FL\nType:\nMulti-F...","Asking Price:\n$595,000\n ...",595000.0
4,1576680.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Major Price Reduction Rest/Tavern/ 13,000sq. f...",Price reduced to $850. 000 MAKE AN OFFER!!!!!!...,"Baltimore, MD (Baltimore City County)",False,"[Real Estate For Sale, Maryland, Bars, Pubs an...","[2082461, 2050777, 2043561]","Location:\nBaltimore, MD\nType:\nRetail\nBuild...","Asking Price:\n$1,050,000\n ...",1050000.0
5,2087638.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Business & Real Estate in Casco, Michigan - Bi...",• The Business and the Real Estate are For Sal...,"Casco, MI (Saint Clair County)",False,"[Real Estate For Sale, Michigan, Convenience S...","[2063595, 2048305, 2069188]","Location:\nCasco, MI\nType:\nMulti-Family\nBui...","Asking Price:\n$699,900\n ...",699900.0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37773 entries, 0 to 37743
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           37773 non-null  float64
 1   src          37773 non-null  object 
 2   url          37773 non-null  object 
 3   title        37773 non-null  object 
 4   desc         37773 non-null  object 
 5   location     37713 non-null  object 
 6   relocatable  37773 non-null  bool   
 7   categories   37773 non-null  object 
 8   similar      37773 non-null  object 
 9   details      37773 non-null  object 
 10  financials   37773 non-null  object 
 11  price        37773 non-null  float64
dtypes: bool(1), float64(2), object(9)
memory usage: 3.5+ MB
