# Bizbuysell Data Wrangling & Exploration 1

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

### Read in "detailed" business data from web scraper

In [3]:
df_d=pd.read_json('data/bizbuysell.detail.lines.json.zip',lines=True)

### Clean up detailed data

In [4]:
df_d['id']=df_d.s_id
df_d['src']='details'
df_d['url']=df_d['s_url']
df_d['title']=df_d['s_name']
df_d['desc']=df_d['s_desc']
df_d['local']=df_d['s_local']
df_d['region']=df_d['s_region']
df_d['location']=df_d['p_location']
df_d['title_loc']=df_d['title'].str.extract(r"in (.+) - BizBuySell")
df_d['categories']=df_d['s_breadcrumbs']
df_d['details']=df_d['p_details_text']
df_d['financials']=df_d['p_financials_text']
df_d['price']=df_d['s_price']
df_d['similar']=df_d['s_similar']

In [5]:
df_dd=df_d[['id','src','url','title',
            'desc','local','region','location','title_loc','categories','similar',
            'details','financials','price']]

In [6]:
df_dd.shape

(38571, 14)

In [7]:
df_dd.head()

Unnamed: 0,id,src,url,title,desc,local,region,location,title_loc,categories,similar,details,financials,price
0,2067319.0,details,https://www.bizbuysell.com/Business-Real-Estat...,Popular Marina in Central New Jersey in Middle...,It is fully owned by a man and his wife since ...,Middlesex,New Jersey,"Middlesex County, NJ","Central New Jersey in Middlesex County, New Je...","[Real Estate For Sale, New Jersey, Marinas and...","[1864705, 2071534, 2057087]","Location:\nMiddlesex County, NJ\nType:\nOther ...","Asking Price:\n$2,500,000\n ...",2500000.0
1,1990890.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"High Exposure in Mays Landing, New Jersey - Bi...",AMAZING OPPORTUNITY! Priced to sell and ready ...,Mays Landing,New Jersey,"Mays Landing, NJ (Atlantic County)","Mays Landing, New Jersey","[Real Estate For Sale, New Jersey, Other, Mays...","[2075819, 1864705, 2071534]","Location:\nMays Landing, NJ\nType:\nOffice\nBu...","Asking Price:\n$225,000\n ...",225000.0
2,2050006.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Rental Units with Real Estate in Randolph, New...",This is a 6 unit Airbnb with an average occupa...,Randolph,New York,"Randolph, NY (Cattaraugus County)","Randolph, New York","[Real Estate For Sale, New York, Hotels, Rando...","[2059235, 2080381, 2077331]","Location:\nRandolph, NY\nType:\nOther Business...","Asking Price:\n$425,000\n ...",425000.0
3,2039720.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Duplex, Short term or Long term in Pinellas Co...",Rare Largo/Seminole area duplex centrally loca...,Pinellas,Florida,"Pinellas County, FL","Pinellas County, Florida","[Real Estate For Sale, Florida, Other, Pinella...","[2035549, 2067510, 2054271]","Location:\nPinellas County, FL\nType:\nMulti-F...","Asking Price:\n$595,000\n ...",595000.0
4,1576680.0,details,https://www.bizbuysell.com/Business-Real-Estat...,"Major Price Reduction Rest/Tavern/ 13,000sq. f...",Price reduced to $850. 000 MAKE AN OFFER!!!!!!...,Baltimore,Maryland,"Baltimore, MD (Baltimore City County)","Baltimore, Maryland","[Real Estate For Sale, Maryland, Bars, Pubs an...","[2082461, 2050777, 2043561]","Location:\nBaltimore, MD\nType:\nRetail\nBuild...","Asking Price:\n$1,050,000\n ...",1050000.0


### Read in "listing" business data from web scraper

In [8]:
df_l=pd.concat([pd.read_json('data/bizbuysell.list.lines.1.json.zip',lines=True),
                             pd.read_json('data/bizbuysell.list.lines.2.json.zip',lines=True)])

### Clean up listing data

In [9]:
def proc_fin(row):
    result=''
    if pd.isna(row['asking_price'])==False:
        result+=f'Asking Price:\n{row["asking_price"]}\n'
    if pd.isna(row['cash_flow'])==False:
        result+=f'Cash Flow:\n{row["cash_flow"]}\n'        
    return result

In [10]:
df_l['id']=df_l.s_id
df_l['src']='listings'
df_l['url']='https://www.bizbuysell.com'+df_l['s_url']
df_l['title']=df_l['s_name']
df_l['desc']=df_l['s_desc']
df_l['local']=df_l['s_local']
df_l['region']=df_l['s_region']
df_l['location']=df_l['loc']
df_l['title_loc']=df_l['title'].str.extract(r"in (.+) - BizBuySell")
df_l['categories']=df_l['s_breadcrumbs']
df_l['details']=''
df_l['financials']=df_l.apply(proc_fin,axis=1)
df_l['price']=df_d['s_price'].replace('$','').replace(',','')
df_l['similar']=[list() for x in range(len(df_l.index))]

In [11]:
df_ll=df_l[['id','src','url','title',
            'desc','local','region','location','title_loc','categories', 'similar',
            'details','financials','price']]

In [12]:
df_ll.shape

(263592, 14)

In [13]:
df_ll.head()

Unnamed: 0,id,src,url,title,desc,local,region,location,title_loc,categories,similar,details,financials,price
0,1972353,listings,https://www.bizbuysell.com/Business-Opportunit...,Turnkey Jewelry Store in the Caribbean,"Almost 33 years ago, the founders of The Natur...",,Christiansted,Christiansted,,"[Businesses For Sale, Travel Businesses For Sa...",[],,"Asking Price:\n$2,000,000\n",2500000.0
1,1815206,listings,https://www.bizbuysell.com/Business-Opportunit...,Tennessee River Gorge® Mountain Cabins Offers ...,"We are selling 51% ""controlling interest"" of T...",Whitwell,TN,"Whitwell, TN",,"[Businesses For Sale, Travel Businesses For Sa...",[],,"Asking Price:\n$870,000\nCash Flow:\nCash Flow...",225000.0
2,2023577,listings,https://www.bizbuysell.com/Business-Opportunit...,Top-Rated Detroit Tour Company For Sale,This top-rated Detroit tour company offers an ...,Detroit,MI,"Detroit, MI",,"[Businesses For Sale, Travel Businesses For Sa...",[],,"Asking Price:\n$2,300,000\nCash Flow:\nCash Fl...",425000.0
3,2066007,listings,https://www.bizbuysell.com/Business-Opportunit...,Southern Vermont B&B,Step back in time to the Victorian era in this...,Poultney,VT,"Poultney, VT",,"[Businesses For Sale, Travel Businesses For Sa...",[],,"Asking Price:\n$695,000\n",595000.0
4,27828757,listings,https://www.bizbuysell.com/Business-Auction/co...,Courtyard Houston Hobby Airport,"84,444 SF Hotel For Sale",Houston,TX,,,"[Businesses For Sale, Travel Businesses For Sa...",[],,,1050000.0


### Join detailed and listing data

In [14]:
df_out=pd.concat([df_dd,df_ll])

In [15]:
df_out.shape

(302163, 14)

### Save inital wrangled data parquet file

In [16]:
df_out.to_parquet('data/bizbuysell.dataset.1.parquet')