## This notebook cleans the nested lists of strings in the `categories` and `attributes` fields of the Yelp business data.

### `categories` is always a list of strings. It is parsed by collecting every category and then calling the `uniqify` function on the result to remove duplicates.

### `attributes` is a list of strings, some of which are shaped like a nested dictionary (see the printed output when you run this notebook). It is parsed into a dictionary, and the nested dictionaries are then flattened onto the same level as the other keys.

### This notebook should print the final fields for categories and attributes (there are a lot of them).

In [1]:
import json  # To parse json files
import os  # To build file paths

In [2]:
# Configure this for your local path
dirpath = 'C:\\Users\\Jason\\Desktop\\Yelp\\'
file = 'yelp_academic_dataset_business.json'
# Use os.path.join rather than string concatenation so the path is still
# built correctly when dirpath is configured without a trailing separator.
filepath = os.path.join(dirpath, file)

In [3]:
# Returns a dictionary containing all attributes on a single level, calls helper to parse nested atts
def ParseAtt(Atts):
    """Flatten a list of raw attribute strings into a single-level dictionary.

    Each entry of ``Atts`` is split on whitespace. An entry that yields exactly
    two tokens is treated as a simple ``"Key: value"`` pair. Any other entry is
    handed to ``ParseHelp`` and the dictionary it returns is merged into the
    result.

    Args:
        Atts: Iterable of attribute strings, e.g. ``'BikeParking: True'`` or
            ``"BusinessParking: {'garage': False, 'lot': True}"``.

    Returns:
        dict mapping attribute names to their values (kept as strings). When
        the same key occurs more than once, the last occurrence wins.
    """
    newDict = {}
    for field in Atts:
        fieldAtts = field.split()
        # Compare by value: `is` tests object identity, which only happens to
        # work for small ints in CPython and raises a SyntaxWarning on 3.8+.
        if len(fieldAtts) == 2:
            # The first token has the form "Key:", so drop the trailing
            # character to get "Key"; the second token is the value.
            key = fieldAtts[0][:-1]
            newDict[key] = fieldAtts[1]
        else:
            # Anything else is treated as a nested dict and flattened by the helper
            newDict.update(ParseHelp(fieldAtts))
    return newDict

In [4]:
# Parses nested list of strings into dictionary, returns the dictionary
# Which is then updated into the main dictionary containing all of the attributes
def ParseHelp(listAtts):
    """Flatten the tokens of one nested attribute into a dictionary.

    Every token is passed through ``clean``. The first cleaned token is the
    master key; the remaining tokens are read as alternating key/value pairs,
    and each pair is stored under ``"<masterKey>_<key>"``.

    Args:
        listAtts: List of whitespace-separated tokens of a single attribute
            string, e.g. ``['BusinessParking:', "{'garage':", 'False,', ...]``.

    Returns:
        dict mapping ``"<masterKey>_<key>"`` to the cleaned value string.
        Returns an empty dict when ``listAtts`` is empty. A trailing key that
        has no value after it is skipped instead of raising ``IndexError``.
    """
    # Nothing to parse (e.g. the attribute string was empty or only whitespace)
    if not listAtts:
        return {}
    # Clean the data
    tokens = list(map(clean, listAtts))
    # Master key is added on to each key in the dict
    masterKey = tokens[0]
    # After the master key the tokens alternate key, value, key, value, ...
    # zip() stops at the shorter slice, so an unpaired trailing key is dropped.
    return {
        masterKey + '_' + key: value
        for key, value in zip(tokens[1::2], tokens[2::2])
    }

In [5]:
# Cleans a string to take away various characters present in the default strings
def clean(string):
    """Return ``string`` with every ``{``, ``}``, ``,``, ``'`` and ``:`` removed."""
    # Build a translation table that deletes the unwanted punctuation and
    # apply it in a single pass over the string.
    unwanted = str.maketrans('', '', "{},':")
    return string.translate(unwanted)

In [6]:
# Get all attributes and convert them into a dictionary
attribs = []
with open(filepath, encoding="utf8") as f:
    # Iterate over the file lazily instead of calling f.readlines(), which
    # loads every line into memory although only the first ones are used.
    for i, line in enumerate(f):
        # Sample only the first 1001 records (indices 0..1000)
        if i > 1000:
            break
        dataPoint = json.loads(line)
        # Skip businesses whose attributes field is null
        if dataPoint['attributes'] is not None:
            attribs += dataPoint['attributes']
parsedAtt = ParseAtt(attribs)
print("Original \n")
print(attribs)
print("Parsed \n")
print(parsedAtt)

Original 

['BikeParking: True', 'BusinessAcceptsBitcoin: False', 'BusinessAcceptsCreditCards: True', "BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", 'DogsAllowed: False', 'RestaurantsPriceRange2: 2', 'WheelchairAccessible: True', 'BusinessAcceptsBitcoin: False', 'BusinessAcceptsCreditCards: True', 'Alcohol: none', "Ambience: {'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}", 'BikeParking: True', 'BusinessAcceptsCreditCards: True', "BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'Caters: True', 'GoodForKids: True', "GoodForMeal: {'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'breakfast': False, 'brunch': False}", 'HasTV: True', 'NoiseLevel: quiet', 'OutdoorSeating: False', 'RestaurantsAttire: casual', 'RestaurantsDelivery: True', 'RestaurantsGoodForGroups:

In [7]:
def uniqify(data):
    """Return the distinct items of ``data`` in first-seen order.

    Args:
        data: Iterable of items to de-duplicate.

    Returns:
        A new list containing each distinct item once, ordered by first
        appearance in ``data``.
    """
    # Materialise once so a one-shot iterator survives the fallback below
    items = list(data)
    try:
        # dict keys are unique and keep insertion order, so this de-duplicates
        # in linear time instead of rescanning the result list for every item.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys; fall back to the
        # quadratic membership scan, which only needs equality comparison.
        unique = []
        for item in items:
            if item not in unique:
                unique.append(item)
        return unique

In [8]:
# Check the format and size of the categories data
cat = []
with open(filepath, encoding="utf8") as f:
    # Iterate over the file lazily instead of calling f.readlines(), which
    # loads every line into memory although only the first ones are used.
    for i, line in enumerate(f):
        # Sample only the first 1001 records (indices 0..1000)
        if i > 1000:
            break
        dataPoint = json.loads(line)
        # Skip businesses whose categories field is null
        if dataPoint['categories'] is not None:
            cat += dataPoint['categories']
uniqueCat = uniqify(cat)
print(uniqueCat)

['Tobacco Shops', 'Nightlife', 'Vape Shops', 'Shopping', 'Caterers', 'Grocery', 'Food', 'Event Planning & Services', 'Party & Event Planning', 'Specialty Food', 'Restaurants', 'Pizza', 'Chicken Wings', 'Italian', 'Hair Removal', 'Beauty & Spas', 'Blow Dry/Out Services', 'Hair Stylists', 'Hair Extensions', 'Massage', 'Permanent Makeup', 'Waxing', 'Hair Salons', 'Hotels & Travel', 'Hotels', 'Nail Salons', 'Baby Gear & Furniture', 'Tex-Mex', 'Mexican', 'Fast Food', 'Local Services', 'Self Storage', 'Bakeries', 'Hawaiian', 'Barbeque', 'Bookstores', 'Books', 'Mags', 'Music & Video', 'General Dentistry', 'Health & Medical', 'Dentists', 'Leather Goods', 'Fashion', 'Watches', 'Accessories', 'Mobile Phones', 'Computers', 'Electronics', 'IT Services & Computer Repair', 'Animal Shelters', 'Pets', 'Shoe Stores', "Men's Clothing", 'Venues & Event Spaces', 'Sports Clubs', 'Active Life', 'Cafes', 'Funeral Services & Cemeteries', 'Sandwiches', 'Breakfast & Brunch', 'Salad', 'Fitness & Instruction', 'Y