# Violations

In [None]:
import pandas as pd
import re
from collections import Counter
# Load the dataset
# The relative path below is resolved against the notebook's working directory.
file_path = "Food_Inspections_20250216.csv"  # Ensure the correct path
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df = pd.read_csv(file_path, low_memory=False)

# Function to find pattern per row
def row_pattern(row_text):
    """Summarise the shape of a '|'-separated violations string.

    Each non-empty segment is labelled as:
      * "number.text-Comments" - starts with "<digits>. " and contains "- Comments:"
      * "number.text"          - starts with "<digits>. " only
      * "unknown"              - anything else

    Returns the labels joined by " | ", or "missing" when the input is not a
    string (e.g. NaN).
    """
    if not isinstance(row_text, str):
        return "missing"

    def classify(piece):
        # The more specific pattern is tested first; it is a superset of the second.
        if re.match(r"\d+\.\s+.*?- Comments:", piece):
            return "number.text-Comments"
        if re.match(r"\d+\.\s+.*", piece):
            return "number.text"
        return "unknown"

    pieces = (chunk.strip() for chunk in row_text.split("|"))
    return " | ".join(classify(piece) for piece in pieces if piece)

# Apply function to all rows
df['Pattern_Structure'] = df['Violations'].apply(row_pattern)

# Count unique patterns
pattern_counts = Counter(df['Pattern_Structure'])

# Show most common patterns
print("Most common row patterns:")
for pattern, count in pattern_counts.most_common():
    print(f"{pattern}: {count} rows")

# Optional: see patterns per row
# print(df[['Violations', 'Pattern_Structure']])


Most common row patterns:
missing: 79652 rows
number.text-Comments | number.text-Comments | number.text-Comments: 33693 rows
number.text-Comments | number.text-Comments: 33407 rows
number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments: 29251 rows
number.text-Comments: 27404 rows
number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments: 22996 rows
number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments: 17296 rows
number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments: 12384 rows
number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments | number.text-Comments: 8847 rows
number.text-Comments | number.text-Comments | n

In [21]:
import pandas as pd
import re

# Load the full inspections dataset (this is the complete file, not a sample)
file_path = "Food_Inspections_20250216.csv"  # Ensure the correct path
df = pd.read_csv(file_path, low_memory=False)

# A well-formed violation segment starts with "<digits>. " and later contains "- Comments:"
valid_pattern = re.compile(r"^\d+\.\s+.*?- Comments:")

# Reason checks
def get_invalid_reason(segment):
    """Explain why a violation segment is not in the expected format.

    Intended for segments that have already been rejected by the format check.

    Parameters
    ----------
    segment : str
        A single, already-stripped violation segment.

    Returns
    -------
    str
        "Missing starting number" when the segment does not begin with
        "<digits>.", "Missing '- Comments:'" when that marker is absent,
        otherwise "Malformed format".
    """
    # Anchor at the start: a number elsewhere in the text (e.g. "41.5 DEGREES")
    # is not a violation number and must not hide a missing one.
    if not re.match(r"\d+\.", segment):
        return "Missing starting number"
    if "- Comments:" not in segment:
        return "Missing '- Comments:'"
    return "Malformed format"

# Function to process each row
def analyze_violations(text):
    """Validate every '|'-separated segment of one 'Violations' cell.

    Returns a pandas Series with four entries:
      * 'PatternStatus'   - 'missing' (input is not a string), 'valid' or 'invalid'
      * 'ViolationCount'  - number of non-empty segments
      * 'InvalidSegments' - rejected segments joined by "; "
      * 'InvalidReasons'  - the matching reasons joined by "; "
    """
    if not isinstance(text, str):
        return pd.Series({
            'PatternStatus': 'missing',
            'ViolationCount': 0,
            'InvalidSegments': '',
            'InvalidReasons': ''
        })

    stripped = (raw.strip() for raw in text.split("|"))
    segments = [piece for piece in stripped if piece]

    # Keep the rejected segments and their reasons in the same order.
    rejected = [piece for piece in segments if not valid_pattern.match(piece)]
    explanations = [get_invalid_reason(piece) for piece in rejected]

    status = 'invalid' if rejected else 'valid'
    return pd.Series({
        'PatternStatus': status,
        'ViolationCount': len(segments),
        'InvalidSegments': "; ".join(rejected),
        'InvalidReasons': "; ".join(explanations)
    })

# Apply to the DataFrame
df[['PatternStatus', 'ViolationCount', 'InvalidSegments', 'InvalidReasons']] = df['Violations'].apply(analyze_violations)

# Show only the invalid rows
print("🔍 Invalid rows with detailed reasons:")
print(df[df['PatternStatus'] == 'invalid'][['Violations', 'ViolationCount', 'InvalidSegments', 'InvalidReasons']])




🔍 Invalid rows with detailed reasons:
                                               Violations  ViolationCount  \
54      1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...               5   
469     2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...               8   
773     2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...               2   
775     5. PROCEDURES FOR RESPONDING TO VOMITING AND D...               9   
823     1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...               7   
...                                                   ...             ...   
286630  10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...              11   
286723  10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...              10   
286755  1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...               8   
286793  37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...               6   
286794  5. PROCEDURES FOR RESPONDING TO VOMITING AND D...               9   

                                     

In [22]:
invalid_rows = df[df['PatternStatus'] == 'invalid']
invalid_rows

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Inspection Type,Results,Violations,Latitude,Longitude,Location,PatternStatus,ViolationCount,InvalidSegments,InvalidReasons
54,2608106,"STATION RESTAURANT AND BAR, VINNY'S PIZZA BAR,...","STATION RESTAURANT AND BAR, VINNY'S PIZZA BAR,...",2141717.0,Restaurant,Risk 1 (High),120 N CANAL ST,CHICAGO,IL,60606.0,...,Canvass,Pass w/ Conditions,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.883872,-87.639940,"(41.88387173662125, -87.63994016069968)",invalid,5,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Missing '- Comments:'
469,2497525,EDIBLE ARRANGEMENTS,EDIBLE ARRANGEMENTS OF CHICAGO-HYDE PARK,2622530.0,Restaurant,Risk 1 (High),1465 E HYDE PARK BLVD,CHICAGO,IL,60615.0,...,Canvass,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.802344,-87.589218,"(41.802344169636754, -87.58921808934608)",invalid,8,57. ALL FOOD EMPLOYEES HAVE FOOD HANDLER TRAINING,Missing '- Comments:'
773,2604777,PEET'S TEA & COFFEE,PEET'S TEA & COFFEE,2341239.0,Restaurant,Risk 2 (Medium),222 S RIVERSIDE PLZ,CHICAGO,IL,60606.0,...,Canvass,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.878582,-87.638579,"(41.87858156072147, -87.6385786681057)",invalid,2,"53. TOILET FACILITIES: PROPERLY CONSTRUCTED, S...",Missing '- Comments:'
775,2312266,BP GAS,BP GAS,2551845.0,Grocery Store,Risk 3 (Low),6308 N CENTRAL AVE,CHICAGO,IL,60646.0,...,Canvass,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.996372,-87.764277,"(41.99637159702214, -87.76427703822098)",invalid,9,36. THERMOMETERS PROVIDED & ACCURATE; 53. TOIL...,Missing '- Comments:'; Missing '- Comments:'
823,2545756,"FITTINGLY DELICIOUS, LLC",FITTINGLY DELICIOUS,2495771.0,Restaurant,Risk 1 (High),3939 W IRVING PARK RD,CHICAGO,IL,60618.0,...,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.953552,-87.726673,"(41.953551804435314, -87.72667299244584)",invalid,7,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Missing '- Comments:'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286630,2556682,TATAS TACOS,TATAS TACOS,2589789.0,Restaurant,Risk 1 (High),4929 W IRVING PARK RD,CHICAGO,IL,60641.0,...,Canvass,Fail,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.953264,-87.751092,"(41.953263831201625, -87.7510919961523)",invalid,11,56. ADEQUATE VENTILATION & LIGHTING; DESIGNATE...,Missing '- Comments:'
286723,2243530,POMP & CIRCUMSTANCE,BROKEN ENGLISH TACO PUB OLD TOWN,2348608.0,Restaurant,Risk 1 (High),1400 N WELLS ST,CHICAGO,IL,60610.0,...,Canvass,Pass w/ Conditions,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.907738,-87.634755,"(41.90773847735842, -87.63475461836646)",invalid,10,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,Missing '- Comments:'
286755,2290421,DUNKIN DONUTS/BASKIN ROBIN,DUNKIN DONUTS/BASKIN ROBBINS,1675002.0,Restaurant,Risk 2 (Medium),5205 N NAGLE AVE,CHICAGO,IL,60630.0,...,Canvass,Pass w/ Conditions,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.975911,-87.787723,"(41.97591106463358, -87.78772272010706)",invalid,8,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Missing '- Comments:'
286793,2362770,ASTER HALL,3-GREENS MARKET/ TOP GOLF,2589784.0,Restaurant,Risk 1 (High),900 N MICHIGAN AVE,CHICAGO,IL,60611.0,...,Canvass,Pass,37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER ...,41.899249,-87.624275,"(41.89924859703626, -87.62427497820545)",invalid,6,41. WIPING CLOTHS: PROPERLY USED & STORED,Missing '- Comments:'


In [23]:
valid_rows = df[df['PatternStatus'] == 'valid']
valid_rows

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Inspection Type,Results,Violations,Latitude,Longitude,Location,PatternStatus,ViolationCount,InvalidSegments,InvalidReasons
0,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,Daycare Combo 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,...,License,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.985390,-87.698734,"(41.98538950526786, -87.69873407149943)",valid,4,,
1,2608378,Babas Halal,Babas Halal,2684170.0,Restaurant,Risk 1 (High),7901 S DAMEN AVE,CHICAGO,IL,60620.0,...,Complaint,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.750189,-87.672986,"(41.750189342293375, -87.67298583977204)",valid,19,,
2,1106406,FIRST ZABIHA MEAT BAZAAR,FIRST ZABIHA MEAT BAZAAR,2232559.0,Grocery Store,Risk 2 (Medium),2907 W DEVON AVE,CHICAGO,IL,60659.0,...,License,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",41.997401,-87.702385,"(41.99740137039031, -87.70238538227812)",valid,15,,
3,2609909,HAPPY MARKET,HAPPY MARKET,2912802.0,Grocery Store,Risk 2 (Medium),2334 S WENTWORTH AVE,CHICAGO,IL,60616.0,...,Canvass,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.849954,-87.632094,"(41.84995400192252, -87.63209419559098)",valid,4,,
4,2609927,SAT KAIVAL FOOD INC/SUBWAY,SAT KAIVAL FOOD INC/SUBWAY,2728400.0,Restaurant,Risk 1 (High),1916 S STATE ST,CHICAGO,IL,60616.0,...,Canvass,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.856053,-87.627311,"(41.85605269621059, -87.62731125804903)",valid,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287230,2610806,HILTON O'HARE,HILTON O'HARE/ANDIAMO'S/CAFFE MERCATO,2647047.0,Restaurant,Risk 1 (High),11601 W TOUHY AVE,CHICAGO,IL,60666.0,...,Canvass Re-Inspection,Pass,43. IN-USE UTENSILS: PROPERLY STORED - Comment...,42.008751,-87.906874,"(42.00875076599798, -87.90687413400109)",valid,4,,
287232,2610937,DAPPER'S EAST RESTAURANT,DAPPER'S EAST RESTAURANT,12540.0,Restaurant,Risk 1 (High),2901 W ADDISON ST,CHICAGO,IL,60618.0,...,Complaint Re-Inspection,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,41.946553,-87.700536,"(41.94655282616904, -87.700535765616)",valid,2,,
287234,2610409,ACE SUSHI @POTASH MARKET,ACE SUSHI,2808766.0,Restaurant,Risk 1 (High),875 N STATE ST,CHICAGO,IL,60610.0,...,Canvass,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",41.898878,-87.628111,"(41.89887819742381, -87.6281111483033)",valid,2,,
287235,2610823,"Kitchen Chicago, LLC","Kitchen Chicago, LLC",2119905.0,Shared Kitchen,Risk 1 (High),324 N LEAVITT ST,CHICAGO,IL,60612.0,...,Canvass,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...,41.887434,-87.681849,"(41.88743405025222, -87.68184949426895)",valid,2,,


In [24]:
valid_rows.groupby('ViolationCount').count()

Unnamed: 0_level_0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location,PatternStatus,InvalidSegments,InvalidReasons
ViolationCount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,27404,27404,27213,27403,27372,27403,27404,27392,27399,27401,27404,27404,27404,27404,27266,27266,27266,27404,27404,27404
2,33407,33407,33253,33405,33377,33406,33407,33400,33404,33407,33407,33407,33407,33407,33244,33244,33244,33407,33407,33407
3,33693,33693,33531,33688,33664,33693,33693,33681,33687,33689,33693,33693,33693,33693,33571,33571,33571,33693,33693,33693
4,29251,29251,29121,29250,29235,29251,29251,29241,29246,29250,29251,29251,29251,29251,29163,29163,29163,29251,29251,29251
5,22996,22996,22864,22996,22984,22996,22996,22993,22994,22996,22996,22996,22996,22996,22927,22927,22927,22996,22996,22996
6,17296,17296,17204,17296,17289,17296,17296,17288,17293,17295,17296,17296,17296,17296,17262,17262,17262,17296,17296,17296
7,12384,12384,12331,12384,12376,12384,12384,12378,12381,12382,12384,12384,12384,12384,12360,12360,12360,12384,12384,12384
8,8847,8847,8800,8844,8842,8847,8847,8846,8847,8847,8847,8847,8847,8847,8823,8823,8823,8847,8847,8847
9,6240,6240,6203,6240,6238,6240,6240,6237,6240,6240,6240,6240,6240,6240,6226,6226,6226,6240,6240,6240
10,4305,4305,4282,4305,4303,4305,4305,4301,4304,4304,4305,4305,4305,4305,4300,4300,4300,4305,4305,4305


In [26]:
# Export only the rows flagged invalid, restricted to the columns needed for review.
invalid_mask = df['PatternStatus'] == 'invalid'
review_columns = ['Violations', 'ViolationCount', 'InvalidSegments', 'InvalidReasons']
df.loc[invalid_mask, review_columns].to_csv('InvalidSegments.csv')

In [32]:
missing_rows = df[df['PatternStatus'] == 'missing']
len(missing_rows['Violations'])

79652

In [4]:
import pandas as pd
import re

# Load your CSV
df = pd.read_csv("Food_Inspections_20250216.csv")  # Adjust filename if needed

# Function to extract and analyze violations
def extract_violations(row):
    """Split one inspection row into one record per violation segment.

    Parameters
    ----------
    row : pandas.Series
        A single row of the inspections frame. Its 'Violations' entry, when
        present, is a '|'-separated string of violation segments.

    Returns
    -------
    list of dict
        One dict per non-empty segment. Each dict holds every field of ``row``
        except 'Violations', followed by 'raw_violation', 'violation_number',
        'violation_text', 'violation_comment', 'parse_error' and
        'error_reason'. A missing or non-string 'Violations' value yields an
        empty list.
    """
    violation_text = row.get('Violations', '')

    # Missing cells arrive as NaN (a float); only strings can be split.
    if not isinstance(violation_text, str):
        return []

    parts = [v.strip() for v in violation_text.split('|') if v.strip()]
    if not parts:
        return []

    pattern = r"(?P<number>\d+)\.\s+(?P<text>.+?)\s+-\s+Comments:\s+(?P<comment>.+)"

    # The non-violation fields are identical for every segment of this row, so
    # build them once and copy per record instead of re-dropping inside the loop.
    base = row.drop(labels=['Violations']).to_dict()

    extracted = []
    for part in parts:
        combined = dict(base)
        combined['raw_violation'] = part

        match = re.match(pattern, part)
        if match:
            v = match.groupdict()
            combined['violation_number'] = v['number']
            combined['violation_text'] = v['text']
            combined['violation_comment'] = v['comment']
            combined['parse_error'] = False
            combined['error_reason'] = ""
        else:
            # Attempt to detect error reason
            if not re.search(r"\d+\.", part):
                reason = "missing violation number"
            elif "Comments:" not in part:
                reason = "missing 'Comments:'"
            else:
                reason = "general format mismatch"

            combined['violation_number'] = None
            combined['violation_text'] = None
            combined['violation_comment'] = None
            combined['parse_error'] = True
            combined['error_reason'] = reason

        extracted.append(combined)
    return extracted

# Process each row
expanded_rows = []
for _, row in df.iterrows():
    expanded_rows.extend(extract_violations(row))

# Create the final DataFrame
violations_expanded_df = pd.DataFrame(expanded_rows)

# Save to CSV
violations_expanded_df.to_csv("Food_Inspections_Violations_Expanded.csv", index=False)

print("✅ Done! File saved as: Food_Inspections_Violations_Expanded.csv")

✅ Done! File saved as: Food_Inspections_Violations_Expanded.csv


# License #

In [38]:
import pandas as pd

# Load the inspections CSV file
df = pd.read_csv("Food_Inspections_20250216.csv")  # Update filename if needed

# Function to validate License # pattern
def license_pattern(value):
    """Classify one cell by format.

    Returns "missing" for null values, "numeric" for int/float values and
    all-digit strings, "non-numeric string" for any other string, and
    "unknown format" for every other type.
    """
    if pd.isna(value):
        return "missing"
    # Nulls have already returned above, so any int/float here is a real number.
    if isinstance(value, (int, float)):
        return "numeric"
    if isinstance(value, str):
        return "numeric" if value.isdigit() else "non-numeric string"
    return "unknown format"

# Apply function to each row in the 'License #' column
df['License_Pattern'] = df['License #'].apply(license_pattern)

# Count occurrences
pattern_counts = df['License_Pattern'].value_counts()

# Show invalid ones
invalid_rows = df[df['License_Pattern'] != 'numeric']

# Display results
print("🔎 License # Pattern Summary:")
print(pattern_counts)

print("\n❌ Rows with invalid License # format:")
print(invalid_rows[['License #', 'License_Pattern']])


🔎 License # Pattern Summary:
License_Pattern
numeric    287219
missing        18
Name: count, dtype: int64

❌ Rows with invalid License # format:
        License # License_Pattern
5707          NaN         missing
31473         NaN         missing
76251         NaN         missing
108186        NaN         missing
123035        NaN         missing
137611        NaN         missing
168473        NaN         missing
210537        NaN         missing
213577        NaN         missing
232201        NaN         missing
234026        NaN         missing
235864        NaN         missing
254046        NaN         missing
255822        NaN         missing
260126        NaN         missing
266439        NaN         missing
278851        NaN         missing
281377        NaN         missing


# Inspection ID

In [37]:
import pandas as pd

# Load the inspections CSV file
df = pd.read_csv("Food_Inspections_20250216.csv")  # Update filename if needed

# Function to classify an identifier cell by format (used for 'Inspection ID' in this cell)
def license_pattern(value):
    """Label one cell as "missing", "numeric", "non-numeric string" or "unknown format".

    Null values are "missing"; int/float values and all-digit strings are
    "numeric"; other strings are "non-numeric string"; any other type is
    "unknown format".
    """
    if pd.isna(value):
        label = "missing"
    elif isinstance(value, (int, float)):
        # Nulls were handled by the first branch, so this is a real number.
        label = "numeric"
    elif not isinstance(value, str):
        label = "unknown format"
    elif value.isdigit():
        label = "numeric"
    else:
        label = "non-numeric string"
    return label

# Apply function to each row in the 'Inspection ID' column
df['InspectionID_Pattern'] = df['Inspection ID'].apply(license_pattern)

# Count occurrences
pattern_counts = df['InspectionID_Pattern'].value_counts()

# Show invalid ones
invalid_rows = df[df['InspectionID_Pattern'] != 'numeric']

# Display results
print("🔎 InspectionID_Pattern Pattern Summary:")
print(pattern_counts)

print("\n❌ Rows with invalid InspectionID_Pattern format:")
print(invalid_rows[['Inspection ID', 'InspectionID_Pattern']])


🔎 InspectionID_Pattern Pattern Summary:
InspectionID_Pattern
numeric    287237
Name: count, dtype: int64

❌ Rows with invalid InspectionID_Pattern format:
Empty DataFrame
Columns: [Inspection ID, InspectionID_Pattern]
Index: []


# Facility Type

In [42]:
import pandas as pd

# Load the inspections CSV file
df = pd.read_csv("Food_Inspections_20250216.csv")  # Update filename if needed

# Count missing values
missing_facility = df['Facility Type'].isna().sum()

# Get distinct non-null values
facility_types = df['Facility Type'].dropna().unique()

# Output
print("🏢 Distinct Facility Types:")
for val in sorted(facility_types):
    print("-", val)
print(f"🔸 Missing Facility Type values: {missing_facility}")


🏢 Distinct Facility Types:
- (REHAB CENTER)
- (convenience store)
- (gas station)
- 1005 NURSING HOME
- 1023
- 1023 CHILDERN'S SERVICE S FACILITY
- 1023 CHILDERN'S SERVICES FACILITY
- 1023 CHILDREN'S SERVICES FACILITY
- 1023-CHILDREN'S SERVICES FACILITY
- 1475 LIQUOR
- 15 monts to 5 years old
- 1584-DAY CARE ABOVE 2 YEARS
- A-Not-For-Profit Chef Training Program
- ADULT DAY SERVICE
- ADULT DAYCARE
- AFTER SCHOOL CARE
- AFTER SCHOOL PROGRAM
- ALTERNATIVE SCHOOL
- ARCHDIOCESE
- ART CENTER
- ART GALLERY
- ART GALLERY W/WINE AND BEER
- ASSISSTED LIVING
- ASSISTED LIVING
- Adult Family Care Center
- Airport Lounge
- Animal Shelter Cafe Permit
- Assisted Living
- Assisted Living Senior Care
- BAKERY/ RESTAURANT
- BAKERY/DELI
- BAKERY/GROCERY
- BANQUET
- BANQUET FACILITY
- BANQUET HALL
- BANQUET HALL/CATERING
- BANQUET ROOM
- BAR
- BAR/GRILL
- BEFORE AND AFTER SCHOOL PROGRAM
- BEVERAGE/SILVERWARE WAREHOUSE
- BOOK STORE
- BOWLING LANES/BANQUETS
- BOYS AND GIRLS CLUB
- BREWERY
- BUTCHER SHOP
- 

# Risk

In [43]:
missing_risk = df['Risk'].isna().sum()
risk_levels = df['Risk'].dropna().unique()
print("\n⚠️ Distinct Risk Levels:")
for val in sorted(risk_levels):
    print("-", val)
print(f"🔸 Missing Risk values: {missing_risk}")



⚠️ Distinct Risk Levels:
- All
- Risk 1 (High)
- Risk 2 (Medium)
- Risk 3 (Low)
🔸 Missing Risk values: 88


# City

In [45]:
missing_city = df['City'].isna().sum()
city_levels = df['City'].dropna().unique()
print("\n⚠️ Distinct City Levels:")
for val in sorted(city_levels):
    print("-", val)
print(f"🔸 Missing City values: {missing_city}")


⚠️ Distinct City Levels:
- 312CHICAGO
- ALGONQUIN
- ALSIP
- BANNOCKBURNDEERFIELD
- BERWYN
- BLOOMINGDALE
- BLUE ISLAND
- BOLINGBROOK
- BRIDGEVIEW
- BROADVIEW
- BROOKFIELD
- BURBANK
- BURNHAM
- CALUMET CITY
- CCHICAGO
- CH
- CHARLES A HAYES
- CHCHICAGO
- CHCICAGO
- CHICAGO
- CHICAGO HEIGHTS
- CHICAGO.
- CHICAGOC
- CHICAGOCHICAGO
- CHICAGOI
- CHICAGOO
- CHicago
- CICERO
- COUNTRY CLUB HILLS
- Chicago
- DES PLAINES
- EAST HAZEL CREST
- ELK GROVE VILLAGE
- ELMHURST
- EVANSTON
- EVERGREEN
- EVERGREEN PARK
- FRANKFORT
- GLEN ELLYN
- GLENCOE
- GRAYSLAKE
- GRIFFITH
- HAMMOND
- HIGHLAND PARK
- INACTIVE
- JUSTICE
- LAKE BLUFF
- LAKE ZURICH
- LANSING
- LOMBARD
- LOS ANGELES
- MATTESON
- MAYWOOD
- MERRILLVILLE
- MERRIVILLE
- MORTON GROVE
- Maywood
- NAPERVILLE
- NEW HOLSTEIN
- NEW YORK
- NILES NILES
- Norridge
- OAK LAWN
- OAK PARK
- OLYMPIA FIELDS
- OOLYMPIA FIELDS
- PALOS PARK
- PLAINFIELD
- ROSEMONT
- SCHAUMBURG
- SCHILLER PARK
- SKOKIE
- STREAMWOOD
- SUMMIT
- TINLEY PARK
- TORRANCE
- WADSWORT

# State

In [46]:
missing_state = df['State'].isna().sum()
state_levels = df['State'].dropna().unique()
print("\n⚠️ Distinct State Levels:")
for val in sorted(state_levels):
    print("-", val)
print(f"🔸 Missing State values: {missing_state}")


⚠️ Distinct State Levels:
- CA
- CO
- IL
- IN
- NY
- WI
🔸 Missing State values: 58


# Inspection Type

In [47]:
# Summarise the 'Inspection Type' column: distinct non-null values and missing count.
missing_InspectionType = df['Inspection Type'].isna().sum()
InspectionType_levels = df['Inspection Type'].dropna().unique()
# Labels name the column actually being summarised (they previously said "State").
print("\n⚠️ Distinct Inspection Type Levels:")
for val in sorted(InspectionType_levels):
    print("-", val)
print(f"🔸 Missing Inspection Type values: {missing_InspectionType}")


⚠️ Distinct State Levels:
- 1315 license reinspection
- ADDENDUM
- ASSESSMENT
- Business Not Located
- CANVAS
- CANVASS
- CANVASS FOR RIB FEST
- CANVASS RE INSPECTION OF CLOSE UP
- CANVASS SCHOOL/SPECIAL EVENT
- CANVASS SPECIAL EVENTS
- CANVASS/SPECIAL EVENT
- CHANGED COURT DATE
- CITF
- CLOSE-UP/COMPLAINT REINSPECTION
- CORRECTIVE ACTION
- COVID COMPLAINT
- Canvass
- Canvass Re-Inspection
- Complaint
- Complaint Re-Inspection
- Complaint-Fire
- Complaint-Fire Re-inspection
- Consultation
- DAY CARE LICENSE RENEWAL
- Duplicated
- FIRE
- FIRE/COMPLAIN
- HACCP QUESTIONAIRE
- Illegal Operation
- KIDS CAFE
- KITCHEN CLOSED FOR RENOVATION
- Kids Cafe'
- LICENSE
- LICENSE CANCELED BY OWNER
- LICENSE CONSULTATION
- LICENSE DAYCARE 1586
- LICENSE RENEWAL FOR DAYCARE
- LICENSE RENEWAL INSPECTION FOR DAYCARE
- LICENSE REQUEST
- LICENSE TASK FORCE / NOT -FOR-PROFIT CLU
- LICENSE TASK FORCE / NOT -FOR-PROFIT CLUB
- LICENSE WRONG ADDRESS
- LICENSE/NOT READY
- LIQOUR TASK FORCE NOT READY
- LIQUOR C

# Results

In [48]:
# Summarise the 'Results' column: distinct non-null values and missing count.
missing_Results = df['Results'].isna().sum()
Resultse_levels = df['Results'].dropna().unique()
# Labels name the column actually being summarised (they previously said "State").
print("\n⚠️ Distinct Results Levels:")
for val in sorted(Resultse_levels):
    print("-", val)
print(f"🔸 Missing Results values: {missing_Results}")


⚠️ Distinct State Levels:
- Business Not Located
- Fail
- No Entry
- Not Ready
- Out of Business
- Pass
- Pass w/ Conditions
🔸 Missing State values: 0


# Latitude, Longitude, Location

In [51]:
import pandas as pd
import re

# Load the inspections CSV file
df = pd.read_csv("Food_Inspections_20250216.csv")  # Adjust filename if needed

# === Validation functions ===

def check_latitude(val):
    """Validate a single latitude value.

    Returns "missing" for null values, "not a number" when the value cannot be
    converted with ``float()``, "out of range" when it falls outside
    [-90, 90], and "valid" otherwise.
    """
    if pd.isna(val):
        return "missing"
    try:
        lat = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return "not a number"
    return "valid" if -90 <= lat <= 90 else "out of range"

def check_longitude(val):
    """Validate a single longitude value.

    Returns "missing" for null values, "not a number" when the value cannot be
    converted with ``float()``, "out of range" when it falls outside
    [-180, 180], and "valid" otherwise.
    """
    if pd.isna(val):
        return "missing"
    try:
        lon = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return "not a number"
    return "valid" if -180 <= lon <= 180 else "out of range"

def check_location(val):
    """Validate a "(lat, lon)" location string.

    Returns "missing" for null values, "not a string" for non-string values,
    "pattern mismatch" when the text is not exactly a parenthesised pair of
    decimal numbers (trailing whitespace is tolerated), "latitude out of
    range" / "longitude out of range" when a coordinate falls outside
    [-90, 90] / [-180, 180], and "valid" otherwise.
    """
    if pd.isna(val):
        return "missing"
    if not isinstance(val, str):
        return "not a string"

    # fullmatch so that text after the closing parenthesis is rejected instead
    # of being silently accepted.
    match = re.fullmatch(r"\(\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\)\s*", val)
    if not match:
        return "pattern mismatch"

    lat, lon = float(match.group(1)), float(match.group(2))
    if not (-90 <= lat <= 90):
        return "latitude out of range"
    if not (-180 <= lon <= 180):
        return "longitude out of range"

    return "valid"

# === Apply validation functions ===
df['Latitude_Status'] = df['Latitude'].apply(check_latitude)
df['Longitude_Status'] = df['Longitude'].apply(check_longitude)
df['Location_Status'] = df['Location'].apply(check_location)

# === Count invalid reasons ===
lat_counts = df['Latitude_Status'].value_counts()
lon_counts = df['Longitude_Status'].value_counts()
loc_counts = df['Location_Status'].value_counts()

# === Display summary ===
print("📊 Invalid Reason Counts:")

print("\n🧭 Latitude Issues:")
print(lat_counts[lat_counts.index != 'valid'])

print("\n🗺️ Longitude Issues:")
print(lon_counts[lon_counts.index != 'valid'])

print("\n📍 Location Issues:")
print(loc_counts[loc_counts.index != 'valid'])


📊 Invalid Reason Counts:

🧭 Latitude Issues:
Latitude_Status
missing    978
Name: count, dtype: int64

🗺️ Longitude Issues:
Longitude_Status
missing    978
Name: count, dtype: int64

📍 Location Issues:
Location_Status
missing    978
Name: count, dtype: int64


# Zip, Inspection Date

In [57]:
import pandas as pd

# Load your file
df = pd.read_csv("Food_Inspections_20250216.csv", dtype={"Zip": str})

# === Zip Validation ===
def check_zip(val):
    """Validate a single ZIP value.

    Returns "missing" for null values, "not numeric" when the stripped text
    is not all digits, "not 5-digit" when it is all digits but not exactly
    five long, and "valid" otherwise.
    """
    if pd.isna(val):
        return "missing"
    digits = str(val).strip()
    if digits.isdigit():
        return "valid" if len(digits) == 5 else "not 5-digit"
    return "not numeric"

# === Inspection Date Validation ===
def check_inspection_date(val):
    """Validate a single inspection date value.

    Returns "missing" for null values, "invalid date format" when
    ``pd.to_datetime`` cannot parse the value, and "valid" otherwise.
    """
    if pd.isna(val):
        return "missing"
    try:
        pd.to_datetime(val, errors='raise')
    except (ValueError, TypeError, OverflowError):
        # Only parse failures count as an invalid date; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return "invalid date format"
    return "valid"

# Apply functions
df['Zip_Status'] = df['Zip'].apply(check_zip)
df['InspectionDate_Status'] = df['Inspection Date'].apply(check_inspection_date)

# Count invalids
zip_counts = df['Zip_Status'].value_counts()
date_counts = df['InspectionDate_Status'].value_counts()

# Display
print("📦 ZIP Code Issues:")
print(zip_counts[zip_counts.index != 'valid'])

print("\n📅 Inspection Date Issues:")
print(date_counts[date_counts.index != 'valid'])


📦 ZIP Code Issues:
Zip_Status
missing    39
Name: count, dtype: int64

📅 Inspection Date Issues:
Series([], Name: count, dtype: int64)


# Address

In [58]:
import pandas as pd
import re

# Load your CSV
df = pd.read_csv("Food_Inspections_20250216.csv")  # Update path if needed

# Address validation function
def check_address(addr):
    """Validate a single street address and diagnose failures.

    An address is "valid" when, after stripping, it consists of a street
    number (or number-number range), an optional single-letter N/S/E/W
    direction, and a remainder made only of word characters, whitespace and
    the punctuation . ( ) ' - /  (matched case-insensitively).

    Returns "missing" for null values, "valid" on a match, otherwise one of
    "missing or invalid street number", "missing street name" or
    "general format issue".
    """
    if pd.isna(addr):
        return "missing"

    cleaned = addr.strip()
    full_pattern = r"^(\d+(-\d+)?)(\s+[NSEW])?\s+[\w\s\.\(\)'\-/]+$"

    if re.match(full_pattern, cleaned, flags=re.IGNORECASE) is not None:
        return "valid"

    # Diagnose why it's invalid, from the most specific cause to the least.
    if re.match(r"^\d+(-\d+)?", cleaned) is None:
        return "missing or invalid street number"
    if re.search(r"\s+[A-Za-z]+", cleaned) is None:
        return "missing street name"
    return "general format issue"

# Apply
df['Address_Status'] = df['Address'].apply(check_address)

# Count issues
address_issue_counts = df['Address_Status'].value_counts()

# Show result
print("📬 Address Issue Summary:")
print(address_issue_counts[address_issue_counts.index != 'valid'])

# Optional: Show invalid addresses
invalid_addresses = df[df['Address_Status'] != 'valid'][['Address', 'Address_Status']]
print("\n❌ Invalid Address Rows:")
print(invalid_addresses)

📬 Address Issue Summary:
Address_Status
general format issue                342
missing or invalid street number      4
Name: count, dtype: int64

❌ Invalid Address Rows:
                        Address        Address_Status
67        2812-1/2 W DEVON AVE   general format issue
293           2169A S CHINA PL   general format issue
1347     5255 S State (1E&W) ST  general format issue
2169       2812-1/2 W DEVON AVE  general format issue
2579         2140A S ARCHER AVE  general format issue
...                         ...                   ...
279363     1570-1/2 N DAMEN AVE  general format issue
280196     12450 S State (1E&W)  general format issue
280799     1570-1/2 N DAMEN AVE  general format issue
283008  12302 S State St (1E&W)  general format issue
283869       806- 808 E 79TH ST  general format issue

[346 rows x 2 columns]


In [59]:
invalid_addresses.to_csv('invalid_addresses.csv')

# DBA Name

In [60]:
import pandas as pd

# Load the inspections CSV file
df = pd.read_csv("Food_Inspections_20250216.csv")  # Update filename if needed

# Count missing values
missing_DBA_Name= df['DBA Name'].isna().sum()

# Get distinct non-null values
DBA_Name_types = df['DBA Name'].dropna().unique()

# Output (the missing-count label names the column actually summarised;
# it previously said "Facility Type")
print("🏢 Distinct DBA Name Types:")
for val in sorted(DBA_Name_types):
    print("-", val)
print(f"🔸 Missing DBA Name values: {missing_DBA_Name}")


🏢 Distinct DBA Name Types:
- "GENGHIS" EATERY
- "M" BAR
- "SWEET SUGAH" CATERING BOUTIQUE
- #1 CHINA EXPRESS, LTD.
- #1 CHOP SUEY
- #1 CHOP SUEY RESTAURANT
- #1 CHOP SUEY RESTAURANT, INC
- #1 DELI
- #1 DELI, INC
- #1 MARKET INC.
- #1 WOK N ROLL
- 'C' A CATERING AND EVENT COMPANY
- 'LAINE'S BAKE SHOP
- (K)  NEW  RESTAURANT
- 1 2 3 EXPRESS
- 1 FISH 2 FISH
- 1 JACKPOT MINI MART, INC.
- 1 KARACHI CHAAT HOUSE
- 1 N WACKER KITCHEN AND BAR
- 1 SHOP & SAVE, INC
- 1 STOP  FOOD MART
- 1 STOP & SHOP INC.
- 1 STOP MARKET & GRILL
- 1,200 SQ.FT. - YEMEN RESTAURANT, INC.
- 1,4,3,7 SOUL RESTAURANT
- 1-94 RIBS & GRILL
- 10 PIN  BOWLING LOUNGE
- 10 SOUTH MARKET
- 100 PENNIES STORE
- 1000 LIQUORS / BIG CITY TAP
- 101 CLUB
- 1021 MONTROSE
- 103 CHICKEN & MORE, INC.
- 103RD CITGO INC
- 1053 W 103RD INC
- 106 & HALSTED
- 10640 S HALSTED
- 10Q CHICKEN
- 11 DEGREES NORTH
- 11 DINING
- 11 DINING, LLC
- 11 DINING,LLC
- 11 DINNING
- 11-7 FOOD & LIQUOR
- 1104 WABASH
- 111 COFFEE BAR
- 111 CONCESSION SHOP
- 111 FO

Open questions:
- Should all text columns be converted to upper case?
- Should rows with the same DBA Name also share the same Location, License #, Facility Type, Address, City, State, and Zip?

In [None]:
import pandas as pd
from fuzzywuzzy import process

# Load dataset
df = pd.read_csv("Food_Inspections_20250216.csv")

# Columns to clean
columns_to_clean = ['Facility Type', 'City', 'Address', 'Inspection Type']

# Dictionary to store grouped labels for display
all_grouped_labels = {}

# Function to clean one column
def clean_column_fuzzy(df, column_name, threshold=85):
    """Upper-case one text column and merge near-duplicate labels into one.

    The column is trimmed and upper-cased in place, then a new
    "<column_name> Cleaned" column maps every label to the canonical label of
    its fuzzy-match group.  The groups are also stored in the module-level
    ``all_grouped_labels`` dict for display.

    Args:
        df: DataFrame to clean; it is modified in place and returned.
        column_name: Name of the column to clean.
        threshold: Minimum ``process.extract`` score for a label to join a
            group.

    Returns:
        ``df`` with the normalised column and the added "... Cleaned" column.
    """
    print(f"\n🔍 Processing column: {column_name}")

    # Step 1: Capitalize everything.
    # Missing values are masked back to NaN afterwards: astype(str) alone turns
    # NaN into the text "NAN", which dropna() below cannot remove and which
    # would then be fuzzy-matched like a real label.
    original = df[column_name]
    df[column_name] = original.astype(str).str.strip().str.upper().where(original.notna())

    # Step 2: Get unique values
    unique_values = df[column_name].dropna().unique().tolist()

    # Step 3: Fuzzy group similar values
    grouped_map = {}     # raw -> cleaned
    grouped_labels = {}  # cleaned -> [originals]

    for value in unique_values:
        if value in grouped_map:
            continue  # Already grouped

        # Fuzzy match: similarity >= threshold.  Labels already claimed by an
        # earlier group stay there, so grouped_map and grouped_labels never
        # disagree about which group a label belongs to.
        matches = process.extract(value, unique_values, limit=None)
        close_matches = [
            m for m, score in matches
            if score >= threshold and m not in grouped_map
        ]

        # A label always belongs to its own group, even when nothing reaches
        # the threshold (guards close_matches[0] against an empty list).
        if value not in close_matches:
            close_matches.insert(0, value)

        canonical = close_matches[0]

        for match in close_matches:
            grouped_map[match] = canonical

        grouped_labels[canonical] = close_matches

    # Step 4: Create new cleaned column
    cleaned_col = f"{column_name} Cleaned"
    df[cleaned_col] = df[column_name].map(grouped_map)

    # Save grouping for display
    all_grouped_labels[column_name] = grouped_labels

    return df

# Run the fuzzy clean-up over every configured column
for col in columns_to_clean:
    df = clean_column_fuzzy(df, col)

# Persist the cleaned frame
df.to_csv("Food_Inspections_Cleaned.csv", index=False)
print("\n✅ Cleaned data saved as: Food_Inspections_Cleaned.csv")

# Report only the groups that merged two or more labels
for col, group_map in all_grouped_labels.items():
    print(f"\n📦 Grouped values for '{col}':")
    for canonical, group in group_map.items():
        if len(group) <= 1:
            continue
        print(f"\n  → {canonical}:")
        for g in group:
            print(f"     - {g}")





🔍 Processing column: Facility Type

🔍 Processing column: City

🔍 Processing column: Address


In [6]:
#pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from fuzzywuzzy import process

# Load dataset (relative path: the CSV must sit in the notebook's working directory)
df = pd.read_csv("Food_Inspections_20250216.csv")

# Columns to clean (clean_column_fuzzy adds a "<name> Cleaned" column for each)
columns_to_clean = ['Facility Type', 'City', 'Inspection Type']

# Dictionary to store grouped labels for display:
# column name -> {canonical label -> [labels grouped under it]}; filled by clean_column_fuzzy
all_grouped_labels = {}

# Function to clean one column
def clean_column_fuzzy(df, column_name, threshold=85):
    """Upper-case one text column and merge near-duplicate labels into one.

    The column is trimmed and upper-cased in place, then a new
    "<column_name> Cleaned" column maps every label to the canonical label of
    its fuzzy-match group.  The groups are also stored in the module-level
    ``all_grouped_labels`` dict for display.

    Args:
        df: DataFrame to clean; it is modified in place and returned.
        column_name: Name of the column to clean.
        threshold: Minimum ``process.extract`` score for a label to join a
            group.

    Returns:
        ``df`` with the normalised column and the added "... Cleaned" column.
    """
    print(f"\n🔍 Processing column: {column_name}")

    # Step 1: Capitalize everything.
    # Missing values are masked back to NaN afterwards: astype(str) alone turns
    # NaN into the text "NAN", which dropna() below cannot remove and which
    # would then be fuzzy-matched like a real label.
    original = df[column_name]
    df[column_name] = original.astype(str).str.strip().str.upper().where(original.notna())

    # Step 2: Get unique values
    unique_values = df[column_name].dropna().unique().tolist()

    # Step 3: Fuzzy group similar values
    grouped_map = {}     # raw -> cleaned
    grouped_labels = {}  # cleaned -> [originals]

    for value in unique_values:
        if value in grouped_map:
            continue  # Already grouped

        # Fuzzy match: similarity >= threshold.  Labels already claimed by an
        # earlier group stay there, so grouped_map and grouped_labels never
        # disagree about which group a label belongs to.
        matches = process.extract(value, unique_values, limit=None)
        close_matches = [
            m for m, score in matches
            if score >= threshold and m not in grouped_map
        ]

        # A label always belongs to its own group, even when nothing reaches
        # the threshold (guards close_matches[0] against an empty list).
        if value not in close_matches:
            close_matches.insert(0, value)

        canonical = close_matches[0]

        for match in close_matches:
            grouped_map[match] = canonical

        grouped_labels[canonical] = close_matches

    # Step 4: Create new cleaned column
    cleaned_col = f"{column_name} Cleaned"
    df[cleaned_col] = df[column_name].map(grouped_map)

    # Save grouping for display
    all_grouped_labels[column_name] = grouped_labels

    return df

# Run the fuzzy clean-up over every configured column
for col in columns_to_clean:
    df = clean_column_fuzzy(df, col)

# Persist the cleaned frame
df.to_csv("Food_Inspections_Cleaned.csv", index=False)
print("\n✅ Cleaned data saved as: Food_Inspections_Cleaned.csv")

# Report only the groups that merged two or more labels
for col, group_map in all_grouped_labels.items():
    print(f"\n📦 Grouped values for '{col}':")
    for canonical, group in group_map.items():
        if len(group) <= 1:
            continue
        print(f"\n  → {canonical}:")
        for g in group:
            print(f"     - {g}")





🔍 Processing column: Facility Type

🔍 Processing column: City

🔍 Processing column: Inspection Type

✅ Cleaned data saved as: Food_Inspections_Cleaned.csv

📦 Grouped values for 'Facility Type':

  → DAYCARE COMBO 1586:
     - DAYCARE COMBO 1586
     - DAYCARE COMBO
     - DAY CARE COMBO (1586)
     - DAYCARE
     - DAYCARE ABOVE AND UNDER 2 YEARS
     - DAYCARE 1586

  → RESTAURANT:
     - RESTAURANT
     - RESTAURANT/BAR
     - GROCERY & RESTAURANT
     - GROCERY/RESTAURANT
     - RESTAURANT/HOSPITAL
     - RESTAURANT/GROCERY STORE
     - RESTAURANT(PROTEIN SHAKE BAR)
     - GAS STATION/RESTAURANT
     - RESTAURANT/LIQUOR
     - TAVERN/RESTAURANT
     - GROCERY STORE/RESTAURANT
     - RESTAURANT.BANQUET HALLS
     - RESTAURANT AND LIQUOR
     - RESTAURANT/BAR/THEATER
     - GROCERY/ RESTAURANT
     - GROCERY STORE/ RESTAURANT
     - RESTAURANT/BAKERY
     - RESTAURANT/GROCERY
     - BAKERY/ RESTAURANT
     - BAKERY/RESTAURANT

  → GROCERY STORE:
     - GROCERY STORE
     - GROCERY/DR

In [2]:
import pandas as pd
from fuzzywuzzy import process

# Load dataset (relative path: the CSV must sit in the notebook's working directory)
df = pd.read_csv("Food_Inspections_20250216.csv")

# Columns to clean (clean_column_fuzzy adds a "<name> Cleaned" column for each)
columns_to_clean = ['Facility Type', 'City', 'Inspection Type']

# Dictionary to store grouped labels for display:
# column name -> {canonical label -> [labels grouped under it]}; filled by clean_column_fuzzy
all_grouped_labels = {}

# Function to clean one column
def clean_column_fuzzy(df, column_name, threshold=95):
    """Upper-case one text column and merge near-duplicate labels into one.

    The column is trimmed and upper-cased in place, then a new
    "<column_name> Cleaned" column maps every label to the canonical label of
    its fuzzy-match group.  The groups are also stored in the module-level
    ``all_grouped_labels`` dict for display.

    Args:
        df: DataFrame to clean; it is modified in place and returned.
        column_name: Name of the column to clean.
        threshold: Minimum ``process.extract`` score for a label to join a
            group.

    Returns:
        ``df`` with the normalised column and the added "... Cleaned" column.
    """
    print(f"\n🔍 Processing column: {column_name}")

    # Step 1: Capitalize everything.
    # Missing values are masked back to NaN afterwards: astype(str) alone turns
    # NaN into the text "NAN", which dropna() below cannot remove and which
    # would then be fuzzy-matched like a real label.
    original = df[column_name]
    df[column_name] = original.astype(str).str.strip().str.upper().where(original.notna())

    # Step 2: Get unique values
    unique_values = df[column_name].dropna().unique().tolist()

    # Step 3: Fuzzy group similar values
    grouped_map = {}     # raw -> cleaned
    grouped_labels = {}  # cleaned -> [originals]

    for value in unique_values:
        if value in grouped_map:
            continue  # Already grouped

        # Fuzzy match: similarity >= threshold.  Labels already claimed by an
        # earlier group stay there, so grouped_map and grouped_labels never
        # disagree about which group a label belongs to.
        matches = process.extract(value, unique_values, limit=None)
        close_matches = [
            m for m, score in matches
            if score >= threshold and m not in grouped_map
        ]

        # A label always belongs to its own group, even when nothing reaches
        # the threshold (guards close_matches[0] against an empty list).
        if value not in close_matches:
            close_matches.insert(0, value)

        canonical = close_matches[0]

        for match in close_matches:
            grouped_map[match] = canonical

        grouped_labels[canonical] = close_matches

    # Step 4: Create new cleaned column
    cleaned_col = f"{column_name} Cleaned"
    df[cleaned_col] = df[column_name].map(grouped_map)

    # Save grouping for display
    all_grouped_labels[column_name] = grouped_labels

    return df

# Run the fuzzy clean-up over every configured column
for col in columns_to_clean:
    df = clean_column_fuzzy(df, col)

# Persist the cleaned frame
df.to_csv("Food_Inspections_Cleaned.csv", index=False)
print("\n✅ Cleaned data saved as: Food_Inspections_Cleaned.csv")

# Report only the groups that merged two or more labels
for col, group_map in all_grouped_labels.items():
    print(f"\n📦 Grouped values for '{col}':")
    for canonical, group in group_map.items():
        if len(group) <= 1:
            continue
        print(f"\n  → {canonical}:")
        for g in group:
            print(f"     - {g}")



🔍 Processing column: Facility Type

🔍 Processing column: City

🔍 Processing column: Inspection Type

✅ Cleaned data saved as: Food_Inspections_Cleaned.csv

📦 Grouped values for 'Facility Type':

  → DAYCARE COMBO 1586:
     - DAYCARE COMBO 1586
     - DAYCARE COMBO
     - DAY CARE COMBO (1586)

  → RESTAURANT:
     - RESTAURANT
     - RESTAURANT/BAR

  → GROCERY STORE:
     - GROCERY STORE
     - GROCERY/DRUG STORE
     - DRUG STORE/GROCERY
     - GROCERY STORE/DELI

  → CHILDREN'S SERVICES FACILITY:
     - CHILDREN'S SERVICES FACILITY
     - CHILDRENS SERVICES FACILITY
     - 1023-CHILDREN'S SERVICES FACILITY
     - 1023 CHILDREN'S SERVICES FACILITY
     - CHILDERN'S SERVICES  FACILITY

  → DAYCARE ABOVE AND UNDER 2 YEARS:
     - DAYCARE ABOVE AND UNDER 2 YEARS
     - DAYCARE (UNDER 2 YEARS)

  → LONG TERM CARE:
     - LONG TERM CARE
     - LONG-TERM CARE

  → DAYCARE (2 - 6 YEARS):
     - DAYCARE (2 - 6 YEARS)
     - DAYCARE (2 YEARS)

  → MOBILE FROZEN DESSERTS VENDOR:
     - MOBIL

In [3]:
import pandas as pd
import re

# Load the CSV written by the fuzzy-cleaning cell above
df = pd.read_csv("Food_Inspections_Cleaned.csv")  # Adjust filename if needed

# Function to extract and analyze violations
def extract_violations(row):
    """Split one row's ``Violations`` text into one record per violation.

    The text is treated as a ``|``-separated list of entries shaped like
    ``"<number>. <text> - Comments: <comment>"``.  Entries that do not fit
    that shape are kept and flagged instead of being dropped.

    Args:
        row: One row of the inspections frame (a ``pd.Series``).

    Returns:
        A list of dicts, one per non-empty entry.  Each dict holds every
        column of ``row`` except ``Violations`` plus ``raw_violation``,
        ``violation_number``, ``violation_text``, ``violation_comment``,
        ``parse_error`` and ``error_reason``.  The list is empty when
        ``Violations`` is missing or is not a string.
    """
    violation_text = row.get('Violations', '')

    # Missing cells arrive as NaN (a float); anything that is not a string
    # has nothing to split, so it yields no records instead of crashing.
    if not isinstance(violation_text, str):
        return []

    parts = [v.strip() for v in violation_text.split('|') if v.strip()]
    pattern = r"(?P<number>\d+)\.\s+(?P<text>.+?)\s+-\s+Comments:\s+(?P<comment>.+)"

    # The row context is identical for every entry: build it once and copy it
    # per entry.  errors='ignore' keeps rows without a Violations column valid.
    base = row.drop(labels=['Violations'], errors='ignore').to_dict()

    extracted = []
    for part in parts:
        match = re.match(pattern, part)
        combined = dict(base)
        combined['raw_violation'] = part

        if match:
            v = match.groupdict()
            combined['violation_number'] = v['number']
            combined['violation_text'] = v['text']
            combined['violation_comment'] = v['comment']
            combined['parse_error'] = False
            combined['error_reason'] = ""
        else:
            # Attempt to detect error reason
            if not re.search(r"\d+\.", part):
                reason = "missing violation number"
            elif "Comments:" not in part:
                reason = "missing 'Comments:'"
            else:
                reason = "general format mismatch"

            combined['violation_number'] = None
            combined['violation_text'] = None
            combined['violation_comment'] = None
            combined['parse_error'] = True
            combined['error_reason'] = reason

        extracted.append(combined)
    return extracted

# Process each row
expanded_rows = []
for _, row in df.iterrows():
    expanded_rows.extend(extract_violations(row))

# Create the final DataFrame
violations_expanded_df = pd.DataFrame(expanded_rows)

# Save to CSV
# One variable feeds both to_csv and the message, so the message always names
# the file that was written (it previously named a different file).
output_path = "Food_Inspections_Violations_Expanded_with_cleandata.csv"
violations_expanded_df.to_csv(output_path, index=False)

print(f"✅ Done! File saved as: {output_path}")

✅ Done! File saved as: Food_Inspections_Violations_Expanded.csv


In [None]:
import pandas as pd
import itertools

# === Step 1: Load your data ===
# Replace with your actual filename
file_path = "Food_Inspections_Violations_Expanded_with_cleandata.csv"
df = pd.read_csv(file_path)

# Drop columns that are entirely empty
df = df.dropna(axis=1, how='all')

# === Step 2: Set up ===
# NOTE(review): Step 4 runs one groupby per (A, B) pair, so the number of passes grows quickly
# with the column count; this cell has no recorded output — confirm it finishes on the full file.
columns = df.columns.tolist()
max_comb_size = 2  # We’ll check 1 or 2 column combos
valid_fds = []  # "A → B" strings for dependencies with no violating group
violated_fds = {}  # "A → B" string -> first 3 violating groups (filled in Step 4)

# === Step 3: FD Check Function ===
def check_fd(data, A, B):
    """Test whether the column list ``A`` functionally determines ``B``.

    Returns a ``(holds, offenders)`` pair.  ``offenders`` has one row per
    ``A``-group in which at least one ``B`` column takes more than one
    distinct value (cells are the per-column distinct counts); ``holds`` is
    True when there are no such rows.
    """
    distinct_per_group = data.groupby(A)[B].nunique()
    breaks_fd = distinct_per_group.gt(1).any(axis=1)
    offenders = distinct_per_group[breaks_fd]
    return offenders.empty, offenders

# === Step 4: Try all combinations ===
# Every determinant set A (1..max_comb_size columns) is tested against every
# dependent set B of the same size range drawn from the columns not in A.
for a_len in range(1, max_comb_size + 1):
    for A in itertools.combinations(columns, a_len):
        remaining_cols = [col for col in columns if col not in A]
        for b_len in range(1, max_comb_size + 1):
            for B in itertools.combinations(remaining_cols, b_len):
                fd_str = f"{list(A)} → {list(B)}"
                is_valid, sample_violations = check_fd(df, list(A), list(B))
                if is_valid:
                    valid_fds.append(fd_str)
                    continue
                if sample_violations.empty:
                    continue
                # Keep only the first few violating groups as evidence
                violated_fds[fd_str] = sample_violations.head(3)

# === Step 5: Save results ===
# Dependencies that held on the whole frame
valid_fds_df = pd.DataFrame(valid_fds, columns=["Functional Dependency"])
valid_fds_df.to_csv("valid_functional_dependencies.csv", index=False)

# One summary row per violated dependency, with its sample groups as records
violation_records = []
for fd, v in violated_fds.items():
    violation_records.append(
        {"FD": fd, "Sample Violations": v.reset_index().to_dict(orient='records')}
    )
violated_summary_df = pd.DataFrame(violation_records)
violated_summary_df.to_csv("violated_functional_dependencies_samples.csv", index=False)

print("✅ FD discovery complete.")
print("📄 Saved: valid_functional_dependencies.csv")
print("📄 Saved: violated_functional_dependencies_samples.csv")



In [None]:
import pandas as pd
from fuzzywuzzy import process

# Load dataset (violation-level CSV written by the extract_violations cell)
df = pd.read_csv("Food_Inspections_Violations_Expanded_with_cleandata.csv")

# Columns to clean
# NOTE(review): the recorded output of this cell stops right after "Processing column: Address"
# and the cell has no execution count — presumably the run never finished. Confirm before re-running.
columns_to_clean = ['Address']

# Dictionary to store grouped labels for display:
# column name -> {canonical label -> [labels grouped under it]}; filled by clean_column_fuzzy
all_grouped_labels = {}

# Function to clean one column
def clean_column_fuzzy(df, column_name, threshold=95):
    """Upper-case one text column and merge near-duplicate labels into one.

    The column is trimmed and upper-cased in place, then a new
    "<column_name> Cleaned" column maps every label to the canonical label of
    its fuzzy-match group.  The groups are also stored in the module-level
    ``all_grouped_labels`` dict for display.

    Args:
        df: DataFrame to clean; it is modified in place and returned.
        column_name: Name of the column to clean.
        threshold: Minimum ``process.extract`` score for a label to join a
            group.

    Returns:
        ``df`` with the normalised column and the added "... Cleaned" column.
    """
    print(f"\n🔍 Processing column: {column_name}")

    # Step 1: Capitalize everything.
    # Missing values are masked back to NaN afterwards: astype(str) alone turns
    # NaN into the text "NAN", which dropna() below cannot remove and which
    # would then be fuzzy-matched like a real label.
    original = df[column_name]
    df[column_name] = original.astype(str).str.strip().str.upper().where(original.notna())

    # Step 2: Get unique values
    unique_values = df[column_name].dropna().unique().tolist()

    # Step 3: Fuzzy group similar values
    grouped_map = {}     # raw -> cleaned
    grouped_labels = {}  # cleaned -> [originals]

    for value in unique_values:
        if value in grouped_map:
            continue  # Already grouped

        # Fuzzy match: similarity >= threshold.  Labels already claimed by an
        # earlier group stay there, so grouped_map and grouped_labels never
        # disagree about which group a label belongs to.
        matches = process.extract(value, unique_values, limit=None)
        close_matches = [
            m for m, score in matches
            if score >= threshold and m not in grouped_map
        ]

        # A label always belongs to its own group, even when nothing reaches
        # the threshold (guards close_matches[0] against an empty list).
        if value not in close_matches:
            close_matches.insert(0, value)

        canonical = close_matches[0]

        for match in close_matches:
            grouped_map[match] = canonical

        grouped_labels[canonical] = close_matches

    # Step 4: Create new cleaned column
    cleaned_col = f"{column_name} Cleaned"
    df[cleaned_col] = df[column_name].map(grouped_map)

    # Save grouping for display
    all_grouped_labels[column_name] = grouped_labels

    return df

# Apply to each target column
for col in columns_to_clean:
    df = clean_column_fuzzy(df, col)

# Save new CSV
# One variable feeds both to_csv and the message, so the message always names
# the file that was written (it previously announced Food_Inspections_Cleaned.csv).
output_path = "Food_Inspections_Violations_Expanded_with_cleandata_address.csv"
df.to_csv(output_path, index=False)
print(f"\n✅ Cleaned data saved as: {output_path}")

# Show grouping results (off by default).
# A flag replaces the former triple-quoted block: as the last expression of
# the cell, that bare string would have been echoed as the cell's output.
show_grouping_results = False
if show_grouping_results:
    for col, group_map in all_grouped_labels.items():
        print(f"\n📦 Grouped values for '{col}':")
        for canonical, group in group_map.items():
            if len(group) > 1:
                print(f"\n  → {canonical}:")
                for g in group:
                    print(f"     - {g}")




🔍 Processing column: Address


In [3]:
import pandas as pd
from thefuzz import process, fuzz
from collections import defaultdict

def fuzzy_normalize_column(df, column_name, threshold=80):
    """
    Normalize text values in a DataFrame column using fuzzy matching.

    Values are compared in lower case.  Every value is mapped to the
    representative of its similarity group and the result is stored in a new
    ``<column_name>_normalised`` column.  The source column itself is
    converted to strings in place (missing values become '').

    Args:
    - df (pd.DataFrame): Input DataFrame; modified in place.
    - column_name (str): Column name to normalize.
    - threshold (int): Similarity threshold for fuzzy matching (default is 80).

    Returns:
    - pd.DataFrame: The same DataFrame with the new normalized column added.
    """
    # Blank out missing values while casting to string.  Casting first and
    # filling afterwards does not work: astype(str) turns NaN into the text
    # 'nan', which leaves fillna('') nothing to fill.
    source = df[column_name]
    df[column_name] = source.astype(str).where(source.notna(), '')

    lowered = df[column_name].str.lower()

    # Sorted so groups are built in the same order on every run; the
    # iteration order of a set of strings can differ between sessions.
    unique_values = sorted(set(lowered))

    # Reference mapping for normalization: lower-cased value -> representative
    reference_mapping = {}

    for value in unique_values:
        # Check if it's already in a group
        if value in reference_mapping:
            continue

        # Find similar words
        matches = process.extract(value, unique_values, limit=10, scorer=fuzz.ratio)

        # The value seeds its own group and is that group's representative.
        # Recording it first guarantees every value ends up in the mapping,
        # including values for which extract() scores everything 0 (e.g. a
        # blank string); such values used to be mapped to NaN.
        reference_mapping[value] = value

        # Values already claimed by an earlier group keep that assignment
        for match, score in matches:
            if score >= threshold and match not in reference_mapping:
                reference_mapping[match] = value

    # Apply normalization mapping
    df[f'{column_name}_normalised'] = lowered.map(reference_mapping)

    return df

In [2]:
pip install thefuzz

Collecting thefuzzNote: you may need to restart the kernel to use updated packages.

  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.12.2-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.12.2-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------------------------- -------------- 1.0/1.6 MB 5.0 MB/s eta 0:00:01
   -------------------------------------- - 1.6/1.6 MB 6.0 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 2.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.12.2 thefuzz-0.22.1


In [4]:
# Load dataset (violation-level CSV written by the extract_violations cell)
df = pd.read_csv("Food_Inspections_Violations_Expanded_with_cleandata.csv")


In [5]:
# Adds 'Address_normalised' to df in place (threshold 95); the returned frame is what the cell displays.
fuzzy_normalize_column(df, 'Address', 95)

Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ' ']


Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Facility Type Cleaned,City Cleaned,Inspection Type Cleaned,raw_violation,violation_number,violation_text,violation_comment,parse_error,error_reason,Address_normalised
0,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,DAYCARE COMBO 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,...,DAYCARE COMBO 1586,CHICAGO,LICENSE,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,32.0,FOOD AND NON-FOOD CONTACT SURFACES PROPERLY DE...,PROVIDE A DRAIN BOARD AT THREE COMPARTMENT SINK.,False,,5715 n lincoln ave
1,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,DAYCARE COMBO 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,...,DAYCARE COMBO 1586,CHICAGO,LICENSE,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,32.0,FOOD AND NON-FOOD CONTACT SURFACES PROPERLY DE...,OBSERVED RUSTY GREASE TRAP UNDER THE THREE COM...,False,,5715 n lincoln ave
2,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,DAYCARE COMBO 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,...,DAYCARE COMBO 1586,CHICAGO,LICENSE,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",34.0,"FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD RE...",INSTRUCTED TO SWEEP FLOOR INSIDE THE FURNACE A...,False,,5715 n lincoln ave
3,1106427,BLOOMING BUD DAYCARE,BLOOMING BUD DAYCARE,2215789.0,DAYCARE COMBO 1586,Risk 1 (High),5715 N LINCOLN AVE,CHICAGO,IL,60659.0,...,DAYCARE COMBO 1586,CHICAGO,LICENSE,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",35.0,"WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTE...","OBSERVED DEAD BULB IN THE KITCHEN, INSTRUCTED ...",False,,5715 n lincoln ave
4,2608378,Babas Halal,Babas Halal,2684170.0,RESTAURANT,Risk 1 (High),7901 S DAMEN AVE,CHICAGO,IL,60620.0,...,RESTAURANT,CHICAGO,COMPLAINT,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",1.0,"PERSON IN CHARGE PRESENT, DEMONSTRATES KNOWLED...",PIC DOESN'T HAVE A CERTIFIED FOOD MANAGERS CER...,False,,7901 s damen ave
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939275,2610409,ACE SUSHI @POTASH MARKET,ACE SUSHI,2808766.0,RESTAURANT,Risk 1 (High),875 N STATE ST,CHICAGO,IL,60610.0,...,RESTAURANT,CHICAGO,CANVASS,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",55.0,"PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN",OBSERVED SMALL DEBRIS ON FLOOR IN PREP AREA. ...,False,,875 n state st
939276,2610409,ACE SUSHI @POTASH MARKET,ACE SUSHI,2808766.0,RESTAURANT,Risk 1 (High),875 N STATE ST,CHICAGO,IL,60610.0,...,RESTAURANT,CHICAGO,CANVASS,58. ALLERGEN TRAINING AS REQUIRED - Comments: ...,58.0,ALLERGEN TRAINING AS REQUIRED,OBSERVED NOT ALLERGEN TRAINING CERTIFICATE ON ...,False,,875 n state st
939277,2610823,"Kitchen Chicago, LLC","Kitchen Chicago, LLC",2119905.0,SHARED KITCHEN,Risk 1 (High),324 N LEAVITT ST,CHICAGO,IL,60612.0,...,SHARED KITCHEN,CHICAGO,CANVASS,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...,49.0,NON-FOOD/FOOD CONTACT SURFACES CLEAN,CLEAN THE STORAGE SHELVES IN ZELLA'S STORAGE C...,False,,324 n leavitt st
939278,2610823,"Kitchen Chicago, LLC","Kitchen Chicago, LLC",2119905.0,SHARED KITCHEN,Risk 1 (High),324 N LEAVITT ST,CHICAGO,IL,60612.0,...,SHARED KITCHEN,CHICAGO,CANVASS,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",55.0,"PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN",CLEAN THE FLOOR IN ZELLA'S STORAGE CLOSET. CL...,False,,324 n leavitt st


In [6]:
# Write df to CSV without the index column.
# NOTE(review): the earlier clean_column_fuzzy 'Address' cell writes to this same filename — confirm which output is meant to be kept.
df.to_csv("Food_Inspections_Violations_Expanded_with_cleandata_address.csv", index=False)

In [1]:
!git clone https://github.com/leonswl/ntu-msds-sd6104

Cloning into 'ntu-msds-sd6104'...
Updating files:  70% (17/24)
Updating files:  75% (18/24)
Updating files:  79% (19/24)
Updating files:  83% (20/24)
Updating files:  87% (21/24)
Updating files:  91% (22/24)
Updating files:  95% (23/24)
Updating files: 100% (24/24)
Updating files: 100% (24/24), done.


In [2]:
%cd ntu-msds-sd6104\notebooks\wangyu

C:\Users\A102730\Data preparation\ntu-msds-sd6104\notebooks\wangyu


In [3]:
!git add .

In [4]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [6]:
!git add project_v5.ipynb

fatal: pathspec 'project_v5.ipynb' did not match any files


In [7]:
%ls

 Volume in drive C has no label.
 Volume Serial Number is 58C0-E456

 Directory of C:\Users\A102730\Data preparation\ntu-msds-sd6104\notebooks\wangyu

04/05/2025  03:37 PM    <DIR>          .
04/05/2025  03:37 PM    <DIR>          ..
               0 File(s)              0 bytes
               2 Dir(s)  103,830,765,568 bytes free
