In [1]:
import pandas as pd 
import re

In [2]:
df = pd.read_csv('swiggy_all_menus_india.csv')
df.shape

(197430, 9)

In [3]:
# Check for rows with incorrect number of columns by reading the CSV file in raw mode
with open('swiggy_all_menus_india.csv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

expected_cols = len(df.columns)
malformed_rows = [i for i, line in enumerate(lines) if len(line.strip().split(',')) != expected_cols]

print(f"Number of malformed rows: {len(malformed_rows)}")
if malformed_rows:
    print(f"Malformed row indices (0-based): {malformed_rows[:10]}")  # Show first 10 for inspection
else:
    print("All rows have the correct number of columns.")

Number of malformed rows: 9630
Malformed row indices (0-based): [428, 1298, 1299, 1300, 1301, 1618, 1652, 1653, 1654, 1655]


In [4]:
# Extract and print the malformed rows for inspection
malformed_lines = [lines[i] for i in malformed_rows]

# Attempt to correct malformed rows by joining split lines if possible
# (Assuming the issue is due to line breaks within quoted fields)
corrected_lines = []
skip_indices = set()
i = 0
while i < len(lines):
    if i in malformed_rows:
        temp_line = lines[i].rstrip('\n')
        # Keep joining next lines until the number of columns matches expected_cols
        j = i + 1
        while len(temp_line.split(',')) < expected_cols and j < len(lines):
            temp_line += lines[j].rstrip('\n')
            skip_indices.add(j)
            j += 1
        corrected_lines.append(temp_line + '\n')
        i = j
    else:
        if i not in skip_indices:
            corrected_lines.append(lines[i])
        i += 1

# Save the corrected lines to a new CSV file
with open('swiggy_all_menus_india_cleaned.csv', 'w', encoding='utf-8') as f_out:
    f_out.writelines(corrected_lines)

In [5]:
df = pd.read_csv('swiggy_all_menus_india_cleaned.csv')
df.shape

(197430, 9)

In [6]:
df.head()

Unnamed: 0,State,City,Restaurant Name,Location,Category,Dish Name,Price (INR),Rating,Rating Count
0,Karnataka,Bengaluru,Anand Sweets & Savouries,Rajarajeshwari Nagar,Snack,Butter Murukku-200gm,133.9,0.0,0
1,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Badam Milk,52.0,4.5,25
2,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Chow Chow Bath,117.0,4.7,48
3,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Kesari Bath,65.0,4.6,65
4,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Mix Raitha,130.0,0.0,0


In [7]:
df.rename(columns={'Restaurant Name': 'Restaurant_Name','Dish Name':'Dish_Name','Price (INR)': 'Price_INR', 'Rating Count':'Rating_Count'}, inplace=True)

In [8]:
def remove_special_chars(val):
    if isinstance(val, str):
        return re.sub(r'[^A-Za-z0-9\s]', '', val)
    return val

df = df.applymap(remove_special_chars)

In [9]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [10]:
df = df.applymap(lambda x: re.sub(r'\s+', ' ', x) if isinstance(x, str) else x)

In [11]:
df.head()

Unnamed: 0,State,City,Restaurant_Name,Location,Category,Dish_Name,Price_INR,Rating,Rating_Count
0,Karnataka,Bengaluru,Anand Sweets Savouries,Rajarajeshwari Nagar,Snack,Butter Murukku200gm,133.9,0.0,0
1,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Badam Milk,52.0,4.5,25
2,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Chow Chow Bath,117.0,4.7,48
3,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Kesari Bath,65.0,4.6,65
4,Karnataka,Bengaluru,Srinidhi Sagar Deluxe,Kengeri,Recommended,Mix Raitha,130.0,0.0,0


In [12]:
df.to_csv('cleaned_swiggy_menu.csv', index=False)