In [1]:
import pandas as pd
import numpy as np
import re


In [5]:
df = pd.read_csv('startup_funding_cleaned.csv')
df.head(5)

Unnamed: 0,Sr No,date,startup,vertical,subvertical,city,investors,round,amount
0,1,2020-01-09,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,1650.0
1,2,2020-01-13,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,66.39925
2,3,2020-01-09,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,151.460595
3,4,2020-01-02,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,24.75
4,5,2020-01-02,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,14.85


In [7]:
df["startup"].unique() 

array(['BYJU’S', 'Shuttl', 'Mamaearth', ..., 'Dazo', 'Tradelab', 'PiQube'],
      dtype=object)

In [9]:
df['startup'] = df['startup'].astype(str).str.strip().str.title()


In [19]:
import re

def clean_startup_name(name):
    if pd.isna(name):
        return 'Unknown'
    
    name = str(name).strip().title()  # Normalize case and whitespace
    
    # Remove common web suffixes and symbols
    name = re.sub(r'\.com|\.in|\.org|\.net|\.co|\.be|\.io|\.ai', '', name, flags=re.IGNORECASE)
    name = re.sub(r'http[s]?://\S+', '', name)  # Remove URLs
    name = re.sub(r'www\.\S+', '', name)        # Remove www links
    name = re.sub(r'[^A-Za-z0-9 &]', '', name)  # Remove special characters except space and &
    
    return name.strip()

# Apply to the column
df['startup'] = df['startup'].apply(clean_startup_name)


In [45]:
df.head(5)

Unnamed: 0,Sr No,date,startup,vertical,subvertical,city,investors,round,amount
0,1,2020-01-09,ByjuS,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,1650.0
1,2,2020-01-13,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,66.39925
2,3,2020-01-09,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,151.460595
3,4,2020-01-02,HttpsWwwWealthbucket,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,24.75
4,5,2020-01-02,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,14.85


In [29]:
pip install rapidfuzz


Defaulting to user installation because normal site-packages is not writeable
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------------------- -------------------- 0.8/1.6 MB 3.0 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 3.8 MB/s eta 0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.


In [31]:
from rapidfuzz import fuzz, process
unique_names = df['startup'].dropna().unique()
correction_map = {}
threshold = 85
for name in unique_names:
    matches = process.extract(name, unique_names, scorer=fuzz.token_sort_ratio, limit=10)
    for match_name, score, _ in matches:
        if score >= threshold and match_name != name:
            correction_map[match_name] = name
df['startup'] = df['startup'].replace(correction_map)


In [35]:
for k, v in correction_map.items():
    print(f'"{k}" : "{v}"')


"Ayefinance" : "Aye Finance"
"Fyle Technologies" : "Fpl Technologies"
"Availfinance" : "Avail Finance"
"Craftstvilla" : "Craftsvilla"
"Unaacademy" : "Unacademy"
"Healthcare" : "My Healthcare"
"Guidoo" : "Guiddoo"
"Veritas Finance" : "Veritas Finance Ltd"
"Lets Transport" : "Letstransport"
"Healthi" : "Healthmir"
"Zippserve" : "Zippserv"
"Moengage" : "Mengage"
"Log 9 Materials" : "Log9 Materials"
"Entropika" : "Entropik"
"Aye Finance" : "Ayefinance"
"Dailyninja" : "Daily Ninja"
"Openapp" : "Opentap"
"Travel Triangle" : "Traveltriangle"
"Fpl Technologies" : "Fyle Technologies"
"Wealthy" : "Wellthy"
"My Healthcare" : "Healthcare"
"Avail Finance" : "Availfinance"
"Healthfin" : "Healthi"
"Healthmir" : "Healthi"
"High Radius" : "Highradius"
"Early Salary" : "Earlysalary"
"Vahdam Teas" : "Vahdam Tea"
"Logicroots" : "Logic Roots"
"91Springboard" : "Springboard"
"Opentap" : "Openapp"
"Happilyunmarried" : "Happily Unmarried"
"Creditmate" : "Credit Mate"
"Tempogo" : "Tempgo"
"Little Black Book De

In [37]:
df['startup'] = df['startup'].apply(lambda x: re.sub(r'[^A-Za-z0-9 &]', '', x))


In [39]:
df['startup'] = df['startup'].replace('', np.nan)
df['startup'] = df['startup'].fillna('Unknown')  # Or use df.dropna(subset=['startup']) to drop


In [41]:
print(df['startup'].value_counts())


startup
Swiggy              8
Ola Cabs            8
Healthifyme         6
Meesho              6
Urbanclap           6
                   ..
Intelligencenode    1
Ticketnew           1
Freecharge          1
Yostra              1
Piqube              1
Name: count, Length: 2169, dtype: int64


In [51]:
df['startup'] = df['startup'].replace("HttpsWwwWealthbucket","Wealth bucket")

In [55]:
df.head(5)

Unnamed: 0,Sr No,date,startup,vertical,subvertical,city,investors,round,amount
0,1,2020-01-09,ByjuS,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,1650.0
1,2,2020-01-13,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,66.39925
2,3,2020-01-09,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,151.460595
3,4,2020-01-02,Wealth bucket,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,24.75
4,5,2020-01-02,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,14.85


In [63]:
df.to_csv('startup_funding_cleaned_final1.csv', index=False)