In [328]:
# Step 1: Import essentials
import pandas as pd   
import numpy as np   
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.metrics.pairwise import cosine_similarity     
import warnings


In [329]:
warnings.filterwarnings("ignore")  # to ignore warnings

In [330]:
# Load the dataset
data = pd.read_csv("investments_VC.csv", encoding="latin1", low_memory=False)
data.head() 

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,...,secondary_market,product_crowdfunding,round_A,round_B,round_C,round_D,round_E,round_F,round_G,round_H
0,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,News,1750000,acquired,USA,NY,New York City,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Publishing,40000,operating,EST,,Tallinn,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Electronics,1500000,operating,GBR,,London,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,/organization/r-ranch-and-mine,-R- Ranch and Mine,,|Tourism|Entertainment|Games|,Tourism,60000,operating,USA,TX,Dallas,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [331]:
# Clean all column names once
data.columns = data.columns.str.strip()


In [332]:
data.columns 

Index(['permalink', 'name', 'homepage_url', 'category_list', 'market',
       'funding_total_usd', 'status', 'country_code', 'state_code', 'region',
       'city', 'funding_rounds', 'founded_at', 'founded_month',
       'founded_quarter', 'founded_year', 'first_funding_at',
       'last_funding_at', 'seed', 'venture', 'equity_crowdfunding',
       'undisclosed', 'convertible_note', 'debt_financing', 'angel', 'grant',
       'private_equity', 'post_ipo_equity', 'post_ipo_debt',
       'secondary_market', 'product_crowdfunding', 'round_A', 'round_B',
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H'],
      dtype='object')

In [333]:
data.shape

(54294, 39)

In [334]:
data.isnull().sum() #to check for null values

permalink                4856
name                     4857
homepage_url             8305
category_list            8817
market                   8824
funding_total_usd        4856
status                   6170
country_code            10129
state_code              24133
region                  10129
city                    10972
funding_rounds           4856
founded_at              15740
founded_month           15812
founded_quarter         15812
founded_year            15812
first_funding_at         4856
last_funding_at          4856
seed                     4856
venture                  4856
equity_crowdfunding      4856
undisclosed              4856
convertible_note         4856
debt_financing           4856
angel                    4856
grant                    4856
private_equity           4856
post_ipo_equity          4856
post_ipo_debt            4856
secondary_market         4856
product_crowdfunding     4856
round_A                  4856
round_B                  4856
round_C   

In [335]:
data.duplicated().sum() #to check for duplicate values

4855

In [336]:
data[data.duplicated()]  #to view the duplicate values

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,...,secondary_market,product_crowdfunding,round_A,round_B,round_C,round_D,round_E,round_F,round_G,round_H
49439,,,,,,,,,,,...,,,,,,,,,,
49440,,,,,,,,,,,...,,,,,,,,,,
49441,,,,,,,,,,,...,,,,,,,,,,
49442,,,,,,,,,,,...,,,,,,,,,,
49443,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54289,,,,,,,,,,,...,,,,,,,,,,
54290,,,,,,,,,,,...,,,,,,,,,,
54291,,,,,,,,,,,...,,,,,,,,,,
54292,,,,,,,,,,,...,,,,,,,,,,


In [337]:
data = data.drop_duplicates()


In [338]:
data.shape

(49439, 39)

In [339]:
# Remove extra spaces from column names
data.columns = data.columns.str.strip()

# Check again
print(data.columns)


Index(['permalink', 'name', 'homepage_url', 'category_list', 'market',
       'funding_total_usd', 'status', 'country_code', 'state_code', 'region',
       'city', 'funding_rounds', 'founded_at', 'founded_month',
       'founded_quarter', 'founded_year', 'first_funding_at',
       'last_funding_at', 'seed', 'venture', 'equity_crowdfunding',
       'undisclosed', 'convertible_note', 'debt_financing', 'angel', 'grant',
       'private_equity', 'post_ipo_equity', 'post_ipo_debt',
       'secondary_market', 'product_crowdfunding', 'round_A', 'round_B',
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H'],
      dtype='object')


In [340]:
# Drop rows where name, category_list, or market are missing
data = data.dropna(subset=['name', 'category_list', 'market'])
data.shape 

(45469, 39)

In [341]:
# Convert funding_total_usd to numeric (remove non-numeric stuff)
data['funding_total_usd'] = (
    data['funding_total_usd']
    .replace('[\$,]', '', regex=True)   # remove $ and commas
    .replace('?', None)                 # handle weird values
)

# Convert to numeric type (floats/ints), errors='coerce' will turn invalid to NaN
data['funding_total_usd'] = pd.to_numeric(data['funding_total_usd'], errors='coerce')

# Fill missing values with 0 (or median/mean, depending on use case)
data['funding_total_usd'] = data['funding_total_usd'].fillna(0)

# Check results
print(data['funding_total_usd'].head())


0    1750000.0
1    4000000.0
2      40000.0
3    1500000.0
4      60000.0
Name: funding_total_usd, dtype: float64


In [342]:
# Remove leading/trailing "|" and extra spaces
data['category_list'] = data['category_list'].str.strip('|').str.strip()

# Then split and take the first category
data['main_category'] = data['category_list'].str.split('|').str[0]

# Check again
print(data[['category_list', 'main_category']].head(10))



                                       category_list  main_category
0           Entertainment|Politics|Social Media|News  Entertainment
1                                              Games          Games
2                               Publishing|Education     Publishing
3  Electronics|Guides|Coffee|Restaurants|Music|iP...    Electronics
4                        Tourism|Entertainment|Games        Tourism
5                                           Software       Software
6                                        Advertising    Advertising
7                                        Curated Web    Curated Web
8                                           Software       Software
9                                              Games          Games


In [343]:
data = data[['name', 'main_category', 'market', 'funding_total_usd',
            'funding_rounds', 'status', 'country_code', 'founded_year']]



In [344]:
data.shape

(45469, 8)

In [345]:
data.columns = data.columns.str.strip()


In [346]:
print(data.columns.tolist())


['name', 'main_category', 'market', 'funding_total_usd', 'funding_rounds', 'status', 'country_code', 'founded_year']


In [347]:
# Numerical
data['funding_rounds'] = data['funding_rounds'].fillna(data['funding_rounds'].median())
data['founded_year'] = data['founded_year'].fillna(data['founded_year'].median())

# Categorical
data['market'] = data['market'].fillna('Unknown')
data['status'] = data['status'].fillna('Unknown')
data['country_code'] = data['country_code'].fillna('Unknown')


In [348]:
data.head()

Unnamed: 0,name,main_category,market,funding_total_usd,funding_rounds,status,country_code,founded_year
0,#waywire,Entertainment,News,1750000.0,1.0,acquired,USA,2012.0
1,&TV Communications,Games,Games,4000000.0,2.0,operating,USA,2009.0
2,'Rock' Your Paper,Publishing,Publishing,40000.0,1.0,operating,EST,2012.0
3,(In)Touch Network,Electronics,Electronics,1500000.0,1.0,operating,GBR,2011.0
4,-R- Ranch and Mine,Tourism,Tourism,60000.0,2.0,operating,USA,2014.0


In [349]:
# Create a 'profile' column using original categorical columns BEFORE encoding
data['profile'] = (
    data['main_category'].astype(str) + " " +
    data['market'].astype(str) + " " +
    data['status'].astype(str)
)


In [350]:
data.head 

<bound method NDFrame.head of                                 name        main_category  \
0                           #waywire        Entertainment   
1                 &TV Communications                Games   
2                  'Rock' Your Paper           Publishing   
3                  (In)Touch Network          Electronics   
4                 -R- Ranch and Mine              Tourism   
...                              ...                  ...   
49433                          Zzish            Analytics   
49434  ZZNode Science and Technology  Enterprise Software   
49435          Zzzzapp Wireless ltd.      Web Development   
49436                  [a]list games                Games   
49437                          [x+1]  Enterprise Software   

                      market  funding_total_usd  funding_rounds     status  \
0                      News           1750000.0             1.0   acquired   
1                     Games           4000000.0             2.0  operating   
2  

In [351]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['profile'])


In [356]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_startups(user_input, top_n=5, min_budget=None, max_budget=None, country=None):
    user_profile = " ".join(user_input)
    user_vec = vectorizer.transform([user_profile])
    similarity = cosine_similarity(user_vec, tfidf_matrix).flatten()
    top_indices = similarity.argsort()[::-1]

    recommendations = data.iloc[top_indices]

    # Budget filter
    if min_budget is not None:
        recommendations = recommendations[recommendations['funding_total_usd'] >= min_budget]
    if max_budget is not None:
        recommendations = recommendations[recommendations['funding_total_usd'] <= max_budget]

    # Country filter
    if country is not None:
        recommendations = recommendations[recommendations['country_code'] == country]

    # Format funding column for display
    recommendations['funding_total_usd'] = recommendations['funding_total_usd'].apply(lambda x: f"${x:,.0f}")

    return recommendations.head(top_n)[
        ['name', 'main_category', 'market', 'funding_total_usd',
         'status', 'country_code', 'founded_year']
    ]




In [357]:
recommend_startups(["AI", "Software"], top_n=5, min_budget=1e6, max_budget=1e7, country="USA")



Unnamed: 0,name,main_category,market,funding_total_usd,status,country_code,founded_year
1991,AllPlayers.com,Software,Software,"$2,449,500",operating,USA,2009.0
3284,Arrayent Health,Software,Software,"$1,083,000",operating,USA,2011.0
47139,Webflow,Software,Software,"$1,500,000",operating,USA,2012.0
8899,CloudHealth Technologies,Software,Software,"$7,700,000",operating,USA,2012.0
30083,Ohana Companies,Software,Software,"$2,500,000",operating,USA,2007.0
