In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the company export; the file is decoded as ISO-8859-1 (Latin-1)
# rather than pandas' default UTF-8.
data = pd.read_csv(
    "data.csv",
    encoding="ISO-8859-1",
)

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None"
# line after the summary.
data.info()
# Full list of column names (info() only shows the first and last for wide frames).
print(data.columns.values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48488 entries, 0 to 48487
Columns: 130 entries, Unnamed: 0 to PitchBook Link
dtypes: float64(58), int64(1), object(71)
memory usage: 48.1+ MB
None
['Unnamed: 0' 'Company ID' 'Company Name' 'Company Former Name'
 'Company Also Known As' 'PBId' 'Description' 'Primary Industry Sector'
 'Primary Industry Group' 'Primary Industry Code' 'All Industries'
 'Industry Vertical' 'Company Financing Status' 'Total Raised'
 'Business Status' 'Ownership Status' 'Universe' 'Website' 'Employees'
 'Exchange' 'Ticker' 'Year Founded' 'Parent Company' 'Daily Updates'
 'Weekly Updates' 'Revenue' 'Gross Profit' 'Net Income' 'Enterprise Value'
 'EBITDA' 'Fiscal Period' 'Primary Contact PBId' 'Primary Contact'
 'Primary Contact Title' 'Primary Contact Email' 'Primary Contact Phone'
 'HQ Location' 'HQ Address Line 1' 'HQ Address Line 2' 'HQ City'
 'HQ State/Province' 'HQ Post Code' 'HQ Country' 'HQ Phone' 'HQ Fax'
 'HQ Email' 'HQ Global Region' 'HQ Global Sub Re

In [4]:
def preprocess_data(df):
    """Turn the descriptive columns of ``df`` into a TF-IDF feature matrix.

    For each row, the non-null values of the selected columns are cast to
    ``str`` and joined with commas. The joined text is stored on ``df`` as a
    new ``'feature_string'`` column (``df`` is modified in place) and then
    fed to a ``TfidfVectorizer`` with default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``selected_columns`` below;
        a missing column makes the selection raise ``KeyError``.

    Returns
    -------
    The result of ``TfidfVectorizer().fit_transform(df['feature_string'])``.
    """
    selected_columns = [
        'Description',
        'Primary Industry Sector',
        'Primary Industry Group',
        'All Industries',
        'Industry Vertical',
        'Company Financing Status',
        'Business Status',
        'Ownership Status',
        'Financing Status Note',
        'Active Investors',
        'Acquirers',
        'Former Investors',
        'General Services',
    ]

    def _join_present_values(row):
        # Missing cells are dropped so they do not show up as "nan" tokens.
        return ','.join(row.dropna().astype(str))

    subset = df[selected_columns]
    # NOTE(review): the first selected column ('Description') is sliced off
    # here, so it never reaches the vectorizer -- confirm this is intended.
    attributes = subset[subset.columns[1:]]
    df['feature_string'] = attributes.apply(_join_present_values, axis=1)

    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(df['feature_string'])

In [7]:
# Build the TF-IDF features once for the whole table. Note that
# preprocess_data also adds a 'feature_string' column to `data` in place.
tfidf_matrix = preprocess_data(df=data)

In [8]:
def similar_comp(company_name, tfidf, top_n):
    """Print the ``top_n`` companies most similar to ``company_name``.

    Similarity is the cosine similarity between the query company's row of
    ``tfidf`` and every row of ``tfidf``; the query row itself takes part in
    the ranking. Nothing is returned, the result is printed.

    Parameters
    ----------
    company_name : str
        Exact value to look up in the ``'Company Name'`` column of the
        module-level ``data`` frame. If several rows match, the first one
        is used as the query.
    tfidf : matrix
        Feature matrix whose i-th row belongs to the i-th row of ``data``
        (positional alignment).
    top_n : int
        Number of best-scoring rows to print.

    Raises
    ------
    ValueError
        If no row of ``data`` carries the given company name.
    """
    # Work with row *positions*, not index labels: `tfidf` is purely
    # positional, and labels only coincide with positions when `data` has a
    # default RangeIndex.
    matches = np.flatnonzero((data['Company Name'] == company_name).values)
    if matches.size == 0:
        raise ValueError(
            "Company %r not found in data['Company Name']" % (company_name,)
        )
    query_pos = int(matches[0])

    # Indexing with a list keeps the query 2-D (1 x n_features).
    scores = cosine_similarity(tfidf[[query_pos]], tfidf)[0]

    # One stable sort on the negated scores: descending order, ties keep
    # their original row order (same ordering as sorted(..., reverse=True)).
    top_positions = np.argsort(-scores, kind='mergesort')[:top_n]
    top_scores = list(scores[top_positions])

    print("\n\nTop %d companies similar to: %s \n" %(top_n, company_name))
    print(data['Company Name'].iloc[top_positions],"\n\n", "Similarity Scores: ", top_scores)

    
# Show the ten nearest neighbours for a few example companies.
for example_company in (
    "Roho (Religious Content)",
    "13th Floor Studios",
    "21st Century Newspapers",
):
    similar_comp(example_company, tfidf_matrix, 10)




Top 10 companies similar to: Roho (Religious Content) 

3        Roho (Religious Content)
22302                     OK Play
1887                     Onekloud
34859              Get Lighthouse
23666               Level Therapy
11038                    LeapCure
13552                 BIGcontrols
24656                     Homebot
32813                   Treat App
9839                 HeavyConnect
Name: Company Name, dtype: object 

 Similarity Scores:  [1.0, 0.81638974136768094, 0.79551538977863667, 0.78481783275244388, 0.71323090989866522, 0.7098435134835499, 0.69292051235861907, 0.68504600800077176, 0.68028221305085901, 0.67702748372595734]


Top 10 companies similar to: 13th Floor Studios 

48482    13th Floor Studios
27413              Careeref
36976            SociGroups
43257     Short Stay Global
32359        Smile Stations
47609                Brella
9378      Penny Stock Dream
19649            TaskBiller
42253            Videoscore
42481          Trio Rewards
Name: Company Name,