In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import pdb
from scipy import sparse, io

In [63]:
class import_data(object):
    """
    Container for the two inputs used by the similarity code:
    the company table and its TF-IDF matrix.
    """

    def __init__(self, data_path="data.csv", tfidf_path="tfidf.mtx"):
        """
        Importing data

        Parameters:
            data_path:  CSV file with the company table; read as ISO-8859-1,
                        missing values are replaced by empty strings.
            tfidf_path: Matrix Market file holding the TF-IDF matrix.

        Sets:
            self.data:  pandas DataFrame loaded from data_path.
            self.tfidf: TF-IDF matrix converted to CSR format.
        """
        self.data = pd.read_csv(data_path, encoding='ISO-8859-1').fillna('')
        # Convert the freshly read matrix itself: the previous code called
        # .tocsr() on an undefined global `tfidf`, which raises NameError on a
        # fresh kernel. CSR is used so that rows can be selected by index.
        self.tfidf = io.mmread(tfidf_path).tocsr()

In [153]:
class similarity(import_data):
    """
    Cosine-similarity queries over the company TF-IDF matrix.

    Row i of the TF-IDF matrix is taken to describe row i (by position) of the
    company table.
    """

    # Maximum number of query rows sent to cosine_similarity in one call; each
    # call materialises a dense (rows x number_of_companies) array, so this
    # bounds the memory used when an investor matches many companies.
    _MAX_ROWS_PER_BATCH = 256

    def __init__(self, import_data):
        """
        Reuse the data held by an already constructed import_data object.

        The parent constructor is intentionally not called, so nothing is read
        from disk again. Note that the parameter name shadows the parent class
        inside this method.

        Parameters:
            import_data: object exposing `.data` (DataFrame) and `.tfidf`
                         (matrix with one row per DataFrame row).
        Raises:
            ValueError: if the matrix and the table have different row counts.
        """
        self.data = import_data.data
        self.tfidf = import_data.tfidf
        self.length = len(self.data)

        if self.tfidf.shape[0] != self.length:
            raise ValueError(
                "tfidf has {} rows but data has {} rows".format(
                    self.tfidf.shape[0], self.length))

    def rank_df(self, similarity_scores):
        """
        Function to rank the data by similarity score

        Parameters:
            similarity_scores: one score per row of self.data.
        Returns: Ordered Dataframe by desc score (a copy of self.data with an
                 added 'similarity_scores' column; self.data is not modified)
        """
        new_df = self.data.copy()
        new_df['similarity_scores'] = similarity_scores

        return new_df.sort_values(by="similarity_scores", ascending=False)

    def _summed_similarity(self, positions):
        """
        Sum, over the given row positions, of the cosine similarity between
        each of those rows and every row of the TF-IDF matrix.
        Returns: 1-D array of length self.length
        """
        total = np.zeros(self.length)
        for lo in range(0, len(positions), self._MAX_ROWS_PER_BATCH):
            batch = positions[lo:lo + self._MAX_ROWS_PER_BATCH]
            total += cosine_similarity(self.tfidf[batch], self.tfidf).sum(axis=0)
        return total

    def similar_comp(self, company_name, print_time=True):
        """
        Function Calculates Cosine similarity between a selected company with all companies

        Parameters:
            company_name: exact value to look up in the 'Company Name' column.
            print_time:   print the elapsed time when True.
        Returns: Similarity Scores (1-D array, one score per row of self.data)
        Raises:
            ValueError: if the name matches no row, or more than one row.
        """
        start = time.time()

        # Positional row numbers of the selected company. Positions (not index
        # labels) are required because they are used to index the matrix.
        name_matches = (self.data['Company Name'] == company_name).to_numpy()
        positions = np.flatnonzero(name_matches)

        if positions.size == 0:
            raise ValueError(
                "Company {!r} not found in 'Company Name'".format(company_name))
        if positions.size > 1:
            raise ValueError(
                "Company name {!r} is ambiguous: it matches {} rows".format(
                    company_name, positions.size))

        # compute cosine similarity: result is (1 x length), flattened to 1-D
        similarity_scores = cosine_similarity(self.tfidf[positions], self.tfidf).ravel()

        if print_time:
            print("Time Taken to Calculate companies similar to {}: {}".format(company_name,time.time()-start))

        return similarity_scores

    def similar_investor_comp(self, investor, print_time=True, regex=False):
        """
        Function finds all companies invested by investor and the corresponding most similar companies

        Parameters:
            investor:   text searched for (case-sensitive) in 'Active Investors'.
            print_time: print the elapsed time when True.
            regex:      when False (default) `investor` is matched literally, so
                        names containing characters such as '(' or '.' work;
                        pass True to have it interpreted as a regular expression.
        Returns: Aggregated Similarity Scores (average of the per-company
                 similarity scores; all zeros when no company matches)
        """
        start = time.time()

        # Find all the companies invested by investor
        invested = self.data['Active Investors'].str.contains(
            investor, case=True, regex=regex, na=False)
        positions = np.flatnonzero(invested.to_numpy())
        total_companies = len(positions)

        # Calculate an aggregate score of similarity. Guard the division so an
        # investor with no companies yields zeros instead of NaN.
        if total_companies:
            aggregate_scores = self._summed_similarity(positions) / total_companies
        else:
            aggregate_scores = np.zeros(self.length)

        if print_time:
            print("Time Taken to Calculate companies similar to {} companies Invested by {}: {}".format(total_companies,investor,time.time()-start))

        return aggregate_scores


In [141]:
# Create a data import object: loads the company table and the TF-IDF matrix
# once so the similarity object created later can reuse them.
d = import_data()

In [155]:
# Wrap the already-loaded data; nothing is re-read from disk here.
s = similarity(d)

# Rankings for individual companies: score every company against the named
# one, then order the table by that score (highest first).
similar_to_uber = s.rank_df(s.similar_comp('Uber Technologies'))
similar_to_fb = s.rank_df(s.similar_comp('Facebook (FB)'))
similar_to_tesla = s.rank_df(s.similar_comp('Tesla (TSLA)'))

# Rankings for investors: the score is the average similarity to the
# companies whose 'Active Investors' entry mentions the investor.
similar_to_mc = s.rank_df(s.similar_investor_comp('Mackenzie Capital Management'))
similar_to_crv = s.rank_df(s.similar_investor_comp('Charles River Ventures'))
similar_to_fm = s.rank_df(s.similar_investor_comp('Fred McPhail'))

Time Taken to Calculate companies similar to Uber Technologies: 0.03702807426452637
Time Taken to Calculate companies similar to Facebook (FB): 0.034883975982666016
Time Taken to Calculate companies similar to Tesla (TSLA): 0.03286123275756836
Time Taken to Calculate companies similar to 2 companies Invested by Mackenzie Capital Management: 0.11142659187316895
Time Taken to Calculate companies similar to 102 companies Invested by Charles River Ventures: 3.0395238399505615
Time Taken to Calculate companies similar to 1 companies Invested by Fred McPhail: 0.06317996978759766


In [146]:
# Up to 20 highest-scoring rows of the ranking for 'Uber Technologies'.
similar_to_uber.head(20)

Unnamed: 0.1,Unnamed: 0,Company ID,Company Name,Company Former Name,Company Also Known As,PBId,Description,Primary Industry Sector,Primary Industry Group,Primary Industry Code,...,Facebook Likes Change,Facebook Likes % Change,Majestic Referring Domains,Majestic Referring Domains Change,Majestic Referring Domains % Change,Twitter Followers,Twitter Followers Change,Twitter Followers % Change,PitchBook Link,similarity_scores
3213,3220,51136-75,Uber Technologies,UberCab,Uber,51136-75,Provider of a mobile application that connects...,Information Technology,Software,Social/Platform Software,...,149218.0,1.72613,49044.0,29.0,0.0577558,719038.0,7293.0,1.02476,,1.0
30201,30208,113310-64,Asterride,,,113310-64,Owner and operator a transportation network co...,Consumer Products and Services (B2C),Transportation,Automotive,...,0.0,-0.0059641,51.0,1.0,1.87793,892.0,1.0,0.192369,,0.445804
28323,28330,119556-10,Arrive,ShuttleBeacon,Trajectory Solutions,119556-10,Developer of platform for transportation compa...,Information Technology,Software,Social/Platform Software,...,,,4.0,0.0,0.0,,,,,0.439246
2311,2318,61568-38,RideLabs,InstantCab,Summon,61568-38,Provider of transportation services and a tran...,Information Technology,Software,Application Software,...,-1.0,-0.0784314,24.0,1.0,3.06748,12.0,0.0,0.0,,0.433252
35872,35879,54595-54,Gett,GetTaxi,,54595-54,Provider of a mobile application for transport...,Information Technology,Software,Social/Platform Software,...,233.0,0.158057,164.0,-3.0,-1.46598,14907.0,17.0,0.118008,,0.413552
24381,24388,100593-19,Information Technologies Curves,,IT Curves,100593-19,Provider of an online transportation managemen...,Business Products and Services (B2B),Commercial Transportation,Other Transportation,...,0.0,0.0,20.0,0.0,0.0,2.0,0.0,0.0,,0.386892
19772,19779,92599-21,Swyft Technologies,,Swyft,92599-21,Developer of an online platform for comparing ...,Information Technology,Software,Social/Platform Software,...,,,34.0,0.0,-2.45902,7.0,0.0,0.0,,0.35565
38144,38151,56809-72,Social Bicycles,,Sobi,56809-72,Provider of a public bike-sharing network. The...,Consumer Products and Services (B2C),Transportation,Other Transportation,...,4.0,0.110825,107.0,-2.0,-1.73837,2522.0,11.0,0.426621,,0.35448
17383,17390,127618-48,Hansom Mind Innovations,,Hansom,127618-48,Developer and provider of peer-to-peer subscri...,Information Technology,Software,Application Software,...,0.0,0.0,5.0,0.0,0.0,255.0,1.0,0.213004,,0.354009
5718,5725,58342-60,VeriTread,Rounders Transportation Management Systems,VT,58342-60,"Provider of a platform to build, manage and op...",Business Products and Services (B2B),Commercial Transportation,Other Transportation,...,9.0,0.0542611,51.0,1.0,1.42045,528.0,2.0,0.397902,,0.345122


In [147]:
# Up to 20 highest-scoring rows of the ranking for 'Facebook (FB)'.
similar_to_fb.head(20)

Unnamed: 0.1,Unnamed: 0,Company ID,Company Name,Company Former Name,Company Also Known As,PBId,Description,Primary Industry Sector,Primary Industry Group,Primary Industry Code,...,Facebook Likes Change,Facebook Likes % Change,Majestic Referring Domains,Majestic Referring Domains Change,Majestic Referring Domains % Change,Twitter Followers,Twitter Followers Change,Twitter Followers % Change,PitchBook Link,similarity_scores
5289,5296,10695-52,Facebook (FB),The Facebook,FB,10695-52,Operator of a social-networking platform. The ...,Information Technology,Software,Social/Platform Software,...,327663.0,0.178605,48162.0,107.0,0.221221,14014200.0,1215.0,0.00866948,,1.0
10898,10905,60161-77,Dose Media,Spartz,,60161-77,Operator of a digital media company. The compa...,Information Technology,Software,Social/Platform Software,...,11.0,1.39115,,,,46.0,0.0,0.0,,0.421026
25417,25424,118307-08,Fliver,,,118307-08,Provider of a platform for sharing users socia...,Consumer Products and Services (B2C),Media,Social Content,...,,,2.0,0.0,0.0,74.0,0.0,-0.575816,,0.396515
21134,21141,166368-88,Real Labs (Social Network),,,166368-88,Provider of a social networking platform. The ...,Information Technology,Software,Social/Platform Software,...,,,7.0,1.0,4.25532,,,,,0.38964
7548,7555,94925-89,Redgage,,,94925-89,Owner and operator of a social networking plat...,Consumer Products and Services (B2C),Media,Social Content,...,7.0,1.2216,3842.0,-48.0,-1.24284,1638.0,-1.0,-0.0610128,,0.381786
16646,16653,97315-21,PeggSite,,PeggSite.com,97315-21,Provider of a social media platform. The compa...,Information Technology,Software,Social/Platform Software,...,1.0,0.460829,,,,97.0,0.0,-0.175695,,0.371038
5575,5582,163248-67,Peach,,,163248-67,Developer of a social networking application. ...,Information Technology,Software,Application Software,...,0.0,0.0,34.0,1.0,3.64807,4229.0,-15.0,-0.344015,,0.367715
25420,25427,87966-64,Flipiture,,Pixt,87966-64,Developer of a social sharing platform. The co...,Information Technology,Software,Social/Platform Software,...,,,21.0,1.0,3.52113,47.0,0.0,0.0,,0.363698
2600,2607,98233-93,Shareable Social,,,98233-93,Provider of a software platform for content an...,Information Technology,Software,Social/Platform Software,...,2.0,0.0819504,12.0,0.0,-6.66667,717.0,0.0,0.0199045,,0.3623
35813,35820,53915-95,PitchEngine,,,53915-95,Provider of a digital social media platform. T...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),...,1.0,0.000961881,5814.0,-48.0,-0.824805,8534.0,-5.0,-0.0618946,,0.359591


In [150]:
# Up to 20 highest-scoring rows of the ranking for 'Tesla (TSLA)'.
similar_to_tesla.head(20)

Unnamed: 0.1,Unnamed: 0,Company ID,Company Name,Company Former Name,Company Also Known As,PBId,Description,Primary Industry Sector,Primary Industry Group,Primary Industry Code,...,Facebook Likes Change,Facebook Likes % Change,Majestic Referring Domains,Majestic Referring Domains Change,Majestic Referring Domains % Change,Twitter Followers,Twitter Followers Change,Twitter Followers % Change,PitchBook Link,similarity_scores
5610,5617,10377-37,Tesla (TSLA),Tesla Motors,,10377-37,Manufacturer of a branded line of electric car...,Consumer Products and Services (B2C),Transportation,Automotive,...,10877.0,0.566859,16023.0,271.0,1.72096,1245470.0,13030.0,1.05726,,1.0
13971,13978,56553-67,Detroit Electric,,,56553-67,Manufacturer of electric vehicles. The company...,Consumer Products and Services (B2C),Transportation,Automotive,...,280.0,0.348616,20.0,1.0,3.7037,839.0,1.0,0.124972,,0.415657
40682,40689,97738-57,ShurTrax,,,97738-57,Manufacturer and distributor of automotive acc...,Consumer Products and Services (B2C),Transportation,Automotive,...,,,145.0,0.0,-0.587659,,,,,0.374661
44260,44267,160473-97,Otonomo,,,160473-97,Provider of a cloud platform for autonomous ca...,Information Technology,Software,Social/Platform Software,...,,,,,,,,,,0.282589
8582,8589,117225-73,BestMile,,,117225-73,Developer of software application for driverle...,Information Technology,Software,Application Software,...,-1.0,-0.205128,,,,1080.0,13.0,1.20482,,0.282502
26245,26252,151327-72,Dispatch (US),,Dispatch,151327-72,Manufacturer of an autonomous delivery platfor...,Business Products and Services (B2B),Commercial Products,Distributors/Wholesale,...,,,,,,77.0,0.0,1.03896,,0.282494
1398,1405,125620-39,Drive.ai,,,125620-39,Developer of software technologies for artific...,Information Technology,Software,Vertical Market Software,...,,,277.0,-2.0,-0.681315,,,,,0.276291
31440,31447,101289-43,Damage Hounds,,,101289-43,Developer and provider of an information platf...,Information Technology,Software,Social/Platform Software,...,0.0,0.0,1.0,0.0,0.0,449.0,0.0,0.0424088,,0.266235
40767,40774,56000-17,New Eagle,,New Eagle Products,56000-17,Provider of mechatronic control systems. The c...,Information Technology,Computer Hardware,Electronic Components,...,1.0,0.296443,173.0,10.0,5.97731,138.0,0.0,-0.15448,,0.257012
9590,9597,54670-33,eGO Vehicles,,,54670-33,Manufacturer of electric vehicles. The company...,Consumer Products and Services (B2C),Transportation,Automotive,...,1.0,0.288462,246.0,1.0,0.291206,,,,,0.2561


In [156]:
# Up to 20 highest-scoring rows of the aggregated ranking for the investor
# 'Charles River Ventures'.
similar_to_crv.head(20)

Unnamed: 0.1,Unnamed: 0,Company ID,Company Name,Company Former Name,Company Also Known As,PBId,Description,Primary Industry Sector,Primary Industry Group,Primary Industry Code,...,Facebook Likes Change,Facebook Likes % Change,Majestic Referring Domains,Majestic Referring Domains Change,Majestic Referring Domains % Change,Twitter Followers,Twitter Followers Change,Twitter Followers % Change,PitchBook Link,similarity_scores
20605,20612,93282-58,SeatNinja,,,93282-58,Developer of a social platform. The company of...,Information Technology,Software,Social/Platform Software,...,,,24.0,0.0,0.0,18.0,0.0,0.0,,0.164634
17760,17767,64669-96,Widdle,The Nest App,,64669-96,Provider of an online social media platform. T...,Information Technology,Software,Application Software,...,-1.0,-0.0713097,8.0,0.0,-9.67742,201.0,-1.0,-0.775194,,0.154768
16646,16653,97315-21,PeggSite,,PeggSite.com,97315-21,Provider of a social media platform. The compa...,Information Technology,Software,Social/Platform Software,...,1.0,0.460829,,,,97.0,0.0,-0.175695,,0.144556
22961,22968,160220-80,MobilePhire,,,160220-80,Provider of a mobile data management platform....,Information Technology,Software,Social/Platform Software,...,0.0,0.0,47.0,-1.0,-2.08333,37.0,0.0,0.0,,0.141103
23432,23439,99227-71,Madme Technologies,,mAdme,99227-71,Provider of mobile advertising services. The c...,Consumer Products and Services (B2C),Media,Information Services (B2C),...,,,16.0,0.0,0.0,7.0,0.0,0.0,,0.13896
20760,20767,65404-81,SaaS Software,,SSi,65404-81,Developer of business information technology p...,Information Technology,Software,Business/Productivity Software,...,,,7.0,0.0,0.0,,,,,0.138218
47109,47116,100028-80,Component Software Corporation,,,100028-80,Provider of software services.,Information Technology,Software,Business/Productivity Software,...,,,,,,,,,,0.13702
1761,1768,113697-73,Keynectup,,,113697-73,Developer of a mobile application. The company...,Information Technology,Software,Application Software,...,0.0,3.2967,29.0,1.0,2.52525,2817.0,-17.0,-0.614857,,0.136803
34512,34519,63506-62,Liftoff,,Liftoff Mobile,63506-62,Operator of a marketing technology company. Th...,Information Technology,Software,Social/Platform Software,...,30.0,0.755533,32.0,0.0,0.0,1534.0,9.0,0.607111,,0.136441
6774,6781,53600-50,Personal,P3rsonal,,53600-50,Developer of an online platform for sharing da...,Consumer Products and Services (B2C),Media,Social Content,...,-1.0,-0.0230017,590.0,-5.0,-0.931255,5681.0,-7.0,-0.123466,,0.135576
