In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import pdb
from scipy import sparse, io

In [107]:
class import_data(object):
    """
    Loads the company table and its companion feature matrix from disk.

    Attributes:
        data  -- DataFrame read from the CSV file.
        tfidf -- the matrix read from the Matrix Market file, held in CSR format.
    """

    def __init__(self, data_path="data.csv", tfidf_path="tfidf.mtx"):
        """
        Importing data

        Input:
            data_path:  path of the CSV file read into ``self.data``
                        (defaults to "data.csv").
            tfidf_path: path of the Matrix Market file read into ``self.tfidf``
                        (defaults to "tfidf.mtx").
        """
        self.data = pd.read_csv(data_path)
        # mmread yields a COO sparse matrix for "coordinate" files but a plain
        # ndarray for dense "array" files, which has no .tocsr(); the
        # csr_matrix constructor accepts both and always gives CSR back.
        self.tfidf = sparse.csr_matrix(io.mmread(tfidf_path))

In [136]:
class similarity(import_data):
    """
    Similarity and relevance scoring on top of an already-loaded data set.

    Attributes (set in __init__):
        data   -- DataFrame taken from the loader object.
        tfidf  -- matrix taken from the loader object; row i is used as the
                  feature vector of the i-th row (by position) of ``data``.
        length -- number of rows in ``data``.
    """

    def __init__(self, import_data):
        """
        Reuse the data held by an existing loader object.

        Input: import_data is an object exposing ``data`` and ``tfidf``
        attributes. (The parameter name shadows the parent class inside this
        method; it is kept so existing keyword callers still work.)

        The parent constructor is not invoked here, so nothing is re-read
        from disk.
        """
        self.data = import_data.data
        self.tfidf = import_data.tfidf
        self.length = len(self.data)

    def rank_df(self, similarity_scores):
        """
        Function to rank the data by similarity score
        Input: similarity_scores, one value per row of ``self.data``
        Returns: Ordered Dataframe by desc score (a new frame with an added
                 'similarity_scores' column; ``self.data`` is left untouched)
        """
        ranked = self.data.assign(similarity_scores=similarity_scores)
        return ranked.sort_values(by="similarity_scores", ascending=False)

    def similar_comp(self, company_name, print_time=True):
        """
        Function Calculates Cosine similarity between a selected company with all companies
        Input: company_name is matched exactly against the 'Company Name' column;
               print_time toggles the elapsed-time message
        Returns: Similarity Scores (1-D array, one score per matrix row). If
                 several rows carry the same name, their scores are averaged.
        Raises: ValueError if no row has that company name
        """
        start = time.perf_counter()

        # Row *positions* of the selected company. The matrix is indexed by
        # position, so DataFrame index labels must not be used here: they only
        # coincide with positions for a default RangeIndex.
        is_match = (self.data['Company Name'] == company_name).values
        positions = np.flatnonzero(is_match)
        if positions.size == 0:
            raise ValueError(
                "Company {!r} not found in 'Company Name' column".format(company_name)
            )

        # One row of similarities per matching company, collapsed to 1-D.
        # With a single match the mean is just that row.
        similarity_scores = cosine_similarity(self.tfidf[positions], self.tfidf).mean(axis=0)

        if print_time:
            print("Time Taken to Calculate companies similar to {}: {}".format(company_name,time.perf_counter()-start))

        return similarity_scores

    def similar_investor_comp(self, investor, print_time=True):
        """
        Function finds all companies invested by investor and the corresponding most similar companies
        Input: investor is looked up as a case-insensitive literal substring of
               the 'Active Investors' column (missing values never match);
               print_time toggles the elapsed-time message
        Returns: Aggregated Similarity Scores (mean over the investor's
                 companies; all zeros when no company matches)
        """
        start = time.perf_counter()

        # Find all the companies invested by investor. regex=False keeps
        # characters such as "(", "." or "+" in a name from being read as
        # regular-expression syntax.
        invested = self.data['Active Investors'].str.contains(
            investor, case=False, na=False, regex=False
        )
        positions = np.flatnonzero(invested.values)
        total_companies = int(positions.size)

        if total_companies:
            # A single batched call avoids re-processing the full matrix once
            # per company; the mean over rows is the aggregate score.
            aggregate_scores = cosine_similarity(self.tfidf[positions], self.tfidf).mean(axis=0)
        else:
            # No matching company: return zeros rather than dividing by zero.
            aggregate_scores = np.zeros(self.length)

        if print_time:
            print("Time Taken to Calculate companies similar to {} companies Invested by {}: {}".format(total_companies,investor,time.perf_counter()-start))

        return aggregate_scores

    def sort(self, df, scores):
        """
        Function to rank the data by similarity score
        Input: df is any DataFrame, scores holds one value per row of df
        Returns: Ordered Dataframe by desc score (a new frame with an added
                 'scores' column; the frame passed in is not modified)
        """
        return df.assign(scores=scores).sort_values(by="scores", ascending=False)

    def calculate_rank(self, df_col):
        """
        Function to calculate rank for a column
        Input: df_col is a pandas Series
        Returns: NumPy array of ascending ranks (smallest value gets the
                 lowest rank; pandas' default tie handling applies)
        """
        return np.array(df_col.rank(ascending=True))

    def relevance(self, df, column_weights, print_time=True):
        """
        Function to compute rank companies based on the weights and data
        Input: column_weights is a dictionary with col name as key and weight as value
        Returns: Final Relevance Rank (Higher means better), one value per row
                 of df. The weighted rank sum is divided by the number of
                 weighted columns; an empty column_weights yields all zeros.
        """
        start = time.perf_counter()
        columns = list(column_weights)

        # Missing values become -1 before ranking, so they rank below every
        # value greater than -1.
        # NOTE(review): a column holding values below -1 would rank those
        # under the missing entries - confirm that is acceptable.
        rank_df = df[columns].fillna(-1)

        relevance_score = np.zeros(len(rank_df))
        for column, weight in column_weights.items():
            relevance_score += self.calculate_rank(rank_df[column]) * weight

        # Skip the normalisation when there are no columns (avoids 0/0 -> NaN).
        if columns:
            relevance_score = relevance_score / len(columns)

        if print_time:
            print("Time Taken to Calculate Relevance for criteria {}: {}".format(column_weights,time.perf_counter()-start))

        return relevance_score



In [127]:
# Load the company table and the feature matrix from disk once; the resulting
# loader object is what the `similarity` scorer is built from.
d = import_data()

In [138]:
# Build the scorer on the data already held by the loader `d`
s = similarity(d)

# Full company table ordered by cosine similarity to Uber Technologies
similar_to_uber = s.rank_df(s.similar_comp('Uber Technologies'))
# Full company table ordered by the averaged similarity to the companies
# whose 'Active Investors' mention Fred McPhail
similar_to_fm = s.rank_df(s.similar_investor_comp('Fred McPhail'))

Time Taken to Calculate companies similar to Uber Technologies: 0.038453102111816406
Time Taken to Calculate companies similar to 1 companies Invested by Fred McPhail: 0.1081840991973877


In [131]:
# Weight given to each column's rank when scoring relevance
column_weights = {
    "Total Raised": 0.2,
    "Employees": 0.1,
    "Growth Rate": 0.3,
    "# Active Investors": 0.4,
}
# Score every company, then order the table by that score.
# rank_df stores whatever scores it is given under 'similarity_scores'.
relevance = s.rank_df(s.relevance(d.data, column_weights))

Time Taken to Calculate Relevance for criteria {'Total Raised': 0.2, 'Employees': 0.1, 'Growth Rate': 0.3, '# Active Investors': 0.4}: 0.015549898147583008


In [132]:
# Ten highest-scoring companies (the relevance score sits in the
# 'similarity_scores' column added by rank_df)
relevance.head(10)

Unnamed: 0.1,Unnamed: 0,Company ID,Company Name,Company Former Name,Company Also Known As,PBId,Description,Primary Industry Sector,Primary Industry Group,Primary Industry Code,...,Facebook Likes Change,Facebook Likes % Change,Majestic Referring Domains,Majestic Referring Domains Change,Majestic Referring Domains % Change,Twitter Followers,Twitter Followers Change,Twitter Followers % Change,PitchBook Link,similarity_scores
92,99,43128-46,Foursquare,Foursquare All-Stars,,43128-46,Provider of location-based social networking s...,Information Technology,Software,Social/Platform Software,...,-564.0,-0.047971,11634.0,-67.0,-0.571759,1727.0,1453.0,529.739583,,11951.9250
184,191,52693-21,Return Path,Ulocate.Com,,52693-21,Provider of an email marketing service that wo...,Information Technology,Software,Business/Productivity Software,...,6.0,0.137883,1286.0,-9.0,-0.722081,15813.0,22.0,0.137957,,11918.3125
3213,3220,51136-75,Uber Technologies,UberCab,Uber,51136-75,Provider of a mobile application that connects...,Information Technology,Software,Social/Platform Software,...,149218.0,1.726133,49044.0,29.0,0.057756,719038.0,7293.0,1.024758,,11871.8375
2224,2231,42936-49,Klarna,Kreditor Europe,,42936-49,Provider of a billing and electronic commerce ...,Information Technology,Software,Vertical Market Software,...,1044.0,3.725980,3998.0,4.0,0.082252,6018.0,22.0,0.376026,,11823.0750
447,454,65897-20,Collective Health,,,65897-20,Provider of cloud-based self-insurance platfor...,Information Technology,Software,Business/Productivity Software,...,,,81.0,1.0,0.889680,4741.0,112.0,2.432491,,11821.9625
2815,2822,54705-79,SigFox,,,54705-79,Operator of a cellular network. The company sp...,Information Technology,Communications and Networking,Wireless Communications Equipment,...,19.0,0.557200,2179.0,14.0,0.633371,12379.0,82.0,0.660975,,11805.4500
958,965,61434-55,Guardant Health,,,61434-55,Provider of digital sequencing technology for ...,Healthcare,Healthcare Services,Laboratory Services (Healthcare),...,2.0,0.550775,428.0,2.0,0.446678,1217.0,18.0,1.464983,,11796.2000
428,435,51500-71,Optoro,,,51500-71,Provider of cloud-based tools for the manageme...,Information Technology,Software,Application Software,...,4.0,0.671592,229.0,2.0,0.837521,1184.0,3.0,0.233748,,11794.4375
2914,2921,60685-21,Ring,Doorbot,,60685-21,Developer of wireless frequency enabled securi...,Consumer Products and Services (B2C),Consumer Durables,Electronics (B2C),...,,,838.0,0.0,-0.102180,11219.0,156.0,1.406141,,11781.4000
2694,2701,45284-86,BrightSource Energy,Luz II,,45284-86,Developer of solar thermal technology. The com...,Energy,"Exploration, Production and Refining",Energy Production,...,-1.0,-0.021539,1997.0,6.0,0.283296,2796.0,-1.0,-0.066377,,11755.9500


In [90]:
# NOTE(review): in the visible notebook `c` is first assigned in a later cell
# (In [103]), so this cell raises NameError under Restart & Run All.
# It looks like a leftover - consider deleting it or moving it below that cell.
c

In [85]:
# Preview of the raw company table. This cell used to display `c`, which is
# only assigned further down (NameError on a fresh kernel); `d.data` is read
# from the same data.csv and is already available at this point.
d.data.head(10)

Unnamed: 0.1,Unnamed: 0,Company ID,Company Name,Company Former Name,Company Also Known As,PBId,Description,Primary Industry Sector,Primary Industry Group,Primary Industry Code,...,Facebook Likes,Facebook Likes Change,Facebook Likes % Change,Majestic Referring Domains,Majestic Referring Domains Change,Majestic Referring Domains % Change,Twitter Followers,Twitter Followers Change,Twitter Followers % Change,PitchBook Link
0,7,168732-64,HashCut,,,168732-64,Provider of a platform for creating and editin...,Information Technology,Software,Application Software,...,97327.0,-1612.0,-1.629138,9.0,0.0,0.000000,58.0,16.0,38.095238,
1,8,59218-48,Ministry,Ministry of Supply,,59218-48,Manufacturer and online retailer of business w...,Consumer Products and Services (B2C),Apparel and Accessories,Clothing,...,31630.0,52.0,0.164671,574.0,-8.0,-1.279850,5507.0,-5.0,-0.081206,
2,9,54110-89,Disconnect,,Disconnect.me,54110-89,Developer of online privacy and security softw...,Information Technology,Software,Network Management Software,...,10997.0,-4.0,-0.032464,419.0,-2.0,-0.356295,5980.0,12.0,0.205441,
3,10,163574-65,Roho (Religious Content),,,163574-65,Provider of a religious content sharing platfo...,Information Technology,Software,Social/Platform Software,...,75344.0,1204.0,1.624141,7.0,-1.0,-12.500000,21839.0,-518.0,-2.318181,
4,11,168708-16,Listen (app),,,168708-16,Developer of a communication management applic...,Information Technology,Software,Application Software,...,,,,4.0,1.0,33.333333,,,,
5,12,91125-01,iBiz Software,,iBizSoft Inc.,91125-01,Developer of commerce based applications for e...,Information Technology,Software,Application Software,...,6467.0,-3.0,-0.039743,49.0,0.0,-1.719198,3791.0,-860.0,-18.505666,
6,13,10493-20,Leaf Group (LFGR),Demand Media,,10493-20,Provider of digital media and domain services ...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),...,78.0,1.0,2.222222,553.0,17.0,3.182423,8712.0,-1.0,-0.003552,
7,14,61953-04,Mechio,,Motiv,61953-04,Manufacturer of wearable technology products. ...,Consumer Products and Services (B2C),Consumer Durables,Electronics (B2C),...,1683.0,215.0,14.684702,,,,663.0,41.0,6.599311,
8,15,50982-31,Tryton Medical,Anvil Medical,,50982-31,Provider of stent systems for the treatment of...,Healthcare,Healthcare Devices and Supplies,Surgical Devices,...,3191.0,11.0,0.327869,64.0,-3.0,-4.746835,223.0,0.0,0.298954,
9,16,81672-40,250ok,,,81672-40,Provider of an email analytics platform. The c...,Information Technology,Software,Business/Productivity Software,...,1439.0,-5.0,-0.356083,27.0,1.0,2.717391,8628.0,-173.0,-1.969854,


In [103]:
# Scratch copy of the raw table, re-read from the same CSV the loader uses
c = pd.read_csv("data.csv")

In [104]:
# Keep only the weighted columns and replace missing values with -1,
# the same preparation similarity.relevance applies before ranking
c = c.loc[:, list(column_weights)].fillna(-1)

In [105]:
# First rows of the prepared ranking inputs (missing values shown as -1)
c.head(10)

Unnamed: 0,Total Raised,Employees,Growth Rate,# Active Investors
0,-1.00,-1.0,85.165149,1.0
1,8.85,30.0,82.342480,17.0
2,4.10,12.0,70.186290,11.0
3,1.10,4.0,55.036277,2.0
4,0.50,-1.0,49.726486,1.0
5,0.25,66.0,49.099824,-1.0
6,363.21,350.0,47.880172,-1.0
7,14.17,18.0,44.319477,7.0
8,83.09,21.0,43.068174,6.0
9,0.30,15.0,42.163152,5.0
