In [22]:
import pandas as pd
import numpy as np

### Company similarity

#### compute similarity based on ['revenue', 'num_employees_enum', 'description','industries']

In [11]:
# Load the raw company table; the path is relative to the notebook's working directory.
company_df = pd.read_csv("company.csv")
# Display the column index so the available attributes are visible before filtering.
company_df.columns

Index(['name', 'headquaters_location_city', 'headquaters_location_state',
       'headquaters_location_country', 'headquaters_regions', 'founder',
       'website_link', 'hub', 'num_employees_enum', 'last_funding_type',
       'rank_org_company', 'IPO_status', 'industries', 'stock_symbol',
       'description', 'founded_date', 'operating_status', 'also_known_as',
       'company_type', 'legal_name', 'contact_phone', 'number_of_exits',
       'founded', 'type', 'revenue', 'website_link_prepro'],
      dtype='object')

#### Apply hybrid similarity methods to get the most similar result

In [13]:
# Keep only the attributes that feed the similarity score.
# .copy() makes the result an independent frame: later cells assign to its
# columns, and assigning into a bare column selection triggers pandas'
# SettingWithCopyWarning.
company_df = company_df[['name', 'num_employees_enum', 'industries', 'description', 'revenue']].copy()

In [16]:
# deal with number of employees
# (inspection only: the set is discarded because it is not the cell's last expression)
set(company_df['num_employees_enum'])
# map each head-count bucket label to an ordinal level, smallest bucket first
num_employees_dict = {
    '1 to 50 Employees': 1,
    '51 to 200 Employees': 2,
    '501 to 1000 Employees': 3,
    '5001 to 10000 Employees': 4,
    '10000+ Employees': 5,
}

# Plain dict lookup per row: a label that is not a key of the mapping raises KeyError.
company_df['num_employees_enum'] = company_df['num_employees_enum'].apply(num_employees_dict.__getitem__)

In [27]:
def split_industries(string):
    """Split a comma-separated industries field into a list of clean names.

    Args:
        string: The raw cell value. Anything that is not a ``str`` (NaN for a
            missing cell, ``None``, ...) is treated as "no industries".

    Returns:
        A list of industry names with surrounding whitespace removed and empty
        entries dropped, or ``None`` when the value is missing or contains no
        names at all.
    """
    if not isinstance(string, str):
        return None
    # Strip every name individually: "A, B" must yield "B", not " B", otherwise
    # set-based comparisons between companies treat them as different industries.
    names = [part.strip() for part in string.split(',')]
    names = [name for name in names if name]
    # None (rather than an empty list) keeps "no industries" a single sentinel.
    return names or None

# Replace the raw comma-separated text with the parsed value, row by row.
company_df['industries'] = company_df['industries'].apply(split_industries)

In [30]:
# Flatten the per-company industry lists to see how many names there are in
# total and how many of them are distinct.
industry_names = [
    name
    for names in company_df['industries']
    if names is not None
    for name in names
]
print(len(industry_names))
print(len(set(industry_names)))

# then we should use jaccard similarity to compute the similarity based on industries

1753
362


In [33]:
# revenue
# (inspection only: the set is discarded because it is not the cell's last expression)
set(company_df['revenue'])
# map each revenue bracket label to an ordinal level, smallest bracket first;
# the "unknown" label maps to None
revenue_dict = {
    '$1 to $5 million (USD)': 1,
    '$5 to $25 million (USD)': 2,
    '$100 to $500 million (USD)': 3,
    '$500 million to $1 billion (USD)': 4,
    '$1 to $5 billion (USD)': 5,
    '$5 to $10 billion (USD)': 6,
    '$10+ billion (USD)': 7,
    'Unknown / Non-Applicable': None,
}

# Plain dict lookup per row: a label that is not a key of the mapping raises KeyError.
company_df['revenue'] = company_df['revenue'].apply(revenue_dict.__getitem__)

In [39]:
# The description field is compared using TF-IDF-based cosine similarity.
# First, turn each description into a TF-IDF vector.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
import string
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocessing preparation: fetch the NLTK resources and build the base stop-word set.
# The stop-word corpus must be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Presumably 'punkt' backs word_tokenize and 'wordnet' / 'omw-1.4' back
# WordNetLemmatizer, both imported above — TODO confirm against the installed NLTK.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/august/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/august/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/august/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/august/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [40]:
# build punctuation dict: every punctuation code point is translated to a space
punct_dict = dict((ord(punct), ' ') for punct in string.punctuation)

# Treat company names as stop words so they do not drive description similarity.
# lemmatizer() lower-cases the text and replaces punctuation before filtering
# tokens, so the names must be normalised the same way or they can never match.
# Only names that normalise to a single token can match a token; multi-word
# names are kept in the set but have no effect on the token-level filter.
company_name_stop_words = {
    str(name).lower().translate(punct_dict).strip()
    for name in company_df['name'].dropna()
}
company_name_stop_words.discard('')
stop_words = stop_words.union(company_name_stop_words)

jaro = JaroWinkler()
lemmer = WordNetLemmatizer()

def lemmatizer(sentence):
    """Turn a sentence into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, its punctuation is translated through
    ``punct_dict``, and the result is tokenized with ``word_tokenize``.
    Tokens found in ``stop_words`` are dropped; the remaining ones are passed
    through ``lemmer.lemmatize``.

    Args:
        sentence: The text to process (a ``str``).

    Returns:
        The list of lemmatized tokens, in their original order.
    """
    normalised = sentence.lower().translate(punct_dict)
    return [
        lemmer.lemmatize(word)
        for word in word_tokenize(normalised)
        if word not in stop_words
    ]

In [49]:
# Distinct, non-null descriptions. Row/column k of similarity_res refers to
# documents[k], so positions are looked up through this list.
documents = list(set(company_df['description'].dropna()))
TfidfVec = TfidfVectorizer(tokenizer=lemmatizer, stop_words='english')

def cos_similarity(textlist):
    """Return the dense pairwise similarity matrix of the given texts.

    Fits ``TfidfVec`` on ``textlist`` and multiplies the resulting TF-IDF
    matrix by its own transpose. The name treats this product as cosine
    similarity, which holds when the TF-IDF rows are L2-normalised
    (TfidfVectorizer's documented default).

    Args:
        textlist: A sequence of text documents.

    Returns:
        A square 2-D array whose entry [i][j] compares textlist[i] and textlist[j].
    """
    tfidf_matrix = TfidfVec.fit_transform(textlist)
    pairwise = tfidf_matrix.dot(tfidf_matrix.T)
    return pairwise.toarray()

similarity_res = cos_similarity(documents)



In [47]:
# descrition_dict = dict()
# for data in list(company_df['description']):
#     descrition_dict[data] = descrition_dict.get(data, 0)+ 1
# for key, value in descrition_dict.items():
#     if value > 1:
#         print(key, value)
    


In [82]:
def _is_missing(value):
    """Return True for None/NaN-like scalars; collections are never missing."""
    if value is None:
        return True
    if isinstance(value, (list, tuple, set, frozenset)):
        # pd.isna on a collection returns an array, not a single truth value.
        return False
    return bool(pd.isna(value))


def _ordinal_similarity(level_a, level_b):
    """Similarity of two ordinal levels: 1 when equal, decaying with the gap."""
    if _is_missing(level_a) or _is_missing(level_b):
        return 0
    # 1 / (1 + gap) keeps identical levels (1.0) strictly above adjacent ones
    # (0.5); a plain 1 / gap would score a gap of one the same as a gap of zero.
    return 1 / (1 + abs(level_a - level_b))


def _jaccard_similarity(items_a, items_b):
    """Jaccard index of two collections; 0 if either is missing or both are empty."""
    if _is_missing(items_a) or _is_missing(items_b):
        return 0
    set_a, set_b = set(items_a), set(items_b)
    union = set_a | set_b
    if not union:
        return 0
    return len(set_a & set_b) / len(union)


def overall_similarity(c1, c2):
    """Weighted similarity between two company records.

    Each record is indexed by 'num_employees_enum', 'revenue', 'industries'
    and 'description'. The score combines four parts:

    * employee-level similarity (weight 0.2) and revenue-level similarity
      (weight 0.2), each ``1 / (1 + |difference|)``;
    * Jaccard similarity of the industries (weight 0.3);
    * description similarity (weight 0.3), read from the module-level
      ``similarity_res`` matrix at the positions of the two descriptions in
      the module-level ``documents`` list.

    A part contributes 0 when either record lacks that attribute. Missing means
    ``None`` or NaN: numeric columns hold NaN rather than None for missing
    entries, so an ``== None`` comparison would not detect them and the NaN
    would propagate into the final score.

    Args:
        c1: First company record (e.g. a DataFrame row or a dict).
        c2: Second company record.

    Returns:
        The combined similarity as a number between 0 and 1.

    Raises:
        ValueError: If a non-missing description is not present in ``documents``.
    """
    num_employees_sim = _ordinal_similarity(c1['num_employees_enum'], c2['num_employees_enum'])
    revenue_sim = _ordinal_similarity(c1['revenue'], c2['revenue'])
    industry_sim = _jaccard_similarity(c1['industries'], c2['industries'])

    if _is_missing(c1['description']) or _is_missing(c2['description']):
        desc_sim = 0
    else:
        # list.index is a linear scan; acceptable here, but it is the costly
        # part when this function is called for every pair of companies.
        c1_desc_idx = documents.index(c1['description'])
        c2_desc_idx = documents.index(c2['description'])
        desc_sim = similarity_res[c1_desc_idx][c2_desc_idx]

    return 0.2 * num_employees_sim + 0.2 * revenue_sim + 0.3 * industry_sim + 0.3 * desc_sim

In [88]:
# For every company, rank all other companies by overall similarity and keep
# the best matches. Each entry of similar_companies is
# (name, best match, 2nd, 3rd, 4th).
top_k = 4

# Fetch each row once, by position. .iloc does not depend on the index labels
# being 0..n-1, and pre-fetching avoids repeating the row lookup inside the
# inner loop.
company_rows = [company_df.iloc[pos] for pos in range(len(company_df))]

similar_companies = []
for i, row_i in enumerate(company_rows):
    sim_res = [
        (overall_similarity(row_i, row_j), j)
        for j, row_j in enumerate(company_rows)
        if j != i
    ]
    # A NaN score compares False against everything, which would leave the
    # sort order arbitrary; rank such scores last instead. The sort is stable,
    # so ties keep their original row order.
    sorted_res = sorted(
        sim_res,
        key=lambda scored: -np.inf if np.isnan(scored[0]) else scored[0],
        reverse=True,
    )
    top_names = [company_rows[j]['name'] for _, j in sorted_res[:top_k]]
    # Pad with None when there are fewer than top_k other companies, so every
    # tuple has the same length instead of raising IndexError.
    top_names += [None] * (top_k - len(top_names))
    similar_companies.append((row_i['name'], *top_names))

    # print('For company {}, the most three similar companies are {}, {}'.format(company_df.loc[i]['name'], company_df.loc[sim_res[0][1]]["name"],
                                                                                       # company_df.loc[sim_res[1][1]]["name"]))                                                                                

In [90]:
# One row per company: its name followed by its four closest matches.
similar_company_columns = [
    'name',
    'sim_company_1',
    'sim_company_2',
    'sim_company_3',
    'sim_company_4',
]
res_df = pd.DataFrame(data=similar_companies, columns=similar_company_columns)
res_df.to_csv('similar_companies.csv')

In [93]:
import os

# Load every review CSV from the reviews directory into a list of DataFrames.
reviews_dir = './glassdoor_reviews/'
files = []
# os.listdir returns entries in arbitrary order and includes every entry in the
# directory: sort for a reproducible concatenation order and skip anything that
# is not a CSV file (e.g. hidden OS metadata files).
for filename in sorted(os.listdir(reviews_dir)):
    if not filename.lower().endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(reviews_dir, filename))
    files.append(df)

In [100]:
# Stack the per-file review tables, renumber the rows from 0, and drop the
# 'Unnamed: 0' column carried by the loaded files.
res = (
    pd.concat(files)
    .reset_index(drop=True)
    .drop(['Unnamed: 0'], axis=1)
)

In [102]:
# Persist the combined reviews table to reviews.csv in the working directory.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed first column, much like the 'Unnamed: 0' column dropped from `res`
# before this point. Confirm whether index=False is wanted here.
res.to_csv('reviews.csv')