## load data

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import time

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## show full cell contents instead of truncating them (the Glassdoor links are long)
pd.options.display.max_colwidth = None

## raw link/review export: semicolon-separated, decimal commas
## NOTE(review): absolute path on the W: drive - only loads where that drive is available
df_raw = pd.read_csv(
    r"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0815_ReviewAmount_English-Post2008.csv",
    sep=";",
    decimal=",",
    encoding='unicode_escape',
)

## prepare data

#### regex cheat sheet:

https://www.debuggex.com/cheatsheet/regex/python

https://regex101.com/r/3btjRd/4

### convert links to firm names

In [3]:
df = df_raw.copy()

## cells matching one of these patterns are set to nan (whole cell is replaced)
invalid_link_patterns = {
    r'(^.*SRCH.*$)': np.nan,  # links containing SRCH appear to be always wrong
    r"^.*\/NA\.htm.*": np.nan,  # https://www.glassdoor.com/Reviews/NA.htm
    r"^.*\/index\.htm.*": np.nan,  # https://www.glassdoor.com/Reviews/index.htm
    r"^.*Glassdoor-Reviews-E100431.*": np.nan,  # https://www.glassdoor.com/Reviews/Glassdoor-Reviews-E100431.htm
}
## NOTE: the patterns are applied to every column of the frame, not only to the link columns
df = df.replace(invalid_link_patterns, regex=True)

link_columns = ["link_marius", "link_hannes", "link_hannes_strict"]
firm_columns = [link_column.replace("link", "firm") for link_column in link_columns]

## add one firm_* column per link_* column, placed directly after the last link column;
## each one starts out as the last path segment of its link, i.e. the link without the
## https://www.glassdoor.com/Reviews/ part
for offset, (link_column, firm_column) in enumerate(zip(link_columns, firm_columns), start=1):
    page_names = df[link_column].str.split('/').str[-1]
    df.insert(df.columns.get_loc("link_hannes_strict") + offset, firm_column, page_names)

## convert the page names to clean firm names
## https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm
## to
## International Specialty Products
firm_name_cleanup = {
    "-Reviews|Working-at-": "",
    r"(?:(-E\d.*)|(-E.*I_.*)\.htm.*$)": "",  # strips the "-E<digits>..." suffix, e.g. "-E1114.htm"
    "-": " ",
    ## "?" has to be escaped: with regex=True an unescaped "#NAME?" means "#NAM" plus an
    ## optional "E" and would blank every cell that merely contains "#NAM"
    r"#NAME\?": np.nan,  # presumably spreadsheet error values - confirm against the raw file
}
df[firm_columns] = df[firm_columns].replace(firm_name_cleanup, regex=True)

## optional filter on the link type; rows flagged "wrong link" are not usable
# df["link_found"].unique().tolist()
# df = df[df["link_found"]=="transformed jobs link"].reset_index(drop=True)

df.head()

Unnamed: 0,index,isin,ticker,2008_2022,english,company_name,link_marius,link_hannes,link_hannes_strict,firm_marius,firm_hannes,firm_hannes_strict,marius_cosine_original,link_found,link_found_strict,review_amount,review_amount_strict
0,0,US0577411004,,1,1,ABB Motors and Mechanical,https://www.glassdoor.com/Reviews/ABB-Motors-and-Mechanical-Inc-Reviews-E73.htm,https://www.glassdoor.com/Reviews/ABB-Motors-and-Mechanical-Inc-Reviews-E73.htm,https://www.glassdoor.com/Reviews/ABB-Motors-and-Mechanical-Inc-Reviews-E73.htm,ABB Motors and Mechanical Inc,ABB Motors and Mechanical Inc,ABB Motors and Mechanical Inc,0.818978,transformed overview link,transformed overview link,201.0,201.0
1,1,US4603371083,,1,1,International Specialty Products,https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm,https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm,https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm,International Specialty Products,International Specialty Products,International Specialty Products,0.80993,original review link,original review link,8.0,8.0
2,2,CA61945Q1028,,1,1,Conversant Intellectual Property Management,https://www.glassdoor.com/Reviews/Conversant-Intellectual-Property-Management-Reviews-E1014209.htm,https://www.glassdoor.com/Reviews/Conversant-Intellectual-Property-Management-Reviews-E1014209.htm,https://www.glassdoor.com/Reviews/Conversant-Intellectual-Property-Management-Reviews-E1014209.htm,Conversant Intellectual Property Management,Conversant Intellectual Property Management,Conversant Intellectual Property Management,0.806629,transformed overview link,transformed overview link,5.0,5.0
3,3,CA3449191053,,1,1,Foreign Currency Exchange,https://www.glassdoor.com/Reviews/Foreign-Currency-Exchange-Corp-Reviews-E260472.htm,,,Foreign Currency Exchange Corp,,,0.798668,wrong link,wrong link,,
4,4,US5520747008,,1,1,William Lyon Homes,https://www.glassdoor.com/Reviews/William-Lyon-Homes-Reviews-E258.htm,https://www.glassdoor.com/Reviews/William-Lyon-Homes-Reviews-E258.htm,https://www.glassdoor.com/Reviews/William-Lyon-Homes-Reviews-E258.htm,William Lyon Homes,William Lyon Homes,William Lyon Homes,0.792491,original review link,original review link,52.0,52.0


### string_grouper: calculate cosine similarity

In [31]:
from string_grouper import compute_pairwise_similarities

df_similarity = df.copy()

firm_columns = ["firm_marius", "firm_hannes", "firm_hannes_strict"]
similarity_columns = [firm_column.replace("firm", "similarity") for firm_column in firm_columns]

## compute_pairwise_similarities can't handle np.nan -> use empty strings
df_similarity[firm_columns] = df_similarity[firm_columns].replace(np.nan, "")

## one similarity_* column per firm_* column: company_name vs. the firm name taken from the link
for firm_column, similarity_column in zip(firm_columns, similarity_columns):
    df_similarity[similarity_column] = round(
        compute_pairwise_similarities(df_similarity["company_name"], df_similarity[firm_column], ignore_case=True, ngram_size=3),
        2)

## get the mean of the three cosine scores
df_similarity["similarity_mean"] = df_similarity[similarity_columns].mean(axis=1)

## check whether firm name of marius is equal to one of the two hannes firm names
df_similarity["marius_equalto_hannes"] = np.where(
    (df_similarity["firm_marius"] == df_similarity["firm_hannes"])
    | (df_similarity["firm_marius"] == df_similarity["firm_hannes_strict"]),
    1, 0)

## per row: name of the similarity column with the highest score (ties -> first column)
best_similarity_columns = df_similarity[similarity_columns].idxmax(axis=1)
## translate that column name into a position; similarity_columns and firm_columns share their order
best_column_positions = pd.Index(similarity_columns).get_indexer(best_similarity_columns)

## pick the matching firm name for every row in one vectorised, purely positional lookup
## (no dependence on the index labels and no row-by-row writes into a float nan column)
firm_values = df_similarity[firm_columns].to_numpy()
picked_firms = firm_values[np.arange(len(df_similarity)), best_column_positions]
## get_indexer yields -1 when no best column could be determined -> keep nan for such rows
best_match = np.where(best_column_positions >= 0, picked_firms, np.nan)

## place the best matching firm name directly after company_name
df_similarity.insert(df_similarity.columns.get_loc("company_name")+1, "best_match", best_match)

df_similarity[:4]

Unnamed: 0,index,isin,ticker,2008_2022,english,company_name,best_match,link_marius,link_hannes,link_hannes_strict,...,marius_cosine_original,link_found,link_found_strict,review_amount,review_amount_strict,similarity_marius,similarity_hannes,similarity_hannes_strict,similarity_mean,marius_equalto_hannes
0,0,US0577411004,,1,1,ABB Motors and Mechanical,ABB Motors and Mechanical Inc,https://www.glassdoor.com/Reviews/ABB-Motors-and-Mechanical-Inc-Reviews-E73.htm,https://www.glassdoor.com/Reviews/ABB-Motors-and-Mechanical-Inc-Reviews-E73.htm,https://www.glassdoor.com/Reviews/ABB-Motors-and-Mechanical-Inc-Reviews-E73.htm,...,0.818978,transformed overview link,transformed overview link,201.0,201.0,0.96,0.96,0.96,0.96,1
1,1,US4603371083,,1,1,International Specialty Products,International Specialty Products,https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm,https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm,https://www.glassdoor.com/Reviews/International-Specialty-Products-Reviews-E1114.htm,...,0.80993,original review link,original review link,8.0,8.0,1.0,1.0,1.0,1.0,1
2,2,CA61945Q1028,,1,1,Conversant Intellectual Property Management,Conversant Intellectual Property Management,https://www.glassdoor.com/Reviews/Conversant-Intellectual-Property-Management-Reviews-E1014209.htm,https://www.glassdoor.com/Reviews/Conversant-Intellectual-Property-Management-Reviews-E1014209.htm,https://www.glassdoor.com/Reviews/Conversant-Intellectual-Property-Management-Reviews-E1014209.htm,...,0.806629,transformed overview link,transformed overview link,5.0,5.0,1.0,1.0,1.0,1.0,1
3,3,CA3449191053,,1,1,Foreign Currency Exchange,Foreign Currency Exchange Corp,https://www.glassdoor.com/Reviews/Foreign-Currency-Exchange-Corp-Reviews-E260472.htm,,,...,0.798668,wrong link,wrong link,,,0.95,0.0,0.0,0.316667,0


### merge with target_dummy and clean duplicates

In [32]:
df_target = (
    pd.read_excel(r"W:\019_Glassdoor\1 Data\1 Glassdoor Links\Archive\target_dummy_list.xlsx")
    ## one isin can have a 0 and a 1 entry -> sort the 1 entries to the top ...
    .sort_values(by=["target_dummy"], ascending=False)
    .reset_index(drop=True)
    ## ... so that keeping the first row per isin drops the second (0) entry
    .drop_duplicates(subset=["isin"], keep="first")
)

## vlookup if firm has ever been a target
merged = df_similarity.merge(df_target, how="left", on="isin")

## move the target_dummy column next to the ticker column
target_dummy_values = merged.pop('target_dummy')
merged.insert(merged.columns.get_loc("ticker")+1, 'target_dummy', target_dummy_values)

df_similarity = (
    merged
    ## drop duplicate rows
    .drop_duplicates(subset=["isin", "target_dummy", "similarity_mean"], keep="first")
    ## best similarity first, then targets first, then most reviews first
    .sort_values(by=["similarity_mean", "target_dummy", "review_amount"], ascending=False)
)

df_similarity[200:230]

Unnamed: 0,index,isin,ticker,target_dummy,2008_2022,english,company_name,best_match,link_marius,link_hannes,...,marius_cosine_original,link_found,link_found_strict,review_amount,review_amount_strict,similarity_marius,similarity_hannes,similarity_hannes_strict,similarity_mean,marius_equalto_hannes
11454,11454,US6294911010,,1,1,1,NYSE,NYSE,https://www.glassdoor.com/Reviews/NYSE-Reviews-E3307.htm,https://www.glassdoor.com/Reviews/NYSE-Reviews-E3307.htm,...,0.122,transformed overview link,transformed overview link,318.0,318.0,1.0,1.0,1.0,1.0,1
2362,2362,US1413371055,,1,1,1,Carbonite,Carbonite,https://www.glassdoor.com/Reviews/Carbonite-Reviews-E304976.htm,https://www.glassdoor.com/Reviews/Carbonite-Reviews-E304976.htm,...,0.496537,original review link,transformed overview link,314.0,294.0,1.0,1.0,1.0,1.0,1
884,884,US5949011002,,1,1,1,MICROS Systems,MICROS Systems,https://www.glassdoor.com/Reviews/MICROS-Systems-Reviews-E1649.htm,https://www.glassdoor.com/Reviews/MICROS-Systems-Reviews-E1649.htm,...,0.607351,transformed overview link,transformed overview link,312.0,312.0,1.0,1.0,1.0,1.0,1
2347,2347,US5537771033,,1,1,1,MTS Systems,MTS Systems,https://www.glassdoor.com/Reviews/MTS-Systems-Reviews-E1608.htm,https://www.glassdoor.com/Reviews/MTS-Systems-Reviews-E1608.htm,...,0.497428,original review link,original review link,309.0,309.0,1.0,1.0,1.0,1.0,1
5412,5412,US1313473043,,1,1,1,Calpine,Calpine,https://www.glassdoor.com/Reviews/Calpine-Reviews-E6150.htm,https://www.glassdoor.com/Reviews/Calpine-Reviews-E6150.htm,...,0.365136,transformed overview link,transformed overview link,306.0,306.0,1.0,1.0,1.0,1.0,1
2342,2342,US45840J1079,,1,1,1,Interactive Data,Interactive Data,https://www.glassdoor.com/Reviews/Interactive-Data-Reviews-E3644.htm,https://www.glassdoor.com/Reviews/Interactive-Data-Reviews-E3644.htm,...,0.497701,original review link,original review link,305.0,305.0,1.0,1.0,1.0,1.0,1
3227,3227,US8978881030,,1,1,1,Trulia,Trulia,https://www.glassdoor.com/Reviews/Trulia-Reviews-E226815.htm,https://www.glassdoor.com/Reviews/Trulia-Reviews-E226815.htm,...,0.453943,transformed overview link,transformed overview link,304.0,304.0,1.0,1.0,1.0,1.0,1
9951,9951,US89969Q1040,,1,1,1,Tumi,Tumi,https://www.glassdoor.com/Reviews/Tumi-Reviews-E20782.htm,https://www.glassdoor.com/Reviews/Tumi-Reviews-E20782.htm,...,0.198976,transformed overview link,transformed overview link,304.0,304.0,1.0,1.0,1.0,1.0,1
318,318,US75968L1052,,1,1,1,Renaissance Learning,Renaissance Learning,https://www.glassdoor.com/Reviews/Renaissance-Learning-Reviews-E6746.htm,https://www.glassdoor.com/Reviews/Renaissance-Learning-Reviews-E6746.htm,...,0.678412,transformed overview link,transformed overview link,303.0,303.0,1.0,1.0,1.0,1.0,1
4692,4692,US71714F1049,,1,1,1,PharMerica,PharMerica,https://www.glassdoor.com/Reviews/PharMerica-Reviews-E894.htm,https://www.glassdoor.com/Reviews/PharMerica-Reviews-E894.htm,...,0.392168,transformed overview link,transformed overview link,303.0,303.0,1.0,1.0,1.0,1.0,1


In [33]:
## save file with dynamic file name: current month+day as prefix, written to the
## parent of the current working directory
file_name = f'{time.strftime("%m%d")}_LinksNamesReviews_bestmatch.csv'
path_with_time = Path.cwd().parent / file_name
df_similarity.to_csv(path_with_time, sep=";", decimal=",", index=False)
print(f"{path_with_time} saved")

w:\019_Glassdoor\1 Data\1 Glassdoor Links\0816_LinksNamesReviews_bestmatch.csv saved


In [17]:
## grid of thresholds: similarity 0.5 .. 1.0 in steps of 0.1 (range doesn't work with floats)
similarity_thresholds = [tenth / 10.0 for tenth in range(5, 11)]
## minimum review amounts 30 .. 70 in steps of 10
min_review_amounts = list(range(30, 80, 10))

## count the targets that remain for every threshold combination
for min_review_amount in min_review_amounts:
    for similarity_threshold in similarity_thresholds:
        ## the mean score lives in "similarity_mean"; df_similarity has no "mean" column,
        ## so the old lookup raised a KeyError on a fresh run
        selected = (
            (df_similarity["similarity_mean"] >= similarity_threshold)
            & (df_similarity["target_dummy"] == 1)
            & (df_similarity["review_amount"] >= min_review_amount)
        )
        rows = int(selected.sum())
        print(f"min review amount treshold: {min_review_amount} & similarity_threshold: {similarity_threshold} -> {rows} targets")
    print("\n")

min review amount treshold: 30 & similarity_threshold: 0.5 -> 1363 targets
min review amount treshold: 30 & similarity_threshold: 0.6 -> 1187 targets
min review amount treshold: 30 & similarity_threshold: 0.7 -> 1046 targets
min review amount treshold: 30 & similarity_threshold: 0.8 -> 884 targets
min review amount treshold: 30 & similarity_threshold: 0.9 -> 723 targets
min review amount treshold: 30 & similarity_threshold: 1.0 -> 684 targets


min review amount treshold: 40 & similarity_threshold: 0.5 -> 1224 targets
min review amount treshold: 40 & similarity_threshold: 0.6 -> 1065 targets
min review amount treshold: 40 & similarity_threshold: 0.7 -> 935 targets
min review amount treshold: 40 & similarity_threshold: 0.8 -> 797 targets
min review amount treshold: 40 & similarity_threshold: 0.9 -> 652 targets
min review amount treshold: 40 & similarity_threshold: 1.0 -> 620 targets


min review amount treshold: 50 & similarity_threshold: 0.5 -> 1119 targets
min review amount treshold: 

## examples

### vectorize strings & calc cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def text_similarity(row, column_name1, column_name2):
    """Return the word-count cosine similarity of two text fields of one row.

    Meant to be used row-wise, e.g.
    ``df.apply(text_similarity, args=("col_a", "col_b"), axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Object that is indexed with ``column_name1`` and ``column_name2``.
    column_name1, column_name2 : str
        Keys of the two texts that are compared.

    Returns
    -------
    float
        Cosine similarity of the two word-count vectors, rounded to 3 decimals
        (0-1; with 1 being perfectly identical).
    """
    ## vectorize both strings to word count vectors over their joint vocabulary
    count_vect = CountVectorizer(lowercase=True)
    texts = [row[column_name1], row[column_name2]]
    vectorized_texts = count_vect.fit_transform(texts).toarray()  # 2 rows, one per text

    ## cosine_similarity returns a 1x1 matrix here; take its single element explicitly,
    ## since calling float() on a whole array is deprecated in recent NumPy versions
    similarity = cosine_similarity(vectorized_texts[:1], vectorized_texts[1:])
    similarity_score = round(float(similarity[0, 0]), 3)
    return similarity_score

In [None]:
## row-wise similarity score; the two strings are the column names handed to text_similarity
## NOTE(review): both names are "company_name", so every row is compared with itself -
## confirm whether one of the firm_* columns was intended
df["Max_Cosine_Similarity_Hannes"] = df.apply(
    lambda row: text_similarity(row, "company_name", "company_name"),
    axis=1,
)
df

### example: vectorize strings & calc cosine similarity

In [None]:
def text_similarity(text1, text2):
    """Print the word-count vectors of two strings and return their cosine similarity.

    NOTE: this definition shadows the row-based ``text_similarity(row, column_name1,
    column_name2)`` defined earlier in this notebook once the cell has been run.

    Parameters
    ----------
    text1, text2 : str
        The two texts that are compared.

    Returns
    -------
    float
        Cosine similarity of the two word-count vectors, rounded to 3 decimals
        (0-1; with 1 being perfectly identical).
    """
    ## vectorize both strings to word count vectors over their joint vocabulary
    count_vect = CountVectorizer(lowercase=True)
    vectorized_texts = count_vect.fit_transform([text1, text2]).toarray()  # 2 rows, one per text

    ## show the count vectors as a table: one row per text, one column per word
    df_vectorized_texts = pd.DataFrame(vectorized_texts, columns=count_vect.get_feature_names_out(), index=["text1", "text2"])
    print(df_vectorized_texts)

    ## cosine_similarity returns a 1x1 matrix here; take its single element explicitly,
    ## since calling float() on a whole array is deprecated in recent NumPy versions
    similarity = cosine_similarity(vectorized_texts[:1], vectorized_texts[1:])
    similarity_score = round(float(similarity[0, 0]), 3)
    return similarity_score


text1 = "Champignon Brands"
text2 = "Champion Brands"
text_similarity(text1, text2)

#### Levenshtein Distance

In [None]:
from fuzzywuzzy import fuzz
## usage on a dataframe with two name columns:
# df['score'] = df[['Name Left','Name Right']].apply(lambda x : fuzz.partial_ratio(*x),axis=1)

## score two similar firm names with fuzz.partial_ratio
text1, text2 = "Champignon Brands", "Champion Brands"
fuzz.partial_ratio(text1, text2)

### String_Grouper Module

In [None]:
from string_grouper import compute_pairwise_similarities

## small demo frame: every entry is one (left, right) pair of firm names to compare
name_pairs = [
    ('Mega Enterprises Corporation', 'Mega Enterprises Corporation'),
    ('Hyper Startup Inc.', 'Hyper Startup Incorporated'),
    ('Hyper Startup Inc.', 'Hyper Startup Inc.'),
    ('Hyper Startup Inc.', 'Hyper-Startup Inc.'),
    ('Hyper Hyper Inc.', 'Hyper Hyper Inc.'),
    ('Mega Enterprises Corporation', 'Mega Enterprises Corp.'),
]
pair_s = pd.DataFrame(name_pairs, columns=['left', 'right'])

## similarity of each left name with the right name of the same row
pair_s['similarity'] = compute_pairwise_similarities(pair_s['left'], pair_s['right'])
pair_s