### Import modules

In [1]:
import spacy
import pandas as pd
import os
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from pyod.models import ocsvm, iforest, sos  
import scipy

### Create UDFs

In [2]:
def calc_distance(df_vectors, df_vector_averages):
    """Compute row-wise distances between vectors and their reference vectors.

    Row ``i`` of ``df_vectors`` is compared only with row ``i`` of
    ``df_vector_averages``. The distances are computed directly per row pair,
    so memory and run time grow linearly with the number of rows (no
    ``n_rows x n_rows`` pairwise matrix is materialised).

    Parameters
    ----------
    df_vectors : array-like of shape (n_rows, n_features)
        One vector per row; must be convertible to a float array.
    df_vector_averages : array-like of shape (n_rows, n_features)
        Reference vector for each row of ``df_vectors``.

    Returns
    -------
    pandas.DataFrame
        Columns ``cos_dist``, ``euc_dist`` and ``manh_dist`` (in that order)
        on a default integer index, one row per input row.

    Raises
    ------
    ValueError
        If an input is not 2-D, if the two shapes differ, or if either input
        contains NaN or infinite values.
    """
    vectors = np.asarray(df_vectors, dtype=float)
    averages = np.asarray(df_vector_averages, dtype=float)

    if vectors.ndim != 2 or averages.ndim != 2:
        raise ValueError('calc_distance expects two 2-D inputs')
    if vectors.shape != averages.shape:
        raise ValueError(
            'Shape mismatch: {} vs {}'.format(vectors.shape, averages.shape)
        )
    # Fail loudly instead of silently writing NaN distances downstream
    # (e.g. a leave-one-out average over a single row divides by zero).
    if not (np.isfinite(vectors).all() and np.isfinite(averages).all()):
        raise ValueError('Input contains NaN or infinite values')

    # Cosine distance: 1 - cosine similarity of each row pair.
    # A zero-length vector keeps a divisor of 1, so its similarity is 0 and
    # its distance is 1 rather than NaN.
    vec_norms = np.linalg.norm(vectors, axis=1)
    avg_norms = np.linalg.norm(averages, axis=1)
    vec_norms[vec_norms == 0.0] = 1.0
    avg_norms[avg_norms == 0.0] = 1.0
    cos_sim = (vectors * averages).sum(axis=1) / (vec_norms * avg_norms)
    # Clip away floating point overshoot so the result stays within [0, 2].
    cos_dist = pd.Series(np.clip(1.0 - cos_sim, 0.0, 2.0), name='cos_dist')

    # Euclidean and Manhattan distance share the same row-wise difference.
    diff = vectors - averages
    euc_dist = pd.Series(np.linalg.norm(diff, axis=1), name='euc_dist')
    manh_dist = pd.Series(np.abs(diff).sum(axis=1), name='manh_dist')

    # Build & return df
    df_dist = pd.concat([cos_dist, euc_dist, manh_dist], axis=1)
    return df_dist
    

In [3]:
def calc_anormality(df_vectors, random_state=None):
    """Score every row of ``df_vectors`` with three PyOD outlier detectors.

    Each detector is fitted on ``df_vectors`` with its default settings and
    its ``decision_scores_`` (one score per fitted row) are collected.

    Parameters
    ----------
    df_vectors : array-like of shape (n_rows, n_features)
        Data on which the detectors are fitted and scored.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the isolation forest, the only detector here that
        accepts one. Pass an integer to make the ``iforest_ad`` column
        reproducible between runs; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``iforest_ad``, ``ocsvm_ad`` and ``sos_ad`` (in that order)
        on a default integer index, one row per input row.
    """
    # Output column name -> unfitted detector, in output column order.
    detectors = (
        ('iforest_ad', iforest.IForest(random_state=random_state)),  # Isolation forest
        ('ocsvm_ad', ocsvm.OCSVM()),                                 # One-class SVM
        ('sos_ad', sos.SOS()),                                       # Stochastic outlier selection
    )

    score_columns = []
    for column_name, detector in detectors:
        fitted = detector.fit(df_vectors)
        score_columns.append(pd.Series(fitted.decision_scores_, name=column_name))

    # Build & return df
    df_anorm = pd.concat(score_columns, axis=1)
    return df_anorm
    

### Generate company word vectors & distance scores from word embeddings

In [21]:
# Folders used by the word-embedding cell below, normalised in one pass:
#   dir_path           - folder whose file names drive the loop
#   dir_path_we        - folder the per-file CSVs are read from
#   path_dir_we_dist   - output folder (by its name: embeddings plus distances)
#   path_dir_dist_only - output folder for the metadata/distance/anomaly tables
dir_path, dir_path_we, path_dir_we_dist, path_dir_dist_only = (
    os.path.normpath(raw_path)
    for raw_path in (
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles',
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_word_embeddings',
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_word_embeddings_dist',
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_we_dist_only',
    )
)

In [23]:
# For every per-company file: compare each row's embedding with the average
# embedding of all *other* rows in the same file, add anomaly scores, and
# export one full and one compact CSV.
for file in os.listdir(dir_path):

    # Read file
    # NOTE(review): names are listed from dir_path but the data is read from
    # dir_path_we; this assumes both folders hold identically named files.
    file_path = os.path.join(dir_path_we, file)
    df_temp = pd.read_csv(file_path)

    # Get metadata
    df_meta = df_temp.loc[:, 'org_profile_link':'char_length']

    # Calculate company word vectors
    df_vectors = df_temp.loc[:,'0':'299']
    # Leave-one-out mean: column totals minus the row itself, divided by the
    # number of remaining rows (a file with a single row divides by zero).
    df_vector_averages = (df_vectors.sum(0) - df_vectors)/float(df_vectors.shape[0]-1)

    # Calculate distances 
    df_dist = calc_distance(df_vectors, df_vector_averages)
    
    # Calculate anormality 
    df_anorm = calc_anormality(df_vectors)

    # Merge dataframes 
    df_vectors = df_vectors.add_prefix('emp_')
    df_vector_averages = df_vector_averages.add_prefix('comp_')
    df_we_dist = pd.concat([df_meta, df_vectors, df_vector_averages, df_dist, df_anorm], axis=1)
    df_we_dist_only = pd.concat([df_meta, df_dist, df_anorm], axis=1)
    df_we_dist_only = df_we_dist_only.drop('person_summary', axis=1)
    
    # Export files 
    # The full table goes to path_dir_we_dist; the previously referenced
    # name `path_dir_dist` is not defined anywhere in this notebook.
    df_we_dist.to_csv(os.path.join(path_dir_we_dist, file))
    df_we_dist_only.to_csv(os.path.join(path_dir_dist_only, file))


  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > se

### Generate company word vectors & distance scores from LIWC

In [23]:
# Folders used by the LIWC cells below, normalised in one pass:
#   dir_path_liwc           - folder containing the LIWC input CSV
#   path_dir_liwc_dist      - output folder for the full per-company tables
#   path_dir_liwc_dist_only - output folder for the compact per-company tables
dir_path_liwc, path_dir_liwc_dist, path_dir_liwc_dist_only = (
    os.path.normpath(raw_path)
    for raw_path in (
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\LIWC_data',
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_liwc_dist',
        r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_liwc_dist_only',
    )
)

In [24]:
# Name of the CSV file to load from dir_path_liwc.
# NOTE(review): "idividual" looks like a typo, but the literal has to match
# the file name on disk, so it is left untouched.
file = 'LIWC_idividual_company_mapped.csv'
# Full path of that file inside the LIWC data folder.
file_path = os.path.join(dir_path_liwc, file)

In [28]:
# Load the LIWC table from file_path.
df_liwc = pd.read_csv(file_path)
# Keep only the columns whose label contains ".x", "company" or "person".
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
df_liwc = df_liwc.filter(regex=r'\.x|company|person', axis=1)
# Distinct company names, in order of first appearance.
df_liwc_comp_names = df_liwc.company.drop_duplicates()

In [51]:
# Inspect the rows of a single company as a sanity check.
df_liwc.loc[df_liwc['company'] == '24-hour-fitness']

Unnamed: 0,company,Filename.x,Segment.x,WC.x,Analytic.x,Clout.x,Authentic.x,Tone.x,WPS.x,Sixltr.x,...,Colon.x,SemiC.x,QMark.x,Exclam.x,Dash.x,Quote.x,Apostro.x,Parenth.x,OtherP.x,person_id
0,24-hour-fitness,24-hour-fitness__37335508_747_First_1000_1.txt,1,61,96.74,74.41,5.64,83.88,15.25,39.34,...,0.00,0.0,0.00,0.0,3.28,3.28,0.00,3.28,4.92,37335508_747_First_1000_1
1,24-hour-fitness,24-hour-fitness__37749383_755_First_1000_1.txt,1,46,99.00,50.00,40.55,66.89,11.50,39.13,...,2.17,0.0,0.00,0.0,0.00,4.35,0.00,0.00,0.00,37749383_755_First_1000_1
2,24-hour-fitness,24-hour-fitness__37743049_755_First_1000_1.txt,1,46,96.43,80.77,40.55,99.00,15.33,50.00,...,0.00,0.0,0.00,0.0,2.17,4.35,0.00,0.00,0.00,37743049_755_First_1000_1
3,24-hour-fitness,24-hour-fitness__37345321_747_First_1000_1.txt,1,83,96.69,83.25,24.54,25.77,11.86,49.40,...,0.00,0.0,0.00,0.0,1.20,2.41,0.00,0.00,7.23,37345321_747_First_1000_1
4,24-hour-fitness,24-hour-fitness__37402293_749_First_1000_1.txt,1,73,91.90,89.11,1.00,99.00,18.25,50.68,...,1.37,0.0,0.00,0.0,0.00,2.74,0.00,0.00,0.00,37402293_749_First_1000_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2719,24-hour-fitness,24-hour-fitness__52201200_1045_1000_1500.txt,1,63,99.00,68.29,27.77,25.77,21.00,57.14,...,3.17,0.0,0.00,0.0,14.29,3.17,0.00,0.00,0.00,52201200_1045_1000_1500
2720,24-hour-fitness,24-hour-fitness__52301817_1047_1000_1500.txt,1,88,99.00,50.00,14.96,99.00,17.60,48.86,...,0.00,0.0,0.00,0.0,0.00,2.27,0.00,0.00,0.00,52301817_1047_1000_1500
2721,24-hour-fitness,24-hour-fitness__1623556_33_First_1000_0.txt,1,49,81.96,20.71,82.17,99.00,24.50,34.69,...,0.00,0.0,0.00,0.0,4.08,4.08,0.00,0.00,0.00,1623556_33_First_1000_0
2722,24-hour-fitness,24-hour-fitness__71300218_1427_1000_1500.txt,1,44,98.93,32.48,85.21,68.66,8.80,20.45,...,0.00,0.0,0.00,0.0,0.00,4.55,0.00,0.00,4.55,71300218_1427_1000_1500


In [None]:
# Companies whose processing raised an exception are collected here.
failed = []

# For every company: compare each row's LIWC features with the average of all
# *other* rows of the same company, add anomaly scores, and export one full
# and one compact CSV named after the company.
for comp_name in df_liwc_comp_names:

    try:
        # Cut dataset: rows of this company, renumbered from 0
        df_liwc_temp = df_liwc.loc[df_liwc['company'] == comp_name].reset_index()
        df_meta = df_liwc_temp.loc[:, ['company', 'Filename.x', 'person_id']]
        df_liwc_temp_emp = df_liwc_temp.loc[:, 'Segment.x':'OtherP.x']

        # Calculate company matrix (leave-one-out mean per row)
        n_others = float(len(df_liwc_temp_emp) - 1)
        column_totals = df_liwc_temp_emp.sum(axis=0)
        df_liwc_averages = (column_totals - df_liwc_temp_emp) / n_others

        # Calculate distances
        df_dist = calc_distance(df_liwc_temp_emp, df_liwc_averages)

        # Calculate anormality
        df_anorm = calc_anormality(df_liwc_temp_emp)

        # Merge dataframes
        df_liwc_temp_emp = df_liwc_temp_emp.add_prefix('emp_')
        df_liwc_averages = df_liwc_averages.add_prefix('comp_')
        df_liwc_dist = pd.concat(
            [df_meta, df_liwc_temp_emp, df_liwc_averages, df_dist, df_anorm],
            axis=1,
        )
        df_liwc_dist_only = pd.concat([df_meta, df_dist, df_anorm], axis=1)

        # Export files
        out_name = comp_name + '.csv'
        df_liwc_dist.to_csv(os.path.join(path_dir_liwc_dist, out_name))
        df_liwc_dist_only.to_csv(os.path.join(path_dir_liwc_dist_only, out_name))

    except Exception as err:
        # Best effort: remember the company, report the error, carry on.
        failed.append(comp_name)
        print(comp_name, err, sep='\n')

  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  B = A / A.sum(axis=1)[:, np.newaxis]
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  B = A / A.sum(axis=1)[:, np.newaxis]
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).