### Import modules

In [2]:
import spacy
import pandas as pd
import os
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from pyod.models import ocsvm, iforest, sos  
import scipy

### Create UDFs

In [2]:
def calc_distance(df_vectors, df_vector_averages):
    """Compute row-wise distances between two matching sets of vectors.

    Row ``i`` of ``df_vectors`` is compared only with row ``i`` of
    ``df_vector_averages``. The distances are computed directly per row, so
    memory use grows linearly with the number of rows. Building a full
    pairwise (n_rows x n_rows) matrix just to read its diagonal is avoided,
    because that matrix cannot be allocated for large inputs.

    Parameters
    ----------
    df_vectors : array-like of shape (n_rows, n_features)
        Numeric vectors, e.g. a DataFrame with one vector per row.
    df_vector_averages : array-like of shape (n_rows, n_features)
        Reference vectors, aligned by position with ``df_vectors``.

    Returns
    -------
    pandas.DataFrame
        Columns ``cos_dist``, ``euc_dist`` and ``manh_dist`` on a fresh
        0..n-1 index (the index of the inputs is not carried over).

    Raises
    ------
    ValueError
        If an input is not 2-D, the feature counts differ, or an input
        contains NaN / infinite values.
    """
    vectors = np.asarray(df_vectors, dtype=float)
    averages = np.asarray(df_vector_averages, dtype=float)

    if vectors.ndim != 2 or averages.ndim != 2:
        raise ValueError('calc_distance expects two 2-D inputs')
    if vectors.shape[1] != averages.shape[1]:
        raise ValueError('Inputs must have the same number of columns')
    if not (np.isfinite(vectors).all() and np.isfinite(averages).all()):
        # Fail loudly instead of silently returning NaN distances
        raise ValueError('Input contains NaN or infinity')

    # The diagonal of a pairwise matrix only pairs row i with row i, so any
    # surplus rows on either side never contributed to the result.
    n_rows = min(vectors.shape[0], averages.shape[0])
    vectors = vectors[:n_rows]
    averages = averages[:n_rows]

    # Cosine distance: 1 - cosine similarity of the L2-normalised rows.
    # All-zero rows are left as zeros, which gives similarity 0 / distance 1.
    vec_norms = np.linalg.norm(vectors, axis=1)
    avg_norms = np.linalg.norm(averages, axis=1)
    vec_norms[vec_norms == 0.0] = 1.0
    avg_norms[avg_norms == 0.0] = 1.0
    cos_sim = np.einsum('ij,ij->i',
                        vectors / vec_norms[:, np.newaxis],
                        averages / avg_norms[:, np.newaxis])
    # Clip away tiny floating point excursions outside the valid range
    cos_dist = pd.Series(np.clip(1.0 - cos_sim, 0.0, 2.0), name='cos_dist')

    diff = vectors - averages

    # Euclidean distance
    euc_dist = pd.Series(np.linalg.norm(diff, axis=1), name='euc_dist')

    # Manhattan distance
    manh_dist = pd.Series(np.abs(diff).sum(axis=1), name='manh_dist')

    # Build & return df
    df_dist = pd.concat([cos_dist, euc_dist, manh_dist], axis=1)
    return df_dist
    

In [3]:
def calc_anormality(df_vectors):
    """Score every row of ``df_vectors`` with three PyOD outlier detectors.

    Each detector is created with its default settings, fitted on
    ``df_vectors`` and its ``decision_scores_`` attribute is collected.

    Parameters
    ----------
    df_vectors : array-like
        The samples to fit the detectors on, one sample per row.

    Returns
    -------
    pandas.DataFrame
        One column per detector, in this order: ``iforest_ad``
        (isolation forest), ``ocsvm_ad`` (one-class SVM) and ``sos_ad``
        (stochastic outlier selection).
    """
    # (output column name, detector class) in the order they are fitted
    detectors = (
        ('iforest_ad', iforest.IForest),
        ('ocsvm_ad', ocsvm.OCSVM),
        ('sos_ad', sos.SOS),
    )

    score_columns = []
    for column_name, detector_cls in detectors:
        # fit() hands back the fitted detector, whose per-sample scores
        # are exposed as decision_scores_
        fitted = detector_cls().fit(df_vectors)
        score_columns.append(pd.Series(fitted.decision_scores_, name = column_name))

    # Build & return df
    return pd.concat(score_columns, axis=1)
    

### Generate company word vectors & distance scores from word embeddings

In [3]:
# Directories for the word-embedding step.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where these folders exist.

# Folder that is listed with os.listdir() to obtain the file names to process.
dir_path = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles')
# Folder the files with those names are read from with pd.read_csv().
dir_path_we = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_word_embeddings')
# Presumably the output folder for the full per-file CSV (metadata, vectors,
# distances, anomaly scores) -- TODO confirm against the export step.
path_dir_we_dist = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_word_embeddings_dist')
# Output folder for the reduced per-file CSV (metadata, distances, anomaly scores).
path_dir_dist_only = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_we_dist_only')

In [23]:
# For every file name found in dir_path: load the matching word-embedding
# file, compute leave-one-out company vectors, distance and anomaly scores,
# and export a full and a reduced CSV.
# NOTE(review): file names are taken from dir_path but the files are read
# from dir_path_we -- presumably both folders hold identically named files;
# confirm.
for file in os.listdir(dir_path):

    # Read file
    file_path = os.path.join(dir_path_we, file)
    df_temp = pd.read_csv(file_path)

    # Get metadata (label slice, both end columns included)
    df_meta = df_temp.loc[:, 'org_profile_link':'char_length']

    # Calculate company word vectors
    # Employee vectors are the columns labelled '0' through '299'.
    df_vectors = df_temp.loc[:,'0':'299']
    # Leave-one-out mean: for each row, the average of all OTHER rows in the
    # file (column sums minus the row itself, divided by n - 1).
    df_vector_averages = (df_vectors.sum(0) - df_vectors)/float(df_vectors.shape[0]-1)

    # Calculate distances 
    df_dist = calc_distance(df_vectors, df_vector_averages)
    
    # Calculate anormality 
    df_anorm = calc_anormality(df_vectors)

    # Merge dataframes 
    # Prefixes keep employee and company vector columns distinguishable.
    df_vectors = df_vectors.add_prefix('emp_')
    df_vector_averages = df_vector_averages.add_prefix('comp_')
    df_we_dist = pd.concat([df_meta, df_vectors, df_vector_averages, df_dist, df_anorm], axis=1)
    df_we_dist_only = pd.concat([df_meta, df_dist, df_anorm], axis=1)
    df_we_dist_only = df_we_dist_only.drop('person_summary', axis=1)
    
    # Export files 
    # Fix: the full CSV goes to path_dir_we_dist, the folder configured in the
    # paths cell. The name used before (path_dir_dist) is not defined in the
    # visible notebook and raises NameError on a fresh kernel.
    df_we_dist.to_csv(os.path.join(path_dir_we_dist, file))
    df_we_dist_only.to_csv(os.path.join(path_dir_dist_only, file))


  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > se

### Generate company word vectors & distance scores from LIWC

In [4]:
# Directories for the LIWC step (absolute, machine-specific paths).

# NOTE(review): dir_path_liwc is not referenced in the visible cells --
# presumably the folder the LIWC data is loaded from; confirm.
dir_path_liwc = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\LIWC_data')
# Output folder for the full per-company CSV; the later difference-score
# step also reads its input files from this folder.
path_dir_liwc_dist = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_liwc_dist')
# Output folder for the reduced per-company CSV (metadata, distances, anomaly scores).
path_dir_liwc_dist_only = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_liwc_dist_only')

In [76]:
# Companies whose processing raised an exception are collected here
failed = []

for comp_name in df_liwc_comp_names:

    try:
        # Rows of this company only, renumbered from zero
        df_liwc_temp = df_liwc.loc[df_liwc.company == comp_name].reset_index()
        df_meta = df_liwc_temp[['company', 'Filename.x', 'person_id']]
        df_liwc_temp_emp = df_liwc_temp.loc[:, 'Segment.x':'OtherP.x']

        # Leave-one-out company matrix: for each employee row, the mean of
        # all other rows of the same company
        n_other_rows = float(len(df_liwc_temp_emp) - 1)
        df_liwc_averages = (df_liwc_temp_emp.sum(axis=0) - df_liwc_temp_emp) / n_other_rows

        # Distance of each employee to the company matrix, then outlier scores
        df_dist = calc_distance(df_liwc_temp_emp, df_liwc_averages)
        df_anorm = calc_anormality(df_liwc_temp_emp)

        # Prefix the two feature sets so they stay distinguishable once merged
        df_liwc_temp_emp = df_liwc_temp_emp.add_prefix('emp_')
        df_liwc_averages = df_liwc_averages.add_prefix('comp_')

        # Full table (with features) and reduced table (scores only)
        score_frames = [df_dist, df_anorm]
        df_liwc_dist = pd.concat([df_meta, df_liwc_temp_emp, df_liwc_averages] + score_frames, axis=1)
        df_liwc_dist_only = pd.concat([df_meta] + score_frames, axis=1)

        # One CSV per company in each output folder
        out_name = comp_name + '.csv'
        df_liwc_dist.to_csv(os.path.join(path_dir_liwc_dist, out_name))
        df_liwc_dist_only.to_csv(os.path.join(path_dir_liwc_dist_only, out_name))

    except Exception as e:

        # Keep going with the next company; remember and report the failure
        failed.append(comp_name)
        print(comp_name)
        print(e)

  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  B = A / A.sum(axis=1)[:, np.newaxis]
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  B = A / A.sum(axis=1)[:, np.newaxis]
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).

  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > se

  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > se

us-army
Unable to allocate 30.7 GiB for an array with shape (64168, 64168) and data type float64


  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(
  beta[i] = beta[i] * 2.0
  self.labels_ = (self.decision_scores_ > self.threshold_).astype(


In [5]:
# Output folders for the LIWC difference-score step (absolute, machine-specific paths).

# Full CSV: all columns of the input file plus the diff_ columns.
path_dir_liwc_diff = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_liwc_diff')
# Reduced CSV: metadata columns plus the diff_ columns.
path_dir_liwc_diff_only = os.path.normpath(r'C:\Users\heinr\OneDrive\Desktop\LARGE DATA\LinkedIn\company_level_individual_profiles_liwc_diff_only')

In [6]:
# Files whose processing raised an exception are collected here
failed = []

# For every per-company LIWC file: compute employee-minus-company difference
# scores and export a full and a reduced CSV.
for file in os.listdir(path_dir_liwc_dist): 
    
    try: 
        # Read file (the first CSV column holds the saved index)
        file_path = os.path.join(path_dir_liwc_dist, file)
        df_temp = pd.read_csv(file_path, index_col=0)

        # Cut dataset
        df_meta = df_temp[['company', 'Filename.x', 'person_id']]
        df_liwc_temp_emp = df_temp.loc[:,'emp_Segment.x':'emp_OtherP.x']
        df_liwc_temp_comp = df_temp.loc[:,'comp_Segment.x':'comp_OtherP.x']

        # Calculate difference scores
        # .values makes the subtraction positional; the emp_/comp_ column
        # labels differ and would otherwise not align.
        # NOTE(review): assumes both slices hold the same features in the
        # same order -- confirm.
        df_diff = df_liwc_temp_emp - df_liwc_temp_comp.values
        # Remove exactly the leading 'emp_' prefix. str.strip('emp_') treats
        # its argument as a set of characters and would also eat leading
        # 'e', 'm', 'p' or '_' characters of the feature name itself.
        df_diff.columns = [
            col[len('emp_'):] if col.startswith('emp_') else col
            for col in df_diff.columns
        ]
        df_diff = df_diff.add_prefix('diff_')

        # Merge data
        df_liwc_diff = pd.concat([df_temp, df_diff], axis=1)
        df_liwc_diff_only = pd.concat([df_meta, df_diff], axis=1)

        # Export files 
        df_liwc_diff.to_csv(os.path.join(path_dir_liwc_diff, file))
        df_liwc_diff_only.to_csv(os.path.join(path_dir_liwc_diff_only, file))
    
    except Exception as e:

        # Report the file that is being processed in THIS loop. The name
        # comp_name belongs to a different cell and is either stale or
        # undefined here.
        failed.append(file)
        print(file)
        print(e)
    
    