##### Import the necessary libraries

In [None]:
import pandas as pd
import gensim
import gensim.models.word2vec as w2v
import nltk
import re
import os
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import string
import ace_tools as tools

In [3]:
# Fetch the NLTK resources this notebook relies on into a local data folder.
# 'stopwords' backs stopwords.words('english'); 'punkt' / 'punkt_tab' are the
# Punkt tokenizer models that word_tokenize() loads (which of the two is
# required depends on the installed NLTK version), so request both.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, download_dir=r'D:\nltk_data')

# Make the folder visible to NLTK's resource lookup; skip the append when the
# cell is re-run so the search path does not accumulate duplicates.
if r'D:\nltk_data' not in nltk.data.path:
    nltk.data.path.append(r'D:\nltk_data')

[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
# Input folder: its directory listing is filtered for '.csv' files further down.
data_folder = r'D:/wrdsTables/ciqtranscriptcomponent_chunks_scored'

# Output folder intended for the processed CSV files.
output_directory = r'D:/wrdsTables/corporate_culture_analysis'
os.makedirs(output_directory, exist_ok=True) # exist_ok=True: no error if the folder is already there


In [None]:
# os.listdir() returns entries in arbitrary order; sort the names so the files
# are always loaded (and their rows concatenated) in the same order on every run.
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))
print(f"Found {len(csv_files)} CSV files.")

##### Function to load all CSV files into a single DataFrame

In [None]:
def load_csv_files(folder, file_list):
    """Read every CSV named in *file_list* from *folder* into one DataFrame.

    Each file that is read successfully gets a ``source_file`` column holding
    its file name. A file that cannot be read is reported on stdout and
    skipped, so one bad file does not abort the whole load.

    Args:
        folder: Directory that contains the files.
        file_list: Iterable of file names relative to *folder*.

    Returns:
        One DataFrame with a fresh 0..n-1 index. When *file_list* is empty or
        no file could be read, an empty DataFrame (with only the
        ``source_file`` column) is returned instead of raising.
    """
    dataframes = []
    for file in file_list:
        file_path = os.path.join(folder, file)
        try:
            frame = pd.read_csv(file_path)
        except Exception as e:  # deliberate best-effort: report and move on
            print(f"Error reading {file}: {e}")
            continue
        frame['source_file'] = file
        dataframes.append(frame)

    # pd.concat() raises ValueError on an empty list, so handle that case here.
    if not dataframes:
        return pd.DataFrame(columns=['source_file'])
    return pd.concat(dataframes, ignore_index=True)

# Read every discovered CSV into one combined DataFrame
df = load_csv_files(folder=data_folder, file_list=csv_files)

print(f"Total rows loaded: {df.shape[0]}")

##### Preprocess text

In [16]:
# Cache stopwords list as a set for faster lookup:
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every preprocess_text() call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Turn one raw text value into a list of lower-cased, filtered tokens.

    Steps: lower-case, delete ASCII punctuation, tokenize with
    ``word_tokenize``, then drop tokens found in ``stop_words``.

    Args:
        text: The raw cell value. Null values (as judged by ``pd.isnull``)
            yield an empty list; any other non-string value is converted
            with ``str()`` first.

    Returns:
        list[str]: The remaining tokens, in their original order.

    NOTE(review): punctuation is removed *before* the stop-word filter, so a
    contraction such as "don't" becomes "dont" and may no longer match the
    stop-word set - confirm this is intended.
    """
    if pd.isnull(text):
        return []
    # A cell that pandas parsed as a number has no .lower(); coerce so one
    # stray non-string value does not abort the whole .apply().
    cleaned = str(text).lower().translate(_PUNCTUATION_TABLE)
    return [word for word in word_tokenize(cleaned) if word not in stop_words]

# Tokenize every 'componenttext' entry; the token lists go into a new 'tokens' column
df['tokens'] = df['componenttext'].map(preprocess_text)

##### Train Word2Vec on All Transcripts

In [17]:
# Word2Vec consumes an iterable of token lists; each row's token list is
# passed as one "sentence"
sentences = list(df['tokens'])

In [20]:
# Train the word-embedding model on the token lists.
word2vec_model = w2v.Word2Vec(
    sentences=sentences,
    vector_size=300,  # dimensionality of each word vector
    window=5,         # max distance between the current and the predicted word within a sentence
    min_count=5,      # words that appear fewer than 5 times in total are ignored
    workers=2,        # number of worker threads used for training
)

##### Save model

In [21]:
# Persist the trained model; the relative file name resolves against the
# current working directory.
word2vec_model.save('w2v_corpculture.model')

##### Generate the Corporate Culture Dictionary
###### Seed words: the top 30 words associated with each value, taken from the Internet Appendix of Li et al.

In [23]:
# Seed words for each culture value: maps the value name to its list of 30 seed terms.
# NOTE(review): several entries are underscore-joined phrases (e.g. "business_model",
# "on_behalf_of"). preprocess_text() deletes all punctuation, including "_", so no
# token produced in this notebook contains an underscore; such seeds would only be
# found in the model vocabulary if a phrase-joining step is added - confirm.
culture_values = {
    "innovation" : ["brand", "technology", "focus", "great", "platform", "ability", "best", "design", "create", "solution", "develop", "success", "content", "capability", "effort", "successful", "efficiency", "productivity", "learn", "unique", "tool", "innovation", "efficient", "terrific", "execution", "exciting", "enhance", "business_model", "enable", "discipline"],
    "integrity" : ["control", "management", "careful", "honestly", "regulator", "honest", "safety", "assure", "compliance", "trust", "disciplined", "responsible", "proper", "responsibility", "thoughtful", "convince", "seriously", "transparent", "expert", "consistency", "candidly", "transparency", "authority", "responsive", "truth", "principle", "comply", "board_director", "thorough", "conflict"],
    "quality" : ["customer", "product", "client", "service", "build", "deliver", "network", "support", "quality", "sales_force", "infrastructure", "supplier", "serve", "commit", "field", "commitment", "delivery", "vendor", "customer_base", "supply_chain", "critical", "requirement", "ensure", "speed", "desire", "productive", "guest", "service_provider", "capable", "functionality"],
    "respect" : ["people", "team", "company", "hire", "folk", "organization", "resource", "employee", "management_team", "train", "training", "senior", "staff", "member", "leader", "person", "proud", "talent", "leadership", "manager", "ceo", "knowledge", "engineer", "recruit", "salespeople", "sales_team", "consultant", "culture", "sales_organization", "advisor"],
    "teamwork" : ["partner", "relationship", "discussion", "together", "integrate", "involve", "conversation", "integration", "partnership", "engage", "align", "explore", "communication", "dialogue", "engagement", "contact", "conduct", "on_behalf_of", "joint", "collaboration", "sponsor", "conjunction", "supportive", "alliance", "merge", "interaction", "put_together", "organize", "embrace", "assist"]
}

##### Function to get similar words

In [None]:
def generate_culture_dictionary(model, culture_values, top_n=50):
    """Expand each culture value's seed words into a list of related words.

    For every seed word that is present in ``model.wv``, the words returned
    by ``model.wv.most_similar(seed, topn=top_n)`` are collected. Seeds that
    are missing from the vocabulary are skipped. Seed words are not added
    explicitly; one appears in the result only if it is returned as a
    neighbour of another seed.

    Args:
        model: Object whose ``wv`` attribute supports ``in`` and
            ``most_similar(word, topn=...)`` returning ``(word, score)`` pairs.
        culture_values: Mapping of value name -> iterable of seed words.
        top_n: Number of neighbours requested per seed word.

    Returns:
        dict: value name -> list of unique neighbour words, in first-seen
        order (seed order, then neighbour rank), so repeated runs on the same
        model give the same list ordering.
    """
    culture_dictionary = {}
    for value, seed_words in culture_values.items():
        # A dict keeps insertion order, so it de-duplicates without the
        # run-dependent ordering that list(set(...)) would produce.
        unique_words = {}
        for seed in seed_words:
            if seed not in model.wv:
                continue
            for word, _similarity in model.wv.most_similar(seed, topn=top_n):
                unique_words.setdefault(word, None)
        culture_dictionary[value] = list(unique_words)
    return culture_dictionary

# Build the expanded dictionary from the trained model and the seed lists
culture_dictionary = generate_culture_dictionary(
    model=word2vec_model,
    culture_values=culture_values,
)

# Preview: at most ten words per culture value
for key in culture_dictionary:
    words = culture_dictionary[key]
    print(f"\n{key.upper()}: {words[:10]}")

##### Convert tokenized text back into a corpus format

In [26]:
# Re-join each token list into a single space-separated string
df['processed_text'] = df['tokens'].map(' '.join)

##### Compute TF-IDF Scores

In [28]:
# Fit a TF-IDF model with default settings on the space-joined token strings
# and transform them in the same pass.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])  # sparse matrix: one row per document, one column per term
tfidf_feature_names = vectorizer.get_feature_names_out()  # term for each matrix column, in column order

##### Convert to dataframe

In [30]:
# Wrap the sparse TF-IDF matrix in a DataFrame: columns are the terms, rows
# reuse df's index so the two frames line up.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    data=tfidf_matrix,
    index=df.index,
    columns=tfidf_feature_names,
)


##### Compute Culture Scores

In [None]:
def compute_culture_scores(tfidf_df, culture_dictionary):
    """Sum TF-IDF weights per row for the words of each culture value.

    Args:
        tfidf_df: DataFrame with one column per term.
        culture_dictionary: Mapping of value name -> list of words.

    Returns:
        DataFrame indexed like *tfidf_df* with one column per culture value.
        A value whose words match no column of *tfidf_df* gets all zeros.
    """
    row_count = len(tfidf_df)
    per_value = {}
    for value, words in culture_dictionary.items():
        # Keep only the words that exist as columns in the TF-IDF frame.
        matched = [term for term in words if term in tfidf_df.columns]
        if not matched:
            per_value[value] = np.zeros(row_count)
            continue
        per_value[value] = tfidf_df[matched].sum(axis=1)
    return pd.DataFrame(per_value, index=tfidf_df.index)

# Generate culture scores
culture_scores = compute_culture_scores(tfidf_df, culture_dictionary)

# Merge with original dataset (column-wise; culture_scores carries df's index)
df = pd.concat([df, culture_scores], axis=1)

# Save results into the output folder created earlier in the notebook, rather
# than into whatever the current working directory happens to be.
output_path = os.path.join(output_directory, "corporate_culture_scores.csv")
df.to_csv(output_path, index=False)