# Specificity 
Hope, O. K., Hu, D., & Lu, H. (2016). The benefits of specific risk-factor disclosures. Review of Accounting Studies, 21(4), 1005-1045.

# Code Reference
The code is from https://www.linguisticsweb.org/doku.php?id=linguisticsweb:tutorials:automaticannotation:stanford_ner_tagger_python

In [5]:
import os
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [6]:
from collections import Counter
import pandas as pd

In [3]:
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jre-1.8'
os.environ['PATH'] += os.pathsep + r'C:\Program Files\Java\jre-1.8\bin'


In [4]:
# Paths to model and jar
model = "D:\Data\Stanford_NER\stanford-ner-4.2.0\stanford-ner-2020-11-17\classifiers\english.muc.7class.distsim.crf.ser.gz"
jar = "D:\Data\Stanford_NER\stanford-ner-4.2.0\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"

In [5]:
# Create the tagger
ner_tagger = StanfordNERTagger(model, jar, encoding="utf-8")

# Test with an example

In [15]:
# Your input text (replace with your own file or string)
text = "Joe Riddle was born in Hawaii at 7:00 AM on August 4, 1961, holding 400 dollars. Also it is grate to meet you at this early hour."

In [16]:
# Tokenize
words = word_tokenize(text)
# Tag
classified_words = ner_tagger.tag(words)
print(classified_words)

[('Joe', 'PERSON'), ('Riddle', 'PERSON'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'LOCATION'), ('at', 'O'), ('7:00', 'TIME'), ('AM', 'TIME'), ('on', 'O'), ('August', 'DATE'), ('4', 'DATE'), (',', 'DATE'), ('1961', 'DATE'), (',', 'O'), ('holding', 'O'), ('400', 'MONEY'), ('dollars', 'MONEY'), ('.', 'O'), ('Also', 'O'), ('it', 'O'), ('is', 'O'), ('grate', 'O'), ('to', 'O'), ('meet', 'O'), ('you', 'O'), ('at', 'O'), ('this', 'O'), ('early', 'O'), ('hour', 'O'), ('.', 'O')]


# Practice with real text

In [7]:
def read_txt_files_to_df(txt_dir):
    data = []
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            filepath = os.path.join(txt_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()
                data.append({'filename': filename, 'content': content})
    return pd.DataFrame(data)

# Directory containing your txt files
file_dir = r'D:\Data\CapitalIQ_Transcript\Txt_TestRun_v1'

# Read files into DataFrame
df = read_txt_files_to_df(file_dir)
print(df.head())  # preview

    filename                                            content
0  10098.txt  The military revenue for the fourth quarter wa...
1  11069.txt  It's a couple of questions in there. Let me st...
2   1157.txt  Let me start, Gregg, by saying that with respe...
3  11681.txt  Sure, Simon. First of all, thank you for your ...
4  11812.txt  Okay. Matt, I'll take that. And we're looking ...


In [7]:
# --- Efficient Batch NER Tagging ---
def batch_ner_tag(text_list, tagger, batch_size=500):
    tagged_results = []
    total = len(text_list)
    for i in range(0, total, batch_size):
        batch = text_list[i:i+batch_size]
        tokenized_batch = [word_tokenize(text) if isinstance(text, str) else [] for text in batch]
        tagged_batch = tagger.tag_sents(tokenized_batch)
        tagged_results.extend(tagged_batch)
    return tagged_results

# --- Count Entities ---
def count_entities(tagged):
    counts = Counter(tag for _, tag in tagged)
    return counts

In [8]:
# --- Process DataFrame ---
# 1. Efficient NER tagging (add ner_tags column)
df['ner_tags'] = batch_ner_tag(df['content'].tolist(), ner_tagger, batch_size=500)

# 2. Count entity types for each row (add temp counts DataFrame)
entity_counts_df = df['ner_tags'].apply(count_entities).apply(pd.Series)

# 3. Always have all 8 columns (fill missing with zeros)
entity_columns = ['O', 'PERSON', 'LOCATION', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE', 'TIME']
entity_counts_df = entity_counts_df.reindex(columns=entity_columns, fill_value=0)

In [9]:
# 4. Concatenate back to the original df
df_final = pd.concat([df, entity_counts_df], axis=1)

# --- Preview results ---
print(df_final.head())

    filename                                            content  \
0  10098.txt  The military revenue for the fourth quarter wa...   
1  11069.txt  It's a couple of questions in there. Let me st...   
2   1157.txt  Let me start, Gregg, by saying that with respe...   
3  11681.txt  Sure, Simon. First of all, thank you for your ...   
4  11812.txt  Okay. Matt, I'll take that. And we're looking ...   

                                            ner_tags       O  PERSON  \
0  [(The, O), (military, O), (revenue, O), (for, ...  1656.0     3.0   
1  [(It, O), ('s, O), (a, O), (couple, O), (of, O...  1175.0     1.0   
2  [(Let, O), (me, O), (start, O), (,, O), (Gregg...  6262.0    50.0   
3  [(Sure, O), (,, O), (Simon, PERSON), (., O), (...  2313.0     1.0   
4  [(Okay, O), (., O), (Matt, O), (,, O), (I, O),...  5066.0    19.0   

   LOCATION  ORGANIZATION  MONEY  PERCENT  DATE  TIME  
0       1.0          18.0   36.0      NaN  37.0   7.0  
1       2.0          15.0    NaN      2.0   1.0   Na

In [10]:
entity_columns = ['O', 'PERSON', 'LOCATION', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE', 'TIME']
df_final['sum_token'] = df_final[entity_columns].fillna(0).sum(axis=1)
print(df_final.head())  # Shows all columns, including sum_token

    filename                                            content  \
0  10098.txt  The military revenue for the fourth quarter wa...   
1  11069.txt  It's a couple of questions in there. Let me st...   
2   1157.txt  Let me start, Gregg, by saying that with respe...   
3  11681.txt  Sure, Simon. First of all, thank you for your ...   
4  11812.txt  Okay. Matt, I'll take that. And we're looking ...   

                                            ner_tags       O  PERSON  \
0  [(The, O), (military, O), (revenue, O), (for, ...  1656.0     3.0   
1  [(It, O), ('s, O), (a, O), (couple, O), (of, O...  1175.0     1.0   
2  [(Let, O), (me, O), (start, O), (,, O), (Gregg...  6262.0    50.0   
3  [(Sure, O), (,, O), (Simon, PERSON), (., O), (...  2313.0     1.0   
4  [(Okay, O), (., O), (Matt, O), (,, O), (I, O),...  5066.0    19.0   

   LOCATION  ORGANIZATION  MONEY  PERCENT  DATE  TIME  sum_token  
0       1.0          18.0   36.0      NaN  37.0   7.0     1758.0  
1       2.0          15.0    N

# Forward Looking Information
Muslu, V., Radhakrishnan, S., Subramanyam, K. R., & Lim, D. (2015). Forward-looking MD&A disclosures and the information environment. Management Science, 61(5), 931-948.

Muslu et al classify future sentence different way I use machine learning method. Muslu et al further distinguish operations; finance; accounting and I follow the method. 

# Code reference
 https://huggingface.co/FinanceInc/finbert_fls
 This is from Allen Huang 
 https://www.allenhuang.org/coding.html

 specific FLS means specific like date or dates or targets.

In [2]:
pip install torch transformers


Collecting torch
  Downloading torch-2.7.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
     ---------------------------------------- 0.0/40.9 kB ? eta -:--:--
     -------------------------------------- 40.9/40.9 kB 989.3 kB/s eta 0:00:00
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting tokenizers<0.22,>=0.21 (fro


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# This assumes df is your DataFrame from before
# We'll create a new DataFrame with each sentence as a row, with its source filename

sentences = []

for _, row in df.iterrows():
    filename = row['filename']
    content = row['content']
    for sentence in sent_tokenize(content):
        sentences.append({'filename': filename, 'sentence': sentence})

sent_df = pd.DataFrame(sentences)

print(sent_df)

        filename                                           sentence
0      10098.txt  The military revenue for the fourth quarter wa...
1      10098.txt  Well, we're expecting the military business to...
2      10098.txt  So we're looking at about $6 million in milita...
3      10098.txt  We expect our imaging physics business to gain...
4      10098.txt  That will most likely be in the second half of...
...          ...                                                ...
16239   9728.txt  Well if that’s all the questions, we like to t...
16240   9728.txt         It was a good quarter and was a good year.
16241   9728.txt  We made a lot of progress through the year and...
16242   9728.txt                               Thank you very much.
16243   9728.txt  If you haves any other questions, feel free to...

[16244 rows x 2 columns]


In [12]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load model and tokenizer
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')

# Build pipeline
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer, truncation=True)

# Function to classify a single sentence
def classify_sentence(sentence):
    result = nlp(str(sentence))[0]
    return result['label']

# Apply the classification to each sentence in the DataFrame
sent_df['FLS_label'] = sent_df['sentence'].apply(classify_sentence)

# If you want to see the result
print(sent_df[['sentence', 'FLS_label']].head())


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


                                            sentence     FLS_label
0  The military revenue for the fourth quarter wa...       Not FLS
1  Well, we're expecting the military business to...  Specific FLS
2  So we're looking at about $6 million in milita...       Not FLS
3  We expect our imaging physics business to gain...  Specific FLS
4  That will most likely be in the second half of...  Specific FLS


In [13]:
# If you want to see the result
print(sent_df.head())

    filename                                           sentence     FLS_label
0  10098.txt  The military revenue for the fourth quarter wa...       Not FLS
1  10098.txt  Well, we're expecting the military business to...  Specific FLS
2  10098.txt  So we're looking at about $6 million in milita...       Not FLS
3  10098.txt  We expect our imaging physics business to gain...  Specific FLS
4  10098.txt  That will most likely be in the second half of...  Specific FLS


# Add Muslu's classification for operation, finance, and accounting

In [20]:
import re

In [16]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [32]:
# WordNet POS tag converter
def wordnet_pos_tags(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Your custom preprocessing function
def txt_preprocess_pipeline(text):
    standard_txt = text.lower()
    clean_txt = re.sub(r'\n', ' ', standard_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = clean_txt.strip()
    tokens = word_tokenize(clean_txt)
    filtered_tokens_alpha = [word for word in tokens if word.isalpha() and not re.match(r'^[ivxlcdm]+$', word)]
    stop_words = stopwords.words('english')
    
    filtered_tokens_final = [w for w in filtered_tokens_alpha if not w in stop_words]
    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(filtered_tokens_final)
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]
    return lemma_tokens

In [34]:
# --- PROCESS DATAFRAME ---
sent_df['lemmas'] = sent_df['sentence'].apply(txt_preprocess_pipeline)

In [43]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Prepare your token lists from the 'lemmas' column
texts = sent_df['lemmas'].tolist()

NGRAM_TYPE = 'bigram'   # Change as needed

bigram = Phrases(texts, min_count=5, threshold=100)
trigram = Phrases(bigram[texts], threshold=100)

bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

if NGRAM_TYPE == 'bigram':
    texts_ngrams = make_bigrams(texts)
    print("Bigram transformation applied.")
elif NGRAM_TYPE == 'trigram':
    texts_ngrams = make_trigrams(texts)
    print("Trigram transformation applied.")
else:
    texts_ngrams = texts
    print("Unigram: no n-gram transformation applied.")

# Put ngrams back into your DataFrame, e.g., as 'lemmas_ngrams'
sent_df['lemmas_ngrams'] = texts_ngrams

print(sent_df[['sentence','lemmas', 'lemmas_ngrams']].head())


Bigram transformation applied.
                                            sentence  \
0  The military revenue for the fourth quarter wa...   
1  Well, we're expecting the military business to...   
2  So we're looking at about $6 million in milita...   
3  We expect our imaging physics business to gain...   
4  That will most likely be in the second half of...   

                                              lemmas  \
0  [military, revenue, fourth, quarter, million, ...   
1  [well, expect, military, business, fairly, lin...   
2            [look, million, military, fiscal, year]   
3  [expect, image, physic, business, gain, tracti...   
4               [likely, second, half, fiscal, year]   

                                       lemmas_ngrams  
0  [military, revenue, fourth, quarter, million, ...  
1  [well, expect, military, business, fairly, lin...  
2            [look, million, military, fiscal, year]  
3  [expect, image_physic, business, gain, tractio...  
4               [lik

In [45]:
operation_keywords_ngram = [
    "performance", "perform", "sales", "revenue", "earnings", "income", "profit", "loss", "expense", "ebt", "ebit", "ebitda",
    "depreciation", "amortization", "administrative", "cost_of_sales", "cost_of_goods_sold", "cogs", "tax",
    "impairment", "margin", "working_capital", "receivable", "payable", "inventory", "materials", "supplies", "bad_debt",
    "doubtful_account", "allowance", "collect", "accrual", "operating_cash_flow", "cash_flow_from_operations", "cash_flow_from_operating",
    "free_cash_flow", "bankruptcy", "chapter_7", "chapter_11", "operations", "operating", "operational", "product", "service", "technology",
    "contract", "overhead", "vendor", "supplier", "consumer", "customer", "client", "marketing", "order", "backlog", "advertising",
    "commission", "import", "export", "freight", "transportation", "utilities", "energy", "unit", "power", "compete", "competitive", "demand",
    "supply", "market", "business", "segment", "subsidy", "industry", "outsource", "promotion",  "compensation", "salary",
    "bonus", "grant", "award", "pension", "retirement", "health_care", "employee", "labor", "union", "director", "chairman", "president",
    "ceo", "cfo", "coo", "cio", "manager", "executive", "worker", "economic", "world", "country", "population", "environment", "government", "inflation","write-off",
]

investment_keywords_ngram = [
    "research", "develop", "r&d", "project", "invest", "expand", "dispose", "asset_sale", "asset_purchase", "spend", "capital_expenditure", "capex", "acquire", 
    "construct", "install", "capacity", "relocate", "remodel", "refresh", "overhaul", "upgrade", "maintain", "repair", "open",
    "close",  "pp&e", "subsidiary", "joint_venture", "jv", "partner", "license", "patent", "goodwill", 
]

finance_keywords_ngram = [
    "finance", "financing", "financial", "liquid", "borrow", "covenant", "debt", "debenture", "principal", "creditor", "liability",
    "equity", "capital_resource", "loan", "line_of_credit", "leverage", "fund", "repurchase", "stock_purchase", "share_purchase",
    "commercial_paper", "bank_credit", "pay_interest", "principal", "swap", "lease", "hedge", "dividend", "interest"
]

accounting_keywords_ngram = [
    "accounting", "gaap", "fas", "sfas", "fasb", "sec", "contingency", "record", "impairment_test", "financial_statement"
]


In [46]:
def contains_keyword(lemmas, keywords):
    return int(any(word in keywords for word in lemmas))

sent_df['operation'] = sent_df['lemmas'].apply(lambda x: contains_keyword(x, operation_keywords))
sent_df['investment'] = sent_df['lemmas'].apply(lambda x: contains_keyword(x, investment_keywords))
sent_df['finance'] = sent_df['lemmas'].apply(lambda x: contains_keyword(x, finance_keywords))
sent_df['accounting'] = sent_df['lemmas'].apply(lambda x: contains_keyword(x, accounting_keywords))

print(sent_df.head())

    filename                                           sentence     FLS_label  \
0  10098.txt  The military revenue for the fourth quarter wa...       Not FLS   
1  10098.txt  Well, we're expecting the military business to...  Specific FLS   
2  10098.txt  So we're looking at about $6 million in milita...       Not FLS   
3  10098.txt  We expect our imaging physics business to gain...  Specific FLS   
4  10098.txt  That will most likely be in the second half of...  Specific FLS   

                                              lemmas  operation  finance  \
0  [military, revenue, fourth, quarter, million, ...          1        0   
1  [well, expect, military, business, fairly, lin...          1        0   
2            [look, million, military, fiscal, year]          0        0   
3  [expect, image, physic, business, gain, tracti...          1        0   
4               [likely, second, half, fiscal, year]          0        0   

   accounting                                      lemma

# Sentiment Analysis 
Code reference
https://huggingface.co/yiyanghkust/finbert-tone

In [56]:
# Load the model and tokenizer
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Create the pipeline
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Analyze all sentences in your DataFrame (batched for speed)
sentences = sent_df['sentence'].tolist()
results = nlp(sentences)

# Convert Hugging Face's output to DataFrame columns
# If you want only the label, or both label and score, you can extract as below
sent_df['finbert_sentiment'] = [res['label'] for res in results]
sent_df['finbert_sentiment_score'] = [res['score'] for res in results]


print(sent_df[['sentence', 'finbert_sentiment', 'finbert_sentiment_score']].head())

Device set to use cpu


                                            sentence finbert_sentiment  \
0  The military revenue for the fourth quarter wa...           Neutral   
1  Well, we're expecting the military business to...           Neutral   
2  So we're looking at about $6 million in milita...           Neutral   
3  We expect our imaging physics business to gain...          Positive   
4  That will most likely be in the second half of...           Neutral   

   finbert_sentiment_score  
0                 0.998738  
1                 0.999335  
2                 0.999997  
3                 1.000000  
4                 0.999997  


# Fog index

In [58]:
pip install py-readability-metrics

Collecting py-readability-metrics
  Downloading py_readability_metrics-1.4.5-py3-none-any.whl.metadata (8.8 kB)
Downloading py_readability_metrics-1.4.5-py3-none-any.whl (26 kB)
Installing collected packages: py-readability-metrics
Successfully installed py-readability-metrics-1.4.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [62]:
from readability import Readability

def get_fog(text):
    # skip calculation for short texts
    if not isinstance(text, str) or len(text.split()) < 100:
        return None
    try:
        r = Readability(text)
        gf = r.gunning_fog()
        return gf.score
    except Exception as e:
        return None

# Example: assuming 'content' is the column with your text
df['fog_index'] = df['content'].apply(get_fog)

In [63]:
print(df)

     filename                                            content  fog_index
0   10098.txt  The military revenue for the fourth quarter wa...  13.541756
1   11069.txt  It's a couple of questions in there. Let me st...  12.231049
2    1157.txt  Let me start, Gregg, by saying that with respe...  12.742846
3   11681.txt  Sure, Simon. First of all, thank you for your ...  13.452771
4   11812.txt  Okay. Matt, I'll take that. And we're looking ...  11.566013
..        ...                                                ...        ...
95   8022.txt  Kathryn, we got some favorable mix, but also a...  13.701432
96   8525.txt  Right. Exactly. I think we've got to make a de...  10.156648
97   8982.txt  Yes. So just to -- going back to 2021, overall...   9.915541
98   9240.txt  Joe, that's absolutely correct. You analyzed i...  11.893798
99   9728.txt  I think it’s a blend of a couple of things. I ...  13.326679

[100 rows x 3 columns]
