# Environment Setup

In [1]:
!pip install datasets



In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Upload your files here:

In [3]:
import os
import ipywidgets as widgets

folder_path = '/content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

def upload_files(change):
    upload_widget = change['owner']
    for filename, file_info in upload_widget.value.items():
        content = file_info['content']
        with open(os.path.join(folder_path, filename), 'wb') as f:
            f.write(content)
        print(f'Uploaded {filename} to {folder_path}')

upload_widget = widgets.FileUpload(
    accept='',
    multiple=True
)

upload_widget.observe(upload_files, names='value')

display(upload_widget)

FileUpload(value={}, description='Upload', multiple=True)

Uploaded Brinker International Inc., Q3 2011 Earnings Call, Apr 27, 2011.pdf to /content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads
Uploaded Brinker International Inc., Q4 2008 Earnings Call, Aug-05-2008.pdf to /content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads
Uploaded Brinker International, Inc., Q1 2013 Earnings Call, Oct 24, 2012.pdf to /content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads
Uploaded Brinker International, Inc., Q1 2014 Earnings Call, Oct 23, 2013.pdf to /content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads


# Running the Main Engine

In [4]:
!pip install transformers
!pip install pandas scikit-learn nltk
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install tqdm
!pip install tensorflow torch
!pip install PyPDF2


import PyPDF2
import os
import pandas as pd
import re
import nltk
import spacy
import json
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from transformers import pipeline
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
pd.set_option('display.max_colwidth', None)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Import necessary libraries
from google.colab import data_table
from contextlib import redirect_stderr
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML, Markdown
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg as agg
import io
import re
import base64
import os
import warnings
warnings.filterwarnings('ignore')
import spacy
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
import functools

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

from collections import Counter

In [6]:
os.chdir(folder_path)
print("Current working directory:", os.getcwd())

Current working directory: /content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads


In [23]:

def extract_details_from_filename(filename):
    parts = filename.split(',')
    company = parts[0].strip()
    quarter_match = re.search(r"Q\d+", filename)
    quarter = quarter_match.group() if quarter_match else 'Unknown'
    year_match = re.search(r"\d{4}", filename)  # Updated to match only the 4-digit year
    year = year_match.group() if year_match else '0000'
    return company, quarter, int(year)

def load_texts_to_dataframe(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(('.txt', '.pdf', '.docx')):
            company, quarter, year = extract_details_from_filename(filename)
            file_path = os.path.join(folder_path, filename)

            if filename.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            elif filename.endswith('.pdf'):
                content = extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                content = extract_text_from_docx(file_path)
            else:
                continue

            data.append({'Company': company, 'Quarter': quarter, 'Year': year, 'File Name': filename, 'Text': content})
    return pd.DataFrame(data)

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ''
    return text.strip()

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text.strip()

eda_df = load_texts_to_dataframe(folder_path)

In [24]:
cutoff_regex = r"These materials have been prepared solely for information purposes based upon information generally available to the public\s+and from sources believed to be reliable.*"

eda_df['Text'] = eda_df['Text'].apply(lambda x: re.sub(cutoff_regex, '', x, flags=re.DOTALL))

copyright_regex = re.compile(r"""
COPYRIGHT\s+©\s+\d{4}\s+(by\s+)?S&P\s+Global\s+Market\s+Intelligence,\s+a\s+division\s+of\s+S&P\s+Global\s+Inc\.\s+All\s+rights\s+reserved|
spglobal\.com/marketintelligence(\s*\d+)?
""", re.VERBOSE | re.IGNORECASE)

def clean_text(text):
    text = re.sub(cutoff_regex, '', text, flags=re.DOTALL)
    text = copyright_regex.sub('', text)
    return text

eda_df['Text'] = eda_df['Text'].apply(clean_text)

def remove_text_before_second_call_participants(text):
    first_occurrence = text.find("Call Participants")
    second_occurrence = text.find("Call Participants", first_occurrence + 1)

    if first_occurrence != -1 and second_occurrence != -1:
        text = text[second_occurrence:]

    return text

eda_df['Text'] = eda_df['Text'].apply(remove_text_before_second_call_participants)

def remove_text_before_presentation(text):
    presentation_index = text.find("Presentation")

    if presentation_index != -1:
        text = text[presentation_index:]

    return text

eda_df['Text'] = eda_df['Text'].apply(remove_text_before_presentation)

In [25]:
def preprocess_text(text):
    text = re.sub(r'(?<!\.)\n(?!\n)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

eda_df['Text'] = eda_df['Text'].apply(preprocess_text)

rows = []
for _, row in eda_df.iterrows():
    sentences = sent_tokenize(row['Text'])
    for sentence in sentences:
        rows.append({'Company': row['Company'], 'Year': row['Year'], 'Quarter': row['Quarter'], 'Sentence': sentence})

sentence_df = pd.DataFrame(rows)
print(sentence_df.head())

               Company  Year  Quarter  \
0  Atkins Nutritionals     0  Unknown   
1  Atkins Nutritionals     0  Unknown   
2  Atkins Nutritionals     0  Unknown   
3  Atkins Nutritionals     0  Unknown   
4  Atkins Nutritionals     0  Unknown   

                                                                                                                      Sentence  
0                                         Presentation Operator Good morning, and welcome to the Conyers Park conference call.  
1                                                                                              Today's call is being recorded.  
2  [Operator Instructions] It is now my pleasure to turn the floor over to Dave West, Chief Executive Officer of Conyers Park.  
3                                                                                                                     David J.  
4                 West Executive Vice Chairman Thanks, operator, and welcome, everyone, to today's Conyers P

In [26]:
def chunk_sentences(dataframe, chunk_size=20):
    chunks = []
    num_chunks = len(dataframe) // chunk_size + (1 if len(dataframe) % chunk_size != 0 else 0)

    for i in range(num_chunks):
        start_index = i * chunk_size
        end_index = start_index + chunk_size
        chunk_sentences = dataframe.iloc[start_index:end_index]['Sentence'].tolist()
        chunk_paragraph = ' '.join(chunk_sentences)
        company = dataframe.iloc[start_index]['Company']
        year = dataframe.iloc[start_index]['Year']
        quarter = dataframe.iloc[start_index]['Quarter']
        chunks.append({'Company': company, 'Year': year, 'Quarter': quarter, 'Paragraph': chunk_paragraph})

    return pd.DataFrame(chunks)

chunked_df = chunk_sentences(sentence_df, chunk_size=20)
print(chunked_df.head())

               Company  Year  Quarter  \
0  Atkins Nutritionals     0  Unknown   
1  Atkins Nutritionals     0  Unknown   
2  Atkins Nutritionals     0  Unknown   
3  Atkins Nutritionals     0  Unknown   
4  Atkins Nutritionals     0  Unknown   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [27]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
from datasets import Dataset

tqdm.pandas()

# Initializing the summarizer pipeline with T5 model using PyTorch
device = 0  # Use 0 for the first GPU, -1 for CPU
summarizer = pipeline("summarization", model="t5-base", device=device, framework='pt')

# Converting the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(chunked_df)

# Function to summarize in batches of paragraphs
def summarize_batch(batch):
    summaries = summarizer(batch['Paragraph'], max_length=130, min_length=30, do_sample=False)
    return {'Summary': [summary['summary_text'] for summary in summaries]}

# Applying summarizer with batch processing
batch_size = 16
result_dataset = dataset.map(summarize_batch, batched=True, batch_size=batch_size)

result_df = result_dataset.to_pandas()

filtered_df = result_df.copy()

Map:   0%|          | 0/174 [00:00<?, ? examples/s]

Your max_length is set to 130, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)


In [28]:
!pip install transformers pandas scikit-learn nltk spacy gensim
!python -m spacy download en_core_web_sm

import os
import pandas as pd
import re
import nltk
import spacy
import json
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import numpy as np
from transformers import pipeline
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from nltk import pos_tag
from gensim.models import Word2Vec
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

pd.set_option('display.max_colwidth', None)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def split_into_sentences(df, text_column):
    rows = []
    for index, row in df.iterrows():
        sentences = sent_tokenize(row[text_column])
        for sentence in sentences:
            rows.append({
                'Company': row['Company'],
                'Year': row['Year'],
                'Quarter': row['Quarter'],
                'Sentence': sentence
            })
    return pd.DataFrame(rows)

# Splitting the "Summary" column of filtered_df into individual sentences
sentence_df = split_into_sentences(filtered_df, 'Summary')

print(sentence_df)

nlp = spacy.load('en_core_web_sm')

df = sentence_df.copy()

# Extracting the "Sentence" column
sentences = df['Sentence'].tolist()

def generate_ngrams(sentence, n):
    words = nltk.word_tokenize(sentence)
    return list(ngrams(words, n))

for n in range(1, 6):
    df[f'{n}-grams'] = pd.Series([[] for _ in range(len(df))])

for i, sentence in enumerate(sentences):
    for n in range(1, 6):
        df.at[i, f'{n}-grams'] = generate_ngrams(sentence, n)

def parse_ngrams(ngram_str):
    try:
        return ast.literal_eval(ngram_str)
    except Exception as e:
        print(f"Error parsing n-gram: {ngram_str} - {e}")
        return []

def filter_ngrams(ngrams):
    filtered_ngrams = []
    for ngram in ngrams:
        pos_tags = pos_tag(ngram)
        pos_sequence = [tag for word, tag in pos_tags]

        if len(ngram) == 1:
            if pos_sequence[0] in ['NN', 'NNS', 'NNP', 'NNPS']:
                filtered_ngrams.append(ngram)

        elif len(ngram) == 2:
            if (ngram[0].istitle() and ngram[1].istitle()) or \
               (pos_sequence == ['JJ', 'NN']) or (pos_sequence == ['ADJ', 'NN']) or (pos_sequence == ['VBD', 'NN']) or \
               (pos_sequence == ['JJ', 'NNS']) or (pos_sequence == ['NNS', 'VBD']) or (pos_sequence == ['NN', 'VBD']) or (pos_sequence == ['VBD', 'NNS']):
                filtered_ngrams.append(ngram)

        elif len(ngram) == 3:
            if pos_sequence in [['JJ', 'NN', 'NN'], ['NN', 'NN', 'NN'], ['NN', 'IN', 'NN'], ['JJ', 'JJ', 'NN'], ['JJ', 'NN', 'NNS'], ['NN', 'ADP', 'NN']]:
                filtered_ngrams.append(ngram)

        elif len(ngram) == 4:
            if pos_sequence in [['NN', 'IN', 'JJ', 'NN'], ['NN', 'NN', 'NN', 'NN'], ['JJ', 'JJ', 'NN', 'NN'], ['JJ', 'NN', 'NNS', 'NN'], ['NN', 'NNS', 'IN', 'NN'], ['NNS', 'VBD', 'DT', 'NN']]:
                filtered_ngrams.append(ngram)

        elif len(ngram) == 5:
            if pos_sequence in [['NN', 'IN', 'JJ', 'NN', 'NN'], ['JJ', 'JJ', 'NN', 'NN', 'NN'], ['JJ', 'NN', 'NNS', 'NN', 'NN']]:
                filtered_ngrams.append(ngram)

    return filtered_ngrams

for n in range(1, 6):
    col_name = f'{n}-grams'
    df[f'Filtered_{n}-grams'] = df[col_name].apply(filter_ngrams)

relevant_sent = df.copy()


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using device: cuda
                   Company  Year  Quarter  \
0      Atkins Nutritionals     0  Unknown   
1      Atkins Nutritionals     0  Unknown   
2      Atkins Nutritionals     0  Unknown   
3      Atkins Nutritionals     0  Unknown   
4      Atkins Nutritionals     0  Unknown   
..                     ...   ...      ...   
484  Brinker International  2014       Q1   
485  Brinker International  2014       Q1   
486  Brinker International  2014       Q1   
487  Brinker International  2014       Q1   
488  Brinker International  2014       Q1   

                                                                                                    Sentence  
0                                                    today's call is prerecorded, and there will be no Q&A .  
1    we are excited to discuss with you our plans to consummate a transaction with Atkins Nutritionals, Inc.  
2             the proposed merger between conyers park and atkins nutritionals will take place on april 11 

In [29]:
sentences_tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
word2vec_model = Word2Vec(sentences_tokenized, vector_size=100, window=5, min_count=1, workers=4)

def ngram_to_vector(ngram, model):
    vectors = [model.wv[word] for word in ngram if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

def similarity(ngram1, ngram2, model):
    vec1 = ngram_to_vector(ngram1, model)
    vec2 = ngram_to_vector(ngram2, model)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def filter_redundant_ngrams(ngrams, model, threshold=0.8):
    unique_ngrams = []
    for ngram in ngrams:
        if all(similarity(ngram, unique_ngram, model) < threshold for unique_ngram in unique_ngrams):
            unique_ngrams.append(ngram)
    return unique_ngrams

for n in range(1, 6):
    col_name = f'{n}-grams'
    df[f'Filtered_{n}-grams'] = df[col_name].apply(filter_ngrams)
    df[f'Unique_{n}-grams'] = df[f'Filtered_{n}-grams'].apply(lambda x: filter_redundant_ngrams(x, word2vec_model))

def ner_and_filter(df):
    for index, row in df.iterrows():
        unique_2_grams = row['Unique_2-grams']
        unique_1_grams = row['Unique_1-grams']

        unique_2_grams_to_remove = []
        unique_1_grams_to_remove = []

        for ngram in unique_2_grams:
            ngram_str = " ".join(ngram)
            doc = nlp(ngram_str)

            if any(ent.label_ == "PERSON" for ent in doc.ents):
                for word in ngram:
                    if word in [w[0] for w in unique_1_grams]:
                        unique_2_grams_to_remove.append(ngram)
                        unique_1_grams_to_remove.append(word)

        df.at[index, 'Unique_2-grams'] = [ngram for ngram in unique_2_grams if ngram not in unique_2_grams_to_remove]
        df.at[index, 'Unique_1-grams'] = [word for word in unique_1_grams if word[0] not in unique_1_grams_to_remove]
    return df

df = ner_and_filter(df)

def remove_ngrams_in_larger_ngrams(df, smaller_n, larger_n):
    for index, row in df.iterrows():
        smaller_ngrams = row[f'Unique_{smaller_n}-grams']
        larger_ngrams = row[f'Unique_{larger_n}-grams']

        smaller_ngrams_to_remove = []

        for smaller_ngram in smaller_ngrams:
            if any(set(smaller_ngram).issubset(larger_ngram) for larger_ngram in larger_ngrams):
                smaller_ngrams_to_remove.append(smaller_ngram)

        df.at[index, f'Unique_{smaller_n}-grams'] = [ngram for ngram in smaller_ngrams if ngram not in smaller_ngrams_to_remove]
    return df

for smaller_n, larger_ns in [(1, [2, 3, 4, 5]), (2, [3, 4, 5]), (3, [4, 5]), (4, [5])]:
    for larger_n in larger_ns:
        df = remove_ngrams_in_larger_ngrams(df, smaller_n, larger_n)

def remove_verbs_adjs_adv_pron_aux_in_1grams(df):
    for index, row in df.iterrows():
        unique_1_grams = row['Unique_1-grams']

        unique_1_grams_to_remove = []

        for word_tuple in unique_1_grams:
            word = word_tuple[0]
            doc = nlp(word)
            if doc[0].pos_ in ['VERB', 'ADJ', 'ADV', 'PRON', 'AUX']:
                unique_1_grams_to_remove.append(word_tuple)

        df.at[index, 'Unique_1-grams'] = [word for word in unique_1_grams if word not in unique_1_grams_to_remove]
    return df

def remove_adv_pron_aux_in_2grams(df):
    for index, row in df.iterrows():
        unique_2_grams = row['Unique_2-grams']

        unique_2_grams_to_remove = []

        for ngram in unique_2_grams:
            if any(nlp(word)[0].pos_ in ['ADV', 'PRON', 'AUX'] for word in ngram):
                unique_2_grams_to_remove.append(ngram)

        df.at[index, 'Unique_2-grams'] = [ngram for ngram in unique_2_grams if ngram not in unique_2_grams_to_remove]
    return df

def remove_rows_with_company_in_1grams(df):
    rows_to_remove = []
    for index, row in df.iterrows():
        unique_1_grams = [word[0].lower() for word in row['Unique_1-grams']]
        company_words = str(row['Company']).lower().split()
        if any(word in unique_1_grams for word in company_words):
            rows_to_remove.append(index)
    df = df.drop(rows_to_remove).reset_index(drop=True)
    return df

df = remove_rows_with_company_in_1grams(df)

# Removing rows where Unique_1-grams is the only column with value unless there are more than two 1-grams in the column
def remove_rows_with_only_1grams(df):
    rows_to_remove = []
    for index, row in df.iterrows():
        if len(row['Unique_1-grams']) > 0 and all(len(row[f'Unique_{n}-grams']) == 0 for n in range(2, 6)):
            if len(row['Unique_1-grams']) <= 2:
                rows_to_remove.append(index)
    df = df.drop(rows_to_remove).reset_index(drop=True)
    return df

# Removing rows where Unique_1-grams is the only column with value unless there are more than two 1-grams
df = remove_rows_with_only_1grams(df)

def remove_rows_with_specific_conditions_in_1grams(df):
    rows_to_remove = []
    for index, row in df.iterrows():
        unique_1_grams = [word[0].lower() for word in row['Unique_1-grams']]
        if 'lot' in unique_1_grams or any('%' in word for word in unique_1_grams) or any(re.search(r'\d', word) for word in unique_1_grams):
            rows_to_remove.append(index)
    df = df.drop(rows_to_remove).reset_index(drop=True)
    return df

df = remove_rows_with_specific_conditions_in_1grams(df)

df = remove_verbs_adjs_adv_pron_aux_in_1grams(df)
df = remove_adv_pron_aux_in_2grams(df)
df = remove_rows_with_company_in_1grams(df)
df = remove_rows_with_only_1grams(df)
df = remove_rows_with_specific_conditions_in_1grams(df)


In [30]:
penalty_words = ['%', 'A', 'Administrative', 'Administratives', 'April', 'August', 'B', 'C', 'CEO', 'Chief', 'Chiefs', 'D', 'Day', 'Days', 'December', 'E', 'EBITDA', 'EBITDAS', 'EMEA', 'EMEAS', 'EP', 'EPS', 'Eighth', 'Eighths', 'F', 'February', 'Fifth', 'Fifths', 'First', 'Food', 'Foods', 'Form', 'Forms', 'Fourth', 'Fourths', 'Friday', 'Fridays', 'G', 'Good', 'Goods', 'Great', 'Greats', 'H', 'I', 'J', 'January', 'July', 'June', 'K', 'L', 'M', 'March', 'May', 'Monday', 'Mondays', 'Morning', 'Mornings', 'N', 'Ninth', 'Ninths', 'November', 'O', 'October', 'Officer', 'Officers', 'P', 'Project', 'Projects', 'Q', 'Q2', 'Q1', 'Q3', 'Q4', 'R', 'S', 'SOME', 'Saturday', 'Saturdays', 'Second', 'Seconds', 'September', 'Septembers', 'Seventh', 'Sevenths', 'Sixth', 'Sixths', 'Sunday', 'Sundays', 'T', 'Tenth', 'Tenths', 'Third', 'Thirds', 'Thursday', 'Thursdays', 'Tuesday', 'Tuesdays', 'U', 'US', 'V', 'VS', 'W', 'WS', 'Wednesday', 'Wednesdays', 'X', 'Y', 'Z', 'about', 'abouts', 'accelerated', 'accounted', 'accounting', 'accountings', 'accretion', 'accretions', 'acquisition', 'acquisitions', 'action', 'actions', 'active', 'actives', 'additional', 'additionals', 'address', 'addresses', 'administrative', 'administratives', 'advisor', 'advisors', 'agenda', 'agendas', 'agendum', 'algorithm', 'algorithms', 'allocation', 'allocations', 'allow', 'allows', 'amount', 'amounts', 'analyst', 'analysts', 'analytic', 'analytics', 'annual', 'annuals', 'answer', 'answers', 'aoi', 'aois', 'appropriate', 'appropriates', 'area', 'areas', 'as', 'aspect', 'aspects', 'asset', 'assets', 'at', 'attach', 'ats', 'august', 'augusts', 'average', 'averages', 'award', 'awards', 'balance', 'balances', 'base', 'bases', 'basi', 'basic', 'basics', 'basis', 'become', 'becomes', 'benefit', 'benefits', 'bifurcated', 'big', 'bill', 'bills', 'bit', 'bits', 'blocking', 'blockings', 'bookkeeper', 'bookkeepers', 'bottom', 'bottoms', 'brand', 'brands', 'brand-new', 'bright', 'broad', 'broads', 'build', 'builds', 'business', 'businesses', 'by', 'cadence', 'cadences', 'calendar', 'calendars', 'call', 'calls', 'came', 'cames', 'capabilities', 'capability', 'capacities', 'capacity', 'capex', 'capexes', 'capital', 'capitals', 'case', 'cases', 'cash', 'categories', 'category', 'certain', 'certains', 'chain', 'chains', 'chairman', 'chairmen', 'change', 'changed', 'changes', 'channel', 'channels', 'chef', 'chefs', 'chief', 'chiefs', 'china', 'chinas', 'chunk', 'chunks', 'clarities', 'clarity', 'combination', 'combinations', 'come', 'comes', 'comment', 'comments', 'commissaries', 'commissary', 'commodities', 'commodity', 'comp', 'companies', 'company', 'comparison', 'comparisons', 'competition', 'competitions', 'competitive', 'competitives', 'component', 'components', 'comprehensive', 'comprehensives', 'comps', 'concentration', 'concentrations', 'concept', 'concepts', 'conclusion', 'conclusions', 'conference', 'conferences', 'confidence', 'confidences', 'confidential', 'confidentials', 'consecutive', 'consecutives', 'consistent', 'consistents', 'consolidated', 'consolidation', 'consolidations', 'constant', 'constants', 'construction', 'constructions', 'consumer', 'consumers', 'consumption', 'consumptions', 'contact', 'contacts', 'continued', 'contribution', 'contributions', 'contributor', 'contributors', 'context', 'copies', 'copy', 'corporate', 'corporates', 'cost', 'costs', 'couple', 'course', 'courses', 'credit', 'credits', 'cross', 'crosses', 'culinaries', 'culinary', 'cumulative', 'cumulatives', 'currencies', 'currency', 'current', 'currents', 'cycle', 'cycles', 'data', 'database', 'dKB', 'dKBs', 'day', 'date', 'dates', 'deal', 'deals', 'debt', 'debts', 'decline', 'declined', 'declines', 'decreased', 'deep', 'defensible', 'degree', 'degrees', 'delivered', 'deploy', 'deploys', 'depreciation', 'depreciations', 'desire', 'desires', 'detail', 'details', 'development', 'developments', 'difference', 'differences', 'different', 'difficult', 'digit', 'digits', 'dilutive', 'dimension', 'dimensions', 'discern', 'discerns', 'discipline', 'disciplined', 'disciplines', 'discounting', 'discountings', 'discrete', 'discretes', 'discussion', 'discussions', 'distant', 'distants', 'distributor', 'distributors', 'diversified', 'dividend', 'dividends', 'division', 'divisions', 'dollar', 'dollars', 'double', 'doubles', 'double-digit', 'drive', 'driver', 'drivers', 'drives', 'drove', 'droves', 'earning', 'earnings', 'EBITDA', 'economic', 'economics', 'edge', 'edges', 'effect', 'effective', 'effectiveness', 'effectives', 'effects', 'efficiencies', 'efficiency', 'effort', 'efforts', 'element', 'elements', 'emphases', 'emphasi', 'emphasis', 'enabler', 'enablers', 'encouraged', 'encourageds', 'end', 'ends', 'energy' 'enhanced', 'enhanceds', 'enterprise', 'entire', 'enterprises', 'environment', 'environments', 'equities', 'equity', 'equivalent', 'equivalents', 'estimate', 'estimates', 'evaluation', 'evaluations', 'event', 'events', 'exact', 'exacts', 'example', 'examples', 'exceed', 'exceeds', 'exchange', 'exclude', 'excludes', 'executed', 'executeds', 'execution', 'executions', 'executive', 'executives', 'excite', 'expanded', 'expansion', 'expansions', 'expectation', 'expectations', 'expense', 'expenses', 'experiential', 'experientials', 'extension', 'extensions', 'facilities', 'facility', 'fact', 'factor', 'factors', 'facts', 'fall', 'falls', 'few', 'fews', 'final', 'finals', 'finance', 'finances', 'financial', 'financials', 'fine', 'fines', 'fiscal', 'fiscals', 'flat', 'flats', 'flow', 'flows', 'focus', 'focuses', 'food', 'foods', 'foodservice', 'foodservices', 'for', 'forefront', 'foreign', 'forefronts', 'format', 'formats', 'former', 'formers', 'fors', 'fourth', 'fourths', 'free', 'frees', 'frequent', 'frequents', 'from', 'froms', 'fueling', 'fuelings', 'full', 'fulls', 'function', 'functions', 'fund', 'funds', 'future', 'futures', 'gain', 'gained', 'gaineds', 'gains', 'general', 'generals', 'generated', 'goal', 'goals', 'good', 'goods', 'great', 'greats', 'grew', 'grews', 'gros', 'gross', 'grosses', 'ground', 'grounds', 'group', 'groups', 'growth', 'growths', 'guidance', 'guidances', 'had', 'half', 'halves', 'headlines', 'headline', 'hedging', 'hedgings', 'help', 'helps', 'high', 'highs', 'hour', 'hours' 'hung', 'idea', 'ideas', 'impact', 'impacts', 'important', 'importants', 'improvement', 'improvements', 'in', 'incentive', 'incentives', 'income', 'incomes', 'increase', 'increased', 'increaseds', 'increases', 'incremental', 'incrementals', 'independent', 'independents', 'industrial', 'industrials', 'industries', 'industry', 'inefficient', 'inefficients', 'inflation', 'inflations', 'initiative', 'initiatives', 'innovation', 'innovations', 'innovative', 'innovatives', 'ins', 'insight', 'insights', 'integrated', 'integrateds', 'integration', 'integrations', 'intention', 'intentions', 'interaction', 'interactions', 'interest', 'interests', 'introduced', 'introduceds', 'introduction', 'introductions', 'investment', 'investments', 'investor', 'investors', 'item', 'items', 'january', 'job', 'jobs', 'joint', 'joints', 'key', 'keyed', 'keyeds', 'keys', 'kind', 'kinds', 'label', 'labels', 'labor', 'labors', 'lack', 'lacks', 'large', 'larges', 'last', 'lasts', 'launch', 'launched', 'launcheds', 'launches', 'leader', 'leaders', 'level', 'levels', 'leveraging', 'leveragings', 'lieu', 'lieus', 'lighter', 'limit', 'limits', 'line', 'lines', 'little', 'littles', 'local', 'locals', 'logistic', 'logistics', 'long', 'long-term', 'longs', 'loping', 'lopings', 'los', 'loser', 'losers', 'loss', 'losses', 'lot', 'lots', 'low', 'lows', 'magnitude', 'magnitudes', 'major', 'majors', 'manage', 'management', 'managements', 'manages', 'manufacture', 'manufactures', 'many', 'march', 'margin', 'marginal', 'marginals', 'margins', 'market', 'marketing', 'marketings', 'markets', 'meaningful', 'meaningfuls', 'measure', 'measures', 'media', 'medias', 'memories', 'memory', 'mentor', 'mentors', 'metric', 'metrics', 'mix', 'mixes', 'model', 'models', 'moment', 'moments', 'momentum', 'momentums', 'month', 'months', 'morning', 'mornings', 'multiunit', 'multiunits', 'named', 'nameds', 'nascent', 'national', 'nationals', 'near-term', 'need', 'needs', 'negative', 'negatives', 'net', 'nets', 'network', 'networks', 'new', 'news', 'next', 'nexts', 'normalized', 'normalizeds', 'non-GAAP', 'non-executive', 'number', 'numbers', 'numerou', 'numerous', 'numerouses', 'objective', 'objectives', 'of', 'officer', 'officers', 'offset', 'offsets', 'ofs', 'okay', 'okays', 'on', 'ons', 'on-trend', 'on-brand', 'operate', 'operates', 'operating', 'operatings', 'operational', 'operationals', 'opportunities', 'opportunity', 'opposite', 'opposites', 'option', 'options', 'original', 'originals', 'other', 'others', 'outlook', 'outlooks', 'over', 'overall', 'overalls', 'overs', 'own', 'owns', 'page', 'pages', 'paid', 'paids', 'part', 'particular', 'particulars', 'parties', 'partner', 'partners', 'partnership', 'partnerships', 'parts', 'party', 'past', 'pasts', 'pattern', 'patterns', 'penetration', 'penetrations', 'people', 'peoples', 'per', 'percent', 'percentage', 'percentages', 'percents', 'performance', 'performances', 'performer', 'performers', 'period', 'periods', 'pers', 'person', 'perspective', 'perspectives', 'phenomena', 'phenomenon', 'piece', 'pieces', 'pipeline', 'pipelines', 'place', 'places', 'plan', 'planning', 'plannings', 'plans', 'platform', 'platforms', 'please', 'pleases', 'point', 'points', 'portfolio', 'portfolios', 'portion', 'portions', 'position', 'positions', 'positive', 'positives', 'posted', 'posteds', 'preeminent', 'preeminents', 'premier', 'premiers', 'premium', 'premiums', 'president', 'presidents', 'pretty', 'prevailing', 'prevailings', 'previou', 'previous', 'previouses', 'price', 'prices', 'pricing', 'pricings', 'primaries', 'primary', 'principle', 'principles', 'prior', 'priorities', 'priority', 'prior', 'priors', 'private', 'privates', 'problem', 'problems', 'proces', 'process', 'processes', 'producer', 'producers', 'product', 'production', 'productions', 'products', 'profile', 'profiles', 'profit', 'profitabilities', 'profitability', 'profits', 'progres', 'progress', 'progresses', 'project', 'projects', 'promotion', 'promotions', 'properties', 'property', 'proposition', 'propositions', 'pullback', 'pullbacks', 'purpose', 'purposes', 'put', 'qualities', 'quality', 'quantities', 'quantity', 'quarter', 'quarterlies', 'quarterly', 'quarters', 'question', 'questions', 'queue', 'queues', 'range', 'ranges', 'rate', 'rates', 'rating', 'ratings', 'rationalization', 'rationalizations', 'real', 'realized', 'realizeds', 'reals', 'recapitalization', 'recapitalizations', 'reconciliation', 'reconciliations', 'reconsider', 'reconsiders', 'record', 'records', 'reduction', 'reductions', 'refer', 'reference', 'references', 'refers', 'reflect', 'reflects', 'regional', 'regionals', 'reinvest', 'reinvestment', 'reinvestments', 'reinvests', 'reiterate', 'reiterates', 'related', 'relateds', 'relationship', 'relationships', 'relative', 'relatives', 'remain', 'remains', 'repeat', 'repeats', 'replay', 'replays', 'report', 'reports', 'research', 'researches', 'resource', 'resources', 'restaurant', 'restaurants', 'result', 'resulted', 'resulteds', 'results', 'revenue', 'revenues', 'reversal', 'reversals', 'right', 'rights', 'risk', 'risks', 'robust', 'robusts', 'role', 'roles', 'row', 'runway', 'runways', 'sale', 'sales', 'same', 'sames', 'saving', 'savings', 'saw', 'saws', 'schedule', 'schedules', 'score', 'scores', 'search', 'searches', 'second', 'seconds', 'segment', 'segments', 'senior', 'seniors', 'sequential', 'sequentials', 'several', 'severals', 'share', 'shareholder', 'shareholders', 'shares', 'shift', 'shifts', 'short', 'shorts', 'side', 'sides', 'sight', 'sights', 'significant', 'significants', 'single-digit', 'similar', 'similars', 'since', 'sinces', 'sizable', 'sizables', 'size', 'sizes', 'sku', 'skus', 'slowdown', 'slowdowns', 'small', 'smalls', 'sold', 'solds', 'solid', 'solids', 'some', 'sort', 'sorts', 'specific', 'specifics', 'spend', 'spends', 'spot', 'spots', 'stage', 'stages', 'standard', 'standards', 'standpoint', 'standpoints', 'state', 'statement', 'statements', 'states', 'store', 'stores', 'stories', 'story', 'straight', 'straight-line', 'strategic', 'strategics', 'strategies', 'strategy', 'street', 'streets', 'strength', 'strengths', 'strong', 'strongs', 'structure', 'structures', 'such', 'suches', 'supplier', 'suppliers', 'supplies', 'supply', 'support', 'supported', 'supporteds', 'supports', 'survey', 'surveys', 'sustained', 'sustaineds', 'sweet', 'tactical', 'tacticals', 'tad', 'tads', 'take', 'takes', 'talent', 'talents', 'target', 'targets', 'tax', 'taxes', 'team', 'teams', 'tender', 'tenders', 'tenth', 'tenths', 'term', 'terms', 'that', 'thing', 'things', 'thinking', 'thinkings', 'third', 'thirds', 'those', 'thought', 'thoughts', 'threat', 'threats', 'ticket', 'tickets', 'time', 'times', 'timing', 'timings', 'today', 'todays', 'tool', 'tools', 'top', 'tops', 'total', 'totals', 'tough', 'toughs', 'toward', 'towards', 'track', 'tracks', 'trade', 'trades', 'trading', 'tradings', 'trajectories', 'trajectory', 'transaction', 'transactions', 'transformational', 'transformationals', 'transition', 'transitions', 'tremendou', 'tremendous', 'tremendouses', 'triple', 'triples', 'tuesday', 'tuesdays', 'unattractive', 'unattractives', 'underlying', 'underlyings', 'underway', 'underways', 'uneven', 'unevens', 'unit', 'units', 'unlock', 'unlocks', 'unmeasured', 'unmeasureds', 'until', 'untils', 'unusual', 'unusuals', 'update', 'updates', 'upper', 'uppers', 'urgencies', 'urgency', 'utilization', 'utilizations', 'valuable', 'valuables', 'value', 'values', 'variou', 'various', 'variouses', 'velocities', 'velocity', 'versu', 'versus', 'versuses', 'vertical', 'verticals', 'viabilities', 'viability', 'vice', 'vices', 'visibilities', 'visibility', 'volatilities', 'volatility', 'volume', 'volumes', 'wa', 'was', 'wave', 'waves', 'way', 'ways', 'we', 'weak', 'weaknes', 'weakness', 'weaknesses', 'weaks', 'week', 'weeks', 'weight', 'weights', 'well', 'wells', 'went', 'wents', 'were', 'whole', 'wholes', 'willingnes', 'willingness', 'willingnesses', 'with', 'withs', 'work', 'works', 'world', 'worlds', 'year', 'year-to-date','year-on-year', 'year-over-year', 'years', 'leadership', 'marketplace', 'personal', 'integrity']

def calculate_basis_score(ngram):
    return len(ngram)

def apply_penalties(ngram, penalty_words):
    penalty = sum(1 for word in ngram if word.lower() in penalty_words)
    if any(char.isdigit() for word in ngram for char in word):
        penalty += 1
    return penalty

def adjust_and_filter_ngrams(ngrams, penalty_words):
    adjusted_ngrams = []
    for ngram in ngrams:
        basis_score = calculate_basis_score(ngram)
        penalty = apply_penalties(ngram, penalty_words)
        adjusted_score = basis_score - penalty
        if adjusted_score > 0:
            adjusted_ngrams.append(ngram)
    return adjusted_ngrams

for n in range(1, 6):
    col_name = f'Unique_{n}-grams'
    df[col_name] = df[col_name].apply(lambda x: adjust_and_filter_ngrams(x, penalty_words))

# Removing unique 2-grams containing words from the "Company" column
def remove_2grams_with_company_words(df):
    for index, row in df.iterrows():
        company_words = str(row['Company']).lower().split()
        unique_2_grams = row['Unique_2-grams']
        filtered_2_grams = [ngram for ngram in unique_2_grams if not any(word.lower() in company_words for word in ngram)]
        df.at[index, 'Unique_2-grams'] = filtered_2_grams
    return df

# Removing rows where unique 2-gram, 3-gram, 4-gram, 5-gram columns are blank but 1-gram column is not AND has only one word
def remove_rows_with_only_one_1gram(df):
    rows_to_remove = []
    for index, row in df.iterrows():
        if len(row['Unique_1-grams']) == 1 and all(len(row[f'Unique_{n}-grams']) == 0 for n in range(2, 6)):
            rows_to_remove.append(index)
    df = df.drop(rows_to_remove).reset_index(drop=True)
    return df

# Removing rows where all unique n-gram column values are blank
def remove_rows_with_all_blank_ngrams(df):
    rows_to_remove = []
    for index, row in df.iterrows():
        if all(len(row[f'Unique_{n}-grams']) == 0 for n in range(1, 6)):
            rows_to_remove.append(index)
    df = df.drop(rows_to_remove).reset_index(drop=True)
    return df

# Removing rows where "Sentence" contains "?" or ":" or the word "says" or "asks"
def remove_rows_with_specific_phrases(df):
    rows_to_remove = []
    for index, row in df.iterrows():
        sentence = row['Sentence'].lower()
        if '?' in sentence or ':' in sentence or 'says' in sentence or 'asks' in sentence:
            rows_to_remove.append(index)
    df = df.drop(rows_to_remove).reset_index(drop=True)
    return df


df = remove_2grams_with_company_words(df)
df = remove_rows_with_only_one_1gram(df)
df = remove_rows_with_all_blank_ngrams(df)
df = remove_rows_with_specific_phrases(df)

def ngrams_in_row_to_string(df):
    ngrams_string_list = []
    for index, row in df.iterrows():
        combined_ngrams = []
        for n in range(1, 6):
            col_name = f'Unique_{n}-grams'
            combined_ngrams.extend([" ".join(ngram) for ngram in row[col_name]])
        ngrams_string_list.append(" ".join(combined_ngrams))
    df['Combined_Ngrams'] = ngrams_string_list
    return df

df = ngrams_in_row_to_string(df)

ngrams_string_list = df['Combined_Ngrams'].tolist()

sentences_tokenized = [nltk.word_tokenize(sentence) for sentence in ngrams_string_list]
word2vec_model = Word2Vec(sentences_tokenized, vector_size=100, window=5, min_count=1, workers=4)

def ngram_string_to_vector(ngram_string, model):
    words = ngram_string.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

def similarity_string(ngram1, ngram2, model):
    vec1 = ngram_string_to_vector(ngram1, model)
    vec2 = ngram_string_to_vector(ngram2, model)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def filter_redundant_ngrams_strings(ngram_strings, model, threshold=0.8):
    unique_ngrams = []
    for ngram in ngram_strings:
        if all(similarity_string(ngram, unique_ngram, model) < threshold for unique_ngram in unique_ngrams):
            unique_ngrams.append(ngram)
    return unique_ngrams

filtered_ngrams_string_list = filter_redundant_ngrams_strings(ngrams_string_list, word2vec_model)

filtered_df = pd.DataFrame(columns=['Company', 'Year', 'Quarter', 'Keyword', 'Sentence'])

for ngram_string in filtered_ngrams_string_list:
    matching_rows = df[df['Combined_Ngrams'].str.contains(re.escape(ngram_string))]
    for _, row in matching_rows.iterrows():
        new_row = pd.DataFrame({
            'Company': [row['Company']],
            'Year': [row['Year']],
            'Quarter': [row['Quarter']],
            'Keyword': [ngram_string],
            'Sentence': [row['Sentence']],
        })
        filtered_df = pd.concat([filtered_df, new_row], ignore_index=True)

print(filtered_df)

all_keywords = filtered_df['Keyword'].tolist()
unique_keywords = list(set(all_keywords))


                   Company  Year  Quarter  \
0      Atkins Nutritionals     0  Unknown   
1      Atkins Nutritionals     0  Unknown   
2      Atkins Nutritionals     0  Unknown   
3      Atkins Nutritionals     0  Unknown   
4      Atkins Nutritionals     0  Unknown   
..                     ...   ...      ...   
217  Brinker International  2014       Q1   
218  Brinker International  2014       Q1   
219  Brinker International  2014       Q1   
220  Brinker International  2014       Q1   
221  Brinker International  2014       Q1   

                                 Keyword  \
0    original inventor protein-rich idea   
1     affordability asset-light business   
2             attributes IPO last summer   
3                        retail business   
4                 core snacking business   
..                                   ...   
217       tough backdrop amount of check   
218                   individual company   
219                    individual brands   
220             Non

In [31]:

sentences_tokenized = [nltk.word_tokenize(keyword) for keyword in unique_keywords]
word2vec_model = Word2Vec(sentences_tokenized, vector_size=100, window=5, min_count=1, workers=4)

def keyword_to_vector(keyword, model):
    words = keyword.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

def similarity_keyword(keyword1, keyword2, model):
    vec1 = keyword_to_vector(keyword1, model)
    vec2 = keyword_to_vector(keyword2, model)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def filter_redundant_keywords(keywords, model, threshold=0.8):
    unique_keywords = []
    for keyword in keywords:
        if all(similarity_keyword(keyword, unique_keyword, model) < threshold for unique_keyword in unique_keywords):
            unique_keywords.append(keyword)
    return unique_keywords

filtered_keywords = filter_redundant_keywords(unique_keywords, word2vec_model)

print(filtered_keywords)

filtered_keywords_df = pd.DataFrame(filtered_keywords, columns=['Keyword'])

def calculate_basis_score(keyword):
    return len(keyword.split())

def apply_penalties(keyword, penalty_words):
    words = keyword.split()
    penalty = sum(1 for word in words if word.lower() in penalty_words)
    if any(char.isdigit() for word in words for char in word):
        penalty += 1
    return penalty

def calculate_adjusted_score(keyword, penalty_words):
    basis_score = calculate_basis_score(keyword)
    penalty = apply_penalties(keyword, penalty_words)
    adjusted_score = basis_score - penalty
    return adjusted_score

filtered_keywords_df['Adjusted_Score'] = filtered_keywords_df['Keyword'].apply(lambda x: calculate_adjusted_score(x, penalty_words))
final_keywords_df = filtered_keywords_df[filtered_keywords_df['Adjusted_Score'] > 2]

print(final_keywords_df)


['new menu platforms introduction of new menu', 'clean snacking consumer desires for clean', 'point-of-sale December back-office systems', 'border Mexican food world ’ s', 'combos air with new lunch news', 'dinner leverage grille space unique position position in bar', 'return on investment', 'chiliheads raised help children with cancer', 'message one-hit wonder', 'investment in reimage', 'relations treasurer', 'franchisees closures Grill company-owned restaurants', 'Chili favorites messages', 'outperform peer', 'inflationary pressures recent history', 'Maggiana sites', 'franchise fees', 'early third quarter', 'lunch diners category for full-service', 'attributes IPO last summer', 'Team Service', 'Border education members hospitality recertification program', 'phone wonderful day', 'team member engagement', 'zero ZOG', 'guests right path', 'individual brands', 'chili power demonstrate impressive year-over-year', 'progress made', 'stock stock-based compensation expense', 'bulk reimage i

In [32]:
# Calling the summarization pipeline
summarizer = pipeline('summarization', model='t5-base', device=device)
grouped = filtered_df.groupby(['Company', 'Year', 'Quarter'])

def split_text(text, sentences_per_chunk=5):
    sentences = nltk.sent_tokenize(text)
    chunks = [' '.join(sentences[i:i+sentences_per_chunk]) for i in range(0, len(sentences), sentences_per_chunk)]
    return chunks

all_summaries = []
summary_rows = []

for name, group in grouped:
    company, year, quarter = name
    combined_text = ' '.join(group['Sentence'].tolist())
    text_chunks = split_text(combined_text)
    chunk_summaries = [summarizer(chunk, max_length=80, min_length=30, do_sample=False)[0]['summary_text'] for chunk in text_chunks]
    combined_summary = ' '.join(chunk_summaries)

    all_summaries.append(combined_summary)
    summary_rows.append([company, year, quarter, combined_summary])

summary_df = pd.DataFrame(summary_rows, columns=['Company', 'Year', 'Quarter', 'Summary'])
final_summary = '\n\n'.join(all_summaries)

print(summary_df)


Your max_length is set to 80, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 80, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 80, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 80, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


                                                                                                 Company  \
0                                                                                    Atkins Nutritionals   
1                                                                                  Brinker International   
2                                                                                  Brinker International   
3                                                                             Brinker International Inc.   
4                                                                             Brinker International Inc.   
5  Brinker International Inc. Presents at Bank of America Merrill Lynch 2012 Consumer  Retail Conference   
6    Brinker International Inc. Presents at Barclays Capital PLCs 2012 Retail and Restaurants Conference   

   Year  Quarter  \
0     0  Unknown   
1  2013       Q1   
2  2014       Q1   
3  2008       Q4   
4  2011       Q3   
5  2012  Unknow

In [33]:

model_id = "jaelynnkk/sentence_compression"
tokenizer = T5Tokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)
compression_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

def remove_special_characters(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

def compress_summaries(summaries):
    compressed = compression_pipeline(summaries, max_length=50, min_length=10, do_sample=False)
    return [c['generated_text'] for c in compressed]

# Preprocessing and compression
relevant_sent['Cleaned_Sentence'] = relevant_sent['Sentence'].apply(remove_special_characters)

# Process in batches
batch_size = 16
compressed_sentences = []

for i in range(0, len(relevant_sent), batch_size):
    batch = relevant_sent['Cleaned_Sentence'][i:i+batch_size].tolist()
    compressed_batch = compress_summaries(batch)
    compressed_sentences.extend(compressed_batch)

relevant_sent['Compressed_Sentence'] = compressed_sentences

print(relevant_sent)


                   Company  Year  Quarter  \
0      Atkins Nutritionals     0  Unknown   
1      Atkins Nutritionals     0  Unknown   
2      Atkins Nutritionals     0  Unknown   
3      Atkins Nutritionals     0  Unknown   
4      Atkins Nutritionals     0  Unknown   
..                     ...   ...      ...   
484  Brinker International  2014       Q1   
485  Brinker International  2014       Q1   
486  Brinker International  2014       Q1   
487  Brinker International  2014       Q1   
488  Brinker International  2014       Q1   

                                                                                                    Sentence  \
0                                                    today's call is prerecorded, and there will be no Q&A .   
1    we are excited to discuss with you our plans to consummate a transaction with Atkins Nutritionals, Inc.   
2             the proposed merger between conyers park and atkins nutritionals will take place on april 11 .   
3          

In [34]:
import pandas as pd
from transformers import pipeline

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="jaelynnkk/sentimentanalysistilia",
    tokenizer="jaelynnkk/sentimentanalysistilia",
    device=device
)

def analyze_sentiment(sentence):
    result = sentiment_pipeline(sentence)
    return result[0]['label'], result[0]['score']

relevant_sent[['Sentiment', 'Score']] = relevant_sent['Sentence'].apply(lambda x: pd.Series(analyze_sentiment(x)))


In [35]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def read_and_process_excel(file_path, sheet_name=0):
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    df['Processed_Word'] = df['Phrase'].apply(lambda x: ' '.join([lemmatizer.lemmatize(stemmer.stem(word)) for word in word_tokenize(x.lower())]))
    return df

def process_sentence(sentence):
    return ' '.join([lemmatizer.lemmatize(stemmer.stem(word)) for word in word_tokenize(sentence.lower())])

def classify_and_score_sentences(sentence, df):
    scores = {}
    processed_sentence = process_sentence(sentence)
    for _, row in df.iterrows():
        processed_phrase = row['Processed_Word']
        classification = row['Topic']
        points = row['Points']
        if re.search(r'\b' + re.escape(processed_phrase) + r'\b', processed_sentence):
            if classification in scores:
                scores[classification] += points
            else:
                scores[classification] = points
    return scores

def label_sentences(df, classification_df):
    all_labels = classification_df['Topic'].unique()
    labeled_sentences = []
    for _, row in df.iterrows():
        sentence = row['Sentence']
        scores = classify_and_score_sentences(sentence, classification_df)

        if scores:
            sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            highest_label = sorted_scores[0][0] if sorted_scores[0][1] > 1 else 'Null'
            second_highest_label = sorted_scores[1][0] if len(sorted_scores) > 1 and sorted_scores[1][1] > 1 else 'Null'
        else:
            highest_label, second_highest_label = 'Null', 'Null'

        labeled_sentence = {
            'Company': row['Company'],
            'Year': row['Year'],
            'Quarter': row['Quarter'],
            'Sentence': row['Sentence'],
            'Sentiment': row['Sentiment'],
            'Score': row['Score'],
            'Topic 1': highest_label,
            'Topic 2': second_highest_label
        }
        for label in all_labels:
            labeled_sentence[label] = scores.get(label, 0)
        labeled_sentences.append(labeled_sentence)
    return pd.DataFrame(labeled_sentences)

excel_file_path = '/content/drive/MyDrive/NLP Trend Tool/Dictionary.xlsx'
classification_df = read_and_process_excel(excel_file_path, sheet_name='Sheet1')

relevant_sent = label_sentences(relevant_sent, classification_df)

columns_to_remove = [
    'Acquisition', 'Alternative Processing', 'Better-for-You', 'Commodities', 'Distribution', 'Divestiture', 'Flavor',
    'Food Safety', 'Food Waste', 'Innovation', 'Investment', 'Outsourcing', 'Packaging', 'Pricing', 'Sauces',
    'Specialty Ingredients', 'Storage', 'Supply Chain', 'Sustainability'
]
relevant_sent = relevant_sent.drop(columns=columns_to_remove)

print(relevant_sent)
sentences_sent = relevant_sent['Sentence'].tolist()

# Initialize n-gram columns
for n in range(1, 6):
    relevant_sent[f'{n}-grams'] = relevant_sent['Sentence'].apply(lambda x: generate_ngrams(x, n))


def filter_ngrams(ngrams):
    filtered_ngrams = []
    for ngram in ngrams:
        pos_tags = pos_tag(ngram)
        pos_sequence = [tag for word, tag in pos_tags]

        if len(ngram) == 1:
            if pos_sequence[0] in ['NN', 'NNS', 'NNP', 'NNPS']:
                filtered_ngrams.append(ngram)

        elif len(ngram) == 2:
            if (ngram[0].istitle() and ngram[1].istitle()) or \
               (pos_sequence == ['JJ', 'NN']) or (pos_sequence == ['VBD', 'NN']) or \
               (pos_sequence == ['JJ', 'NNS']) or (pos_sequence == ['NNS', 'VBD']) or (pos_sequence == ['NN', 'VBD']) or (pos_sequence == ['VBD', 'NNS']):
                filtered_ngrams.append(ngram)

        elif len(ngram) == 3:
            if pos_sequence in [['JJ', 'NN', 'NN'], ['NN', 'NN', 'NN'], ['NN', 'IN', 'NN'], ['JJ', 'JJ', 'NN'], ['JJ', 'NN', 'NNS'], ['NN', 'IN', 'NN']]:
                filtered_ngrams.append(ngram)

        elif len(ngram) == 4:
            if pos_sequence in [['NN', 'IN', 'JJ', 'NN'], ['NN', 'NN', 'NN', 'NN'], ['JJ', 'JJ', 'NN', 'NN'], ['JJ', 'NN', 'NNS', 'NN'], ['NN', 'NNS', 'IN', 'NN'], ['NNS', 'VBD', 'DT', 'NN']]:
                filtered_ngrams.append(ngram)

        elif len(ngram) == 5:
            if pos_sequence in [['NN', 'IN', 'JJ', 'NN', 'NN'], ['JJ', 'JJ', 'NN', 'NN', 'NN'], ['JJ', 'NN', 'NNS', 'NN', 'NN']]:
                filtered_ngrams.append(ngram)

    return filtered_ngrams

for n in range(1, 6):
    col_name = f'{n}-grams'
    relevant_sent[f'Filtered_{n}-grams'] = relevant_sent[col_name].apply(filter_ngrams)

df = relevant_sent.copy()

def remove_ngrams_in_larger_ngrams_filtered(df, smaller_n, larger_n):
    for index, row in df.iterrows():
        smaller_ngrams = row[f'Filtered_{smaller_n}-grams']
        larger_ngrams = row[f'Filtered_{larger_n}-grams']

        smaller_ngrams_to_remove = []

        for smaller_ngram in smaller_ngrams:
            if any(set(smaller_ngram).issubset(larger_ngram) for larger_ngram in larger_ngrams):
                smaller_ngrams_to_remove.append(smaller_ngram)

        df.at[index, f'Filtered_{smaller_n}-grams'] = [ngram for ngram in smaller_ngrams if ngram not in smaller_ngrams_to_remove]
    return df

for smaller_n, larger_ns in [(1, [2, 3, 4, 5]), (2, [3, 4, 5]), (3, [4, 5]), (4, [5])]:
    for larger_n in larger_ns:
        df = remove_ngrams_in_larger_ngrams_filtered(df, smaller_n, larger_n)

def ngrams_in_row_to_string2(df):
    ngrams_string_list = []
    for index, row in df.iterrows():
        combined_ngrams = []
        for n in range(1, 6):
            col_name = f'Filtered_{n}-grams'
            combined_ngrams.extend([" ".join(ngram) for ngram in row[col_name]])
        ngrams_string_list.append(" ".join(combined_ngrams))
    df['Combined_Ngrams'] = ngrams_string_list
    return df

df = ngrams_in_row_to_string2(df)

columns_to_keep = ['Company', 'Year', 'Quarter', 'Sentence', 'Sentiment', 'Score', 'Topic 1', 'Topic 2', 'Combined_Ngrams']

print(df.head())

def calculate_new_score(group):
    new_score = 0
    for index, row in group.iterrows():
        if row['Sentiment'] == 'positive':
            new_score += row['Score']
        elif row['Sentiment'] == 'negative':
            new_score -= row['Score']
        elif row['Sentiment'] == 'neutral':
            new_score += row['Score'] / 2
    return new_score

topic_frequency = df['Topic 1'].value_counts().to_dict()
sentiment_df = df.groupby(['Year', 'Quarter', 'Topic 1']).apply(calculate_new_score).reset_index(name='New Score')
sentiment_df['Frequency'] = sentiment_df['Topic 1'].map(topic_frequency)
sentiment_df['New Score'] = sentiment_df['New Score'] / sentiment_df['Frequency']
sentiment_df = sentiment_df.drop(columns=['Frequency'])
sentiment_df['Percentile'] = sentiment_df['New Score'].rank(pct=True) * 100

def classify_sentiment(score):
    if score < 0.4:
        return 'negative'
    elif 0.4 <= score <= 0.7:
        return 'neutral'
    else:
        return 'positive'

sentiment_df['Sentiment'] = sentiment_df['New Score'].apply(classify_sentiment)
print(sentiment_df)

filtered_df[['Sentiment', 'Score']] = filtered_df['Sentence'].apply(lambda x: pd.Series(analyze_sentiment(x)))

def calculate_new_score(group):
    new_score = 0
    for index, row in group.iterrows():
        if row['Sentiment'] == 'positive':
            new_score += row['Score']
        elif row['Sentiment'] == 'negative':
            new_score -= row['Score']
        elif row['Sentiment'] == 'neutral':
            new_score += row['Score'] / 2
    return new_score

keyword_frequency = filtered_df['Keyword'].value_counts().to_dict()
keysent_df = filtered_df.groupby(['Year', 'Quarter', 'Keyword']).apply(calculate_new_score).reset_index(name='New Score')
keysent_df['Frequency'] = keysent_df['Keyword'].map(keyword_frequency)
keysent_df['New Score'] = keysent_df['New Score'] / keysent_df['Frequency']
keysent_df = keysent_df.drop(columns=['Frequency'])
keysent_df['Percentile'] = keysent_df['New Score'].rank(pct=True) * 100

def classify_sentiment(score):
    if score < 0.4:
        return 'negative'
    elif 0.4 <= score <= 0.7:
        return 'neutral'
    else:
        return 'positive'

keysent_df['Sentiment'] = keysent_df['New Score'].apply(classify_sentiment)
print(keysent_df)


                   Company  Year  Quarter  \
0      Atkins Nutritionals     0  Unknown   
1      Atkins Nutritionals     0  Unknown   
2      Atkins Nutritionals     0  Unknown   
3      Atkins Nutritionals     0  Unknown   
4      Atkins Nutritionals     0  Unknown   
..                     ...   ...      ...   
484  Brinker International  2014       Q1   
485  Brinker International  2014       Q1   
486  Brinker International  2014       Q1   
487  Brinker International  2014       Q1   
488  Brinker International  2014       Q1   

                                                                                                    Sentence  \
0                                                    today's call is prerecorded, and there will be no Q&A .   
1    we are excited to discuss with you our plans to consummate a transaction with Atkins Nutritionals, Inc.   
2             the proposed merger between conyers park and atkins nutritionals will take place on april 11 .   
3          

In [36]:
grouped_topic = df.groupby(['Topic 1', 'Year', 'Quarter'])['Sentence']

summary_data = []

for (topic, year, quarter), sentences in grouped_topic:

    combined_text = ' '.join(sentences)

    text_chunks = split_text(combined_text, sentences_per_chunk=5)

    chunk_summaries = [summarizer(chunk, max_length=130, min_length=30, do_sample=False)[0]['summary_text'] for chunk in text_chunks]
    combined_summary = ' '.join(chunk_summaries)

    summary_data.append({
        'Topic 1': topic,
        'Year': year,
        'Quarter': quarter,
        'Summary': combined_summary
    })

topic_sum_df = pd.DataFrame(summary_data)


print(topic_sum_df.head())


Your max_length is set to 130, but your input_length is only 40. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Your max_length is set to 130, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 130, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 130, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your 

       Topic 1  Year  Quarter  \
0  Acquisition     0  Unknown   
1  Commodities  2008       Q4   
2  Commodities  2011       Q3   
3  Commodities  2012  Unknown   
4  Commodities  2013       Q1   

                                                                                                                                                                                      Summary  
0                                                          the proposed merger between conyers park and atkins nutritionals will take place on april 11 . atkins is a scalable M&A platform .  
1                                                                   we're hoping that maybe there is a light at the end of the tunnel on some of these commodity and inflationary pressures .  
2  beef continues to present the most significant inflationary pressure in our commodity basket . beef is the most important commodity in the basket - the most expensive beef in the world .  
3                       commoditi

In [37]:
filtered_df.to_csv('/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/keyword_extraction.csv', index=False)

df.to_csv('/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/topic_detection.csv', index=False)

summary_df.to_csv('/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/keyword_summary.csv', index=False)

final_keywords_df.to_csv('/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/final_keywords.csv', index=False)

topic_sum_df.to_csv('/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/topic_summary.csv', index=False)

keysent_df.to_csv('/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/keywords_sentiment.csv', index=False)

# UI Run

In [39]:
import warnings

# Suppressing User Warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*matplotlib.legend.*")

import os
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import drive
import random
import matplotlib.pyplot as plt

drive.mount('/content/drive')

UPLOAD_FOLDER = '/content/drive/MyDrive/NLP Trend Tool/Earnings Call Transcript Uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

upload_button = widgets.FileUpload(
    accept='.txt',
    multiple=True
)
upload_button.label = "Upload"

def upload_files(change):
    for name, file_info in upload_button.value.items():
        file_path = os.path.join(UPLOAD_FOLDER, name)
        with open(file_path, 'wb') as file:
            file.write(file_info['content'])
    upload_button.label = "Upload"  # Reset label to "Upload"

upload_button.observe(upload_files, names='value')

file_path1 = '/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/keyword_extraction.csv'
file_path2 = '/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/final_keywords.csv'
file_path3 = '/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/topic_summary.csv'
file_path4 = '/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/keyword_summary.csv'
file_path5 = '/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/topic_detection.csv'
file_path6 = '/content/drive/MyDrive/NLP Trend Tool/UI Data Holder/keywords_sentiment.csv'


keyword_df = pd.read_csv(file_path1)
final_keywords = pd.read_csv(file_path2)
topic_sum_df = pd.read_csv(file_path3)
summary_df = pd.read_csv(file_path4)
df = pd.read_csv(file_path5)
keysent_df = pd.read_csv(file_path6)

topics = pd.concat([df['Topic 1'], df['Topic 2']]).unique()


result_data = []

# Function to calculate New Sentiment Score
def calculate_new_sentiment_score(sentiments, scores):
    new_sentiment_score = 0
    for sentiment, score in zip(sentiments, scores):
        if sentiment == 'positive':
            new_sentiment_score += score
        elif sentiment == 'negative':
            new_sentiment_score -= score
        elif sentiment == 'neutral':
            new_sentiment_score += score / 2
    return new_sentiment_score

# Iterate through each unique topic
for topic in topics:
    # Filter rows containing the topic in either 'Topic 1' or 'Topic 2'
    topic_df = df[(df['Topic 1'] == topic) | (df['Topic 2'] == topic)]

    # Group by 'Year' and 'Quarter' and calculate New Sentiment Score
    grouped = topic_df.groupby(['Year', 'Quarter'])
    for (year, quarter), group in grouped:
        sentiments = group['Sentiment']
        scores = group['Score']

        new_sentiment_score = calculate_new_sentiment_score(sentiments, scores)

        if new_sentiment_score < 0.3:
            sentiment_label = 'negative'
        elif new_sentiment_score <= 0.7:
            sentiment_label = 'neutral'
        else:
            sentiment_label = 'positive'

        frequency = len(group)

        result_data.append({
            'Topic': topic,
            'Year': year,
            'Quarter': quarter,
            'New Sentiment Score': new_sentiment_score,
            'Sentiment Label': sentiment_label,
            'Frequency': frequency
        })


sentiment_topic = pd.DataFrame(result_data)

sentiment_topic = sentiment_topic[sentiment_topic['Topic'] != 'Null']
sentiment_topic = sentiment_topic[sentiment_topic['Quarter'] != 'Unknown']

topic_sum_df = topic_sum_df[topic_sum_df['Quarter'] != 'Unknown']
topic_sum_df = topic_sum_df[topic_sum_df['Topic 1'] != 'Null']
topic_sum_df = topic_sum_df[topic_sum_df['Year'] != 'Unknown']


# Calculate the Percentile to know where the New Sentiment Score falls under holistically
sentiment_topic['Percentile Sentiment'] = sentiment_topic['New Sentiment Score'].rank(pct=True)

keyword_df['Company'] = keyword_df['Company'].str.title()
summary_df['Company'] = summary_df['Company'].str.title()

filtered_keyword_df = keyword_df.merge(final_keywords[['Keyword']], on='Keyword', how='inner')

companies = filtered_keyword_df['Company'].unique()
years = sorted(filtered_keyword_df['Year'].unique())


quarters = [quarter for quarter in filtered_keyword_df['Quarter'].unique() if quarter.lower() != 'unknown']

company_buttons = [widgets.ToggleButton(description=company, value=False, layout=widgets.Layout(margin='2px', width='150px')) for company in companies]
year_buttons = [widgets.ToggleButton(description=str(year), value=False, layout=widgets.Layout(margin='2px')) for year in years]
quarter_buttons = [widgets.ToggleButton(description=quarter, value=False, layout=widgets.Layout(margin='2px')) for quarter in quarters]

# Keyword buttons
keyword_container = widgets.VBox([], layout=widgets.Layout(width='60%'))
result_container = widgets.VBox([], layout=widgets.Layout(width='40%'))
summary_container = widgets.VBox(layout=widgets.Layout(width='40%'))

# Sentiment outlook filter buttons
negative_outlook_button = widgets.ToggleButton(
    description="Negative Outlook",
    value=False,
    layout=widgets.Layout(margin='2px', width='150px'),
    style={'button_color': '#f4cccc'}
)

positive_outlook_button = widgets.ToggleButton(
    description="Positive Outlook",
    value=False,
    layout=widgets.Layout(margin='2px', width='150px'),
    style={'button_color': '#cfe2f3'}
)

clear_filters_button = widgets.Button(
    description="Clear All Keyword Filters",
    layout=widgets.Layout(margin='2px', width='200px')
)

def clear_all_filters(b):
    for btn in company_buttons + year_buttons + quarter_buttons + [negative_outlook_button, positive_outlook_button]:
        btn.value = False
    summary_container.children = [widgets.HTML(value="")]
    result_container.children = []
    keyword_container.children = [widgets.HTML(value="<p>Please select companies, years, or quarters of interest to review trending keywords.</p>")]

clear_filters_button.on_click(clear_all_filters)

def on_keyword_filter_change(change):
    selected_companies = [btn.description for btn in company_buttons if btn.value]
    selected_years = [int(btn.description) for btn in year_buttons if btn.value]
    selected_quarters = [btn.description for btn in quarter_buttons if btn.value]

    # filter conditions
    filtered_df_filtered = filtered_keyword_df
    if selected_companies or selected_years or selected_quarters or negative_outlook_button.value or positive_outlook_button.value:
        if selected_companies:
            filtered_df_filtered = filtered_df_filtered[filtered_df_filtered['Company'].isin(selected_companies)]
        if selected_years:
            filtered_df_filtered = filtered_df_filtered[filtered_df_filtered['Year'].isin(selected_years)]
        if selected_quarters:
            filtered_df_filtered = filtered_df_filtered[filtered_df_filtered['Quarter'].isin(selected_quarters)]

        if negative_outlook_button.value:
            filtered_df_filtered = filtered_df_filtered[filtered_df_filtered['Keyword'].isin(
                keysent_df[keysent_df['Sentiment'] == 'negative']['Keyword'])]
        if positive_outlook_button.value:
            filtered_df_filtered = filtered_df_filtered[filtered_df_filtered['Keyword'].isin(
                keysent_df[keysent_df['Sentiment'] == 'positive']['Keyword'])]

        # Displaying summary only if a company and a year are selected
        if selected_companies and selected_years:
            summary_html = ""
            for company in selected_companies:
                for year in selected_years:
                    for quarter in selected_quarters or [None]:
                        summary_row = summary_df
                        summary_row = summary_row[(summary_row['Company'] == company) & (summary_row['Year'] == year)]
                        if quarter:
                            summary_row = summary_row[summary_row['Quarter'] == quarter]
                        if not summary_row.empty:
                            for _, row in summary_row.iterrows():
                                summary = row['Summary']
                                summary_html += f"<div><b>{row['Company']} {row['Year']} {row['Quarter']}:</b> {summary}</div><br>"
            if summary_html:
                summary_container.children = [widgets.HTML(value=summary_html)]
            else:
                summary_container.children = [widgets.HTML(value="<p>Please select companies, years, or quarters of interest to review trending keywords.</p>")]
        else:
            summary_container.children = [widgets.HTML(value="<p>Select at least one company and one year to review summaries!</p>")]
    else:
        filtered_df_filtered = pd.DataFrame(columns=filtered_keyword_df.columns)
        summary_container.children = [widgets.HTML(value="<p>Please select companies, years, or quarters of interest to review trending keywords.</p>")]

    display_keyword_boxes(filtered_df_filtered)

def get_sentiment_emoji(sentiment):
    if sentiment is None:
        return ''
    elif sentiment == 'positive':
        return '😊'
    elif sentiment == 'negative':
        return '☹️'
    elif sentiment == 'neutral':
        return '😐'
    return ''


def display_keyword_boxes(df):
    keywords = df['Keyword'].unique()
    if len(keywords) == 0:
        keyword_container.children = [widgets.HTML(value="<p>Please select companies, years, or quarters of interest to review trending keywords.</p>")]
        result_container.children = []
    else:
        keywords = list(keywords)
        random.shuffle(keywords)

        keyword_buttons = [
            widgets.Button(
                description=keyword,
                layout=widgets.Layout(min_width='250px', max_width='auto', padding='5px', height='auto'),
                style={'text-align': 'center', 'white-space': 'normal'}
            ) for keyword in keywords
        ]

        keyword_rows = [widgets.HBox(keyword_buttons[i:i + 3], layout=widgets.Layout(width='100%')) for i in range(0, len(keyword_buttons), 3)]

        keyword_container.children = keyword_rows
        result_container.children = []


        def on_keyword_click(b):
            keyword = b.description
            matching_rows = df[df['Keyword'] == keyword]

            result_content = []
            for _, row in matching_rows.iterrows():
                result_content.append(widgets.HTML(f"<div><b>{row['Company']}</b> <b>{row['Year']}</b> <b>{row['Quarter']}: </b> {row['Sentence']}</div>"))

            sentiment_row = keysent_df[keysent_df['Keyword'] == keyword]
            if not sentiment_row.empty:
                sentiment = sentiment_row['Sentiment'].values[0]
                emoji = get_sentiment_emoji(sentiment)
                percentile = round(sentiment_row['Percentile'].values[0], 2)
                result_content.append(widgets.HTML(f"<div>{emoji} {percentile}</div>"))

            result_container.children = result_content
            summary_container.children = []

        for btn in keyword_buttons:
            btn.on_click(on_keyword_click)

def on_outlook_button_change(change):
    if change['owner'] == positive_outlook_button and change['new']:
        negative_outlook_button.value = False
    elif change['owner'] == negative_outlook_button and change['new']:
        positive_outlook_button.value = False

negative_outlook_button.observe(on_outlook_button_change, 'value')
positive_outlook_button.observe(on_outlook_button_change, 'value')

for btn in company_buttons + year_buttons + quarter_buttons + [negative_outlook_button, positive_outlook_button]:
    btn.observe(on_keyword_filter_change, 'value')

ui_header = widgets.HTML(value="<h1>Trend Analysis from Earnings Call Transcripts</h1>")
ui_subheader = widgets.HTML(value="<h2>Talk of the Town: Hot Topics of Discussion</h2>")

company_box = widgets.HBox(company_buttons, layout=widgets.Layout(margin='5px 0'))
year_quarter_box = widgets.VBox([
    widgets.HBox(year_buttons, layout=widgets.Layout(margin='5px 0')),
    widgets.HBox(quarter_buttons, layout=widgets.Layout(margin='5px 0')),
    widgets.HBox([negative_outlook_button, positive_outlook_button, clear_filters_button], layout=widgets.Layout(margin='5px 0'))
])

# Stack the result and summary containers vertically
result_summary_container = widgets.VBox([result_container, summary_container], layout=widgets.Layout(min_width='auto', max_width='600px', padding='5px', height='auto'))

# Plotting sentiment and frequency graphs
def update_graphs(selected_topics, selected_years, selected_quarters):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

    if not selected_topics and not selected_years:
        sentiment_graph_output.clear_output()
        with sentiment_graph_output:
            display(widgets.HTML(value="<p>Please select a Topic, Year, or a combination of both to see the sentiment and frequency graphs.</p>"))
        return

    if not selected_topics:
        selected_topics = sentiment_topic['Topic'].unique()
    if not selected_years:
        selected_years = sentiment_topic['Year'].unique()

    plot_df = sentiment_topic[(sentiment_topic['Topic'].isin(selected_topics)) & (sentiment_topic['Year'].isin(selected_years))]

    if plot_df.empty:
        sentiment_graph_output.clear_output()
        with sentiment_graph_output:
            display(widgets.HTML(value="<p>No data available for the selected filters.</p>"))
        return

    plot_df = plot_df.copy()  # Added a "make a copy" step due to SettingWithCopyWarning message popping up

    # Mapping for start date of each quarter
    quarter_start_dates = {
        'Q1': '01-01',  # January 1st
        'Q2': '04-01',  # April 1st
        'Q3': '07-01',  # July 1st
        'Q4': '10-01'   # October 1st
    }

    def get_start_date(row):
        return f"{row['Year']}-{quarter_start_dates[row['Quarter']]}"

    plot_df['Date'] = plot_df.apply(get_start_date, axis=1)
    plot_df['Date'] = pd.to_datetime(plot_df['Date'], format='%Y-%m-%d')
    plot_df = plot_df.sort_values(by=['Topic', 'Date'])

    global_max_sentiment = sentiment_topic['New Sentiment Score'].max()
    global_min_sentiment = sentiment_topic['New Sentiment Score'].min()
    global_max_frequency = sentiment_topic['Frequency'].max()

    for topic in plot_df['Topic'].unique():
        topic_df = plot_df[plot_df['Topic'] == topic].copy()

        topic_df = topic_df.sort_values('Date')

        ax1.plot(topic_df['Date'], topic_df['New Sentiment Score'], label=topic)
        ax2.plot(topic_df['Date'], topic_df['Frequency'], label=topic)

    ax1.set_xlabel('Year-Quarter')
    ax1.set_ylabel('Sentiment Score')
    ax1.set_title('Sentiment Score Over Time')
    ax1.legend(title='Topics', bbox_to_anchor=(1.05, 1), loc='upper left')
    for label in ax1.get_xticklabels():
        label.set_rotation(45)

    ax2.set_xlabel('Year-Quarter')
    ax2.set_ylabel('Frequency')
    ax2.set_title('Frequency Over Time')
    ax2.legend(title='Topics', bbox_to_anchor=(1.05, 1), loc='upper left')
    for label in ax2.get_xticklabels():
        label.set_rotation(45)

    ax1.set_ylim(global_min_sentiment, global_max_sentiment)
    ax2.set_ylim(0, global_max_frequency)

    plt.tight_layout()
    sentiment_graph_output.clear_output()
    with sentiment_graph_output:
        display(fig)
    plt.close(fig)


sentiment_graph_output = widgets.Output()

# Future forecasting filters using `topic_sum_df`
topics = topic_sum_df['Topic 1'].unique()
topic_buttons = [widgets.ToggleButton(description=topic, value=False, layout=widgets.Layout(margin='2px')) for topic in topics]

future_years = topic_sum_df['Year'].unique()
future_year_buttons = [widgets.ToggleButton(description=str(year), value=False, layout=widgets.Layout(margin='2px')) for year in future_years]

future_quarters = topic_sum_df['Quarter'].unique()
future_quarter_buttons = [widgets.ToggleButton(description=quarter, value=False, layout=widgets.Layout(margin='2px')) for quarter in future_quarters]


forecast_result_container = widgets.VBox([])

def on_forecast_filter_change(change):
    selected_topics = [btn.description for btn in topic_buttons if btn.value]
    selected_future_years = [int(btn.description) for btn in future_year_buttons if btn.value]
    selected_future_quarters = [btn.description for btn in future_quarter_buttons if btn.value]

    filtered_df = topic_sum_df
    if selected_topics:
        filtered_df = filtered_df[filtered_df['Topic 1'].isin(selected_topics)]
    if selected_future_years:
        filtered_df = filtered_df[filtered_df['Year'].isin(selected_future_years)]
    if selected_future_quarters:
        filtered_df = filtered_df[filtered_df['Quarter'].isin(selected_future_quarters)]

    display_forecast_results(filtered_df)
    update_graphs(selected_topics, selected_future_years, selected_future_quarters)

def display_forecast_results(df):
    if df.empty:
        forecast_result_container.children = [widgets.HTML(value="<p>No data available for the selected filters.</p>")]
    else:
        forecast_content = []
        for _, row in df.iterrows():
            forecast_content.append(widgets.HTML(f"<div><b>{row['Topic 1']}</b> <b>{row['Year']}</b> <b>{row['Quarter']}</b>: {row['Summary']}</div>"))
        forecast_result_container.children = forecast_content

for btn in topic_buttons + future_year_buttons + future_quarter_buttons:
    btn.observe(on_forecast_filter_change, 'value')

sentiment_graph_container = widgets.VBox([sentiment_graph_output])

future_forecast_container = widgets.VBox([
    widgets.HTML(value="<h2>Market Outlook: Investment Topics of Interest</h2>"),
    widgets.HBox(topic_buttons, layout=widgets.Layout(margin='5px 0')),
    widgets.HBox(future_year_buttons, layout=widgets.Layout(margin='5px 0')),
    widgets.HBox(future_quarter_buttons, layout=widgets.Layout(margin='5px 0')),
    sentiment_graph_container,
    forecast_result_container
])


# "Back to the Top" button
back_to_top_button = widgets.Button(
    description="Back to the Top",
    layout=widgets.Layout(margin='2px', width='200px'),
    style={'button_color': '#B6D7A8', 'color': 'white'}
)

from IPython.display import HTML

# CSS styles
css = """
<style>
    .negative-button {
        background-color: #f4cccc !important;
        color: black;
    }
    .positive-button {
        background-color: #cfe2f3 !important;
        color: black;
    }
</style>
"""

display(HTML(css))

negative_outlook_button.add_class("negative-button")
positive_outlook_button.add_class("positive-button")


from IPython.display import display, Javascript

def on_back_to_top_click(b):
    display(Javascript('window.scrollTo(0, 0);'))

back_to_top_button.on_click(on_back_to_top_click)

future_forecast_container.children += (back_to_top_button,)

display(ui_header, upload_button, ui_subheader, company_box, year_quarter_box, widgets.HTML("<div style='height:10px;'></div>"), widgets.HBox([keyword_container, result_summary_container], layout=widgets.Layout(align_items='flex-start', width='100%')))
display(future_forecast_container)

empty_df = pd.DataFrame(columns=filtered_keyword_df.columns)
display_keyword_boxes(empty_df)

sentiment_graph_output.clear_output()
with sentiment_graph_output:
    display(widgets.HTML(value="<p>Please select a Topic, Year, or a combination of both to see the sentiment graph.</p>"))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


HTML(value='<h1>Trend Analysis from Earnings Call Transcripts</h1>')

FileUpload(value={}, accept='.txt', description='Upload', multiple=True)

HTML(value='<h2>Talk of the Town: Hot Topics of Discussion</h2>')

HBox(children=(ToggleButton(value=False, description='Atkins Nutritionals', layout=Layout(margin='2px', width=…

VBox(children=(HBox(children=(ToggleButton(value=False, description='0', layout=Layout(margin='2px')), ToggleB…

HTML(value="<div style='height:10px;'></div>")

HBox(children=(VBox(layout=Layout(width='60%')), VBox(children=(VBox(layout=Layout(width='40%')), VBox(layout=…

VBox(children=(HTML(value='<h2>Market Outlook: Investment Topics of Interest</h2>'), HBox(children=(ToggleButt…