# Imports

In [1]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime
import numpy as np
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Get the Data

In [2]:
# Collect every '*_transcripts.json' file from the raw-data folder.
# glob returns paths in arbitrary, OS-dependent order, so sort them to keep the
# row order of the combined frame reproducible across machines and re-runs.
data_directory = '../data/raw/'
transcript_files = sorted(glob.glob(os.path.join(data_directory, '*_transcripts.json')))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# fail early with a message that names the directory that was searched.
if not transcript_files:
    raise FileNotFoundError(
        f"No '*_transcripts.json' files found in {os.path.abspath(data_directory)}"
    )

# Read each file into its own DataFrame, then stack them under a fresh 0..n-1 index.
all_transcripts = [pd.read_json(file) for file in transcript_files]
combined_transcripts_df = pd.concat(all_transcripts, ignore_index=True)

combined_transcripts_df.head()

Unnamed: 0,symbol,quarter,year,date,content
0,AAL,3,2023,2023-10-19 12:37:10,Operator: Thank you for standing by and welcom...
1,AAL,2,2023,2023-07-20 11:51:04,"Operator: Thank you for standing by, and welco..."
2,AAL,1,2023,2023-04-27 11:15:04,"Operator: Thank you for standing by, and welco..."
3,AAL,4,2022,2023-01-26 12:53:07,"Operator: Thank you for standing by, and welco..."
4,AAL,3,2022,2022-10-20 14:26:04,"Operator: Thank you for standing by, and welco..."


In [5]:
# Reference date (1 December 2021) used by the cells below.
current_date = datetime(year=2021, month=12, day=1)

In [7]:
# Render the date as YYYY-MM-DD, the form interpolated into the price-request URL.
f"{current_date:%Y-%m-%d}"

'2021-12-01'

In [11]:
import requests
import io
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
# Load environment variables and API key
load_dotenv()
FMP_API_KEY = os.getenv('FMP_API_KEY')
# Without a key the request cannot succeed; stop here with a clear message
# instead of sending "apikey=None" and failing later in a less obvious way.
if not FMP_API_KEY:
    raise RuntimeError("FMP_API_KEY is not set; add it to the environment or to a .env file")

# Define your tickers and date range
tickers = ['AAL','DAL']  # Your list of tickers
# Date whose batch end-of-day prices are requested
current_date = datetime(2021,12,1)

url = f"https://financialmodelingprep.com/api/v4/batch-request-end-of-day-prices?date={current_date.strftime('%Y-%m-%d')}&apikey={FMP_API_KEY}"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Raise on HTTP errors so `data` is never left undefined (or stale from an
# earlier run) when the request fails.
response.raise_for_status()
# The response body is parsed as CSV.
data = pd.read_csv(io.BytesIO(response.content))



In [12]:
# Display the batch end-of-day price table loaded by the previous cell.
data

Unnamed: 0,symbol,date,open,low,high,close,adjClose,volume
0,0R2Y.L,2021-12-01,677.250000,663.500000,678.15000,668.430000,668.430000,3038.0
1,0R55.L,2021-12-01,0.970000,0.936000,0.98300,0.958000,0.958000,2879.0
2,0RD1.L,2021-12-01,160.400000,157.600000,160.55200,160.362300,160.362300,1704.0
3,0R2I.L,2021-12-01,24.080000,23.390000,24.12000,23.390000,23.390000,40.0
4,0HOY.L,2021-12-01,38.520000,38.520000,39.29200,38.768000,38.768000,165.0
...,...,...,...,...,...,...,...,...
67954,XAGHKD,2021-12-01,174.240000,173.087000,175.23930,174.298100,174.298100,2643.0
67955,XAUCAD,2021-12-01,2282.100000,2258.500000,2284.40000,2267.600000,2267.600000,1316.0
67956,XAUTRY,2021-12-01,23765.290000,23411.510000,24406.07000,24232.420000,24232.420000,2748.0
67957,ARSZAR,2021-12-01,0.158714,0.156218,0.15885,0.157772,0.157772,601.0


In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned transcripts: English stop words are
# removed, as are terms found in more than 95% of the documents or in fewer
# than 2 of them.
# NOTE(review): the 'clean_content' column is not created in any visible cell —
# confirm the text-cleaning step runs before this one.
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
transcripts_vectorized = vectorizer.fit_transform(combined_transcripts_df['clean_content'])

# Five-topic LDA with a fixed seed; fit() returns the estimator itself, so the
# fitted model can be bound in a single statement.
lda = LatentDirichletAllocation(random_state=0, n_components=5).fit(transcripts_vectorized)

# Function to print topics discovered by LDA
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one weight vector
            per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: How many terms to list for each topic.

    Writes to stdout a ``Topic #k:`` header and one space-separated line of
    terms per topic, followed by a single blank line after the last topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic #%d:" % topic_number)
        print(" ".join(top_terms))
    print()

# Show the ten strongest terms for each LDA topic.
print_top_words(model=lda, feature_names=vectorizer.get_feature_names_out(), n_top_words=10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Create a TF-IDF vectorizer instance (same vocabulary filters as the count
# vectorizer used for LDA).
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(combined_transcripts_df['clean_content'])
tfidf_terms = vectorizer.get_feature_names_out()

# Term-by-term matrix: entry (i, j) is the sum over documents of the product of
# the TF-IDF weights of terms i and j. `@` is always a matrix product, whereas
# `*` is only a matrix product for the legacy scipy sparse-matrix classes.
co_occurrence_matrix = tfidf_matrix.T @ tfidf_matrix  # This is a sparse matrix
co_occurrence_matrix.setdiag(0)  # Remove self-co-occurrences

# Convert sparse matrix to DataFrame for easier manipulation.
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix).
# NOTE: the dense frame is vocabulary x vocabulary, which can be very large.
co_occurrence_df = pd.DataFrame(co_occurrence_matrix.toarray(),
                                index=tfidf_terms,
                                columns=tfidf_terms)

# Show a portion of the co-occurrence matrix (adjust the slice as needed).
co_occurrence_df.iloc[:10, :10]


In [None]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level; the NER helper
# below calls it and reads the entities from the returned document.
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities
def named_entity_recognition(text):
    """Return the named entities spaCy finds in ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and returns a list of
    ``(entity_text, entity_label)`` tuples in the order of ``doc.ents``.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# Apply NER to a sample of the transcripts.
# A fixed random_state makes the sample (and every chart built from it)
# reproducible; capping the size avoids a ValueError when fewer than 5
# transcripts are loaded.
sample_size = min(5, len(combined_transcripts_df))
sample_content = (
    combined_transcripts_df['clean_content']
    .sample(sample_size, random_state=0)
    .apply(named_entity_recognition)
)
print(sample_content)


In [None]:
# Flatten the per-transcript entity lists into one list of (text, label) pairs.
all_entities = []
for doc_entities in sample_content:
    all_entities.extend(doc_entities)

# Tally how often each entity label occurs.
entity_types = Counter(label for _text, label in all_entities)

# Bar chart with one bar per entity label.
entity_labels = list(entity_types)
entity_counts = [entity_types[label] for label in entity_labels]
sns.barplot(x=entity_labels, y=entity_counts)
plt.xlabel('Entity Type')
plt.ylabel('Frequency')
plt.title('Frequency of Named Entity Types')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Extract POS tags for the sampled transcripts.
# `sample_content` holds (entity text, entity label) pairs, so unpacking it
# yields entity labels, not part-of-speech tags. Re-parse the same sampled rows
# and read each token's coarse POS tag instead.
sampled_texts = combined_transcripts_df.loc[sample_content.index, 'clean_content']
all_pos_tags = [
    token.pos_
    for doc in nlp.pipe(sampled_texts)
    for token in doc
    if not token.is_space  # whitespace tokens carry no grammatical category
]

# Count the frequency of each POS tag
pos_tags_frequency = Counter(all_pos_tags)

# Create a bar chart
sns.barplot(x=list(pos_tags_frequency.keys()), y=list(pos_tags_frequency.values()))
plt.title('Frequency of Part-of-Speech Tags')
plt.xlabel('POS Tag')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pie chart of each tag's share of the total, labelled with one-decimal percentages.
pie_labels = pos_tags_frequency.keys()
pie_sizes = pos_tags_frequency.values()

plt.figure(figsize=(8, 8))
plt.pie(pie_sizes, labels=pie_labels, autopct='%1.1f%%')
plt.title('Proportion of Part-of-Speech Tags')
plt.show()

**TODO:** It could be useful to build a visual that shows the timeline of the different earnings calls, annotated with each call's sentiment score (or simply colored red/green by sentiment), alongside the price of each stock and the industry-level price over the same period.