# *Temporal Text Analysis Model*

## **Leveraging OpenAI**

In [None]:
# Install the third-party packages this notebook needs into the current runtime.
# NOTE(review): only openai and striprtf are imported in the visible cells;
# cohere and tiktoken look unused here — confirm they are still required.
!pip install openai
!pip install cohere
!pip install tiktoken
!pip install striprtf # to parse .rtf files
# Download the large English spaCy model that spacy.load('en_core_web_lg') expects later on.
# NOTE(review): spaCy itself is not installed in this cell; presumably the runtime
# already provides it — confirm.
!python -m spacy download en_core_web_lg

In [8]:
import openai
import os

# Read the OpenAI API key from the environment rather than hardcoding it in the notebook.
# os.getenv returns None when OPENAI_API_KEY is not set, so this line never raises.
# NOTE(review): no visible cell reads openai_api_key afterwards — confirm where it is consumed.
openai_api_key = os.getenv('OPENAI_API_KEY')

## **Load Data Step**

In [None]:
# Import necessary libraries
import glob
import pandas as pd
from striprtf.striprtf import rtf_to_text
from google.colab import drive

# Mount Google Drive so the documents stored there are reachable from the runtime
drive.mount('/content/drive')

# Collect the path of every RTF file in the corpus directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to keep the row order of the DataFrame reproducible between runs.
file_paths = sorted(glob.glob('/content/drive/My Drive/Entrepreneurial Ventures/Relion AIAA/Clients/Online Education/BlackFacts/historical_docs/*.rtf'))

# Fail early with an explicit message: with no matches the DataFrame built below
# would have no 'content' column and later cells would die with an opaque KeyError.
if not file_paths:
  raise FileNotFoundError(
      'No .rtf files were found in the historical_docs directory; '
      'check that Google Drive is mounted and that the path is correct.'
  )

# One record (dict) per document; turned into a DataFrame in a single step afterwards
data = []

for file_path in file_paths:
  # Read the raw RTF markup; the handle is closed as soon as the text is in memory
  with open(file_path, 'r', encoding='utf-8') as file:
    rtf_content = file.read()

  # Strip the RTF control words, leaving plain text
  content = rtf_to_text(rtf_content)

  data.append({'file_path': file_path,
               'content': content
  })

# DataFrame with one row per document and the columns 'file_path' and 'content'
df = pd.DataFrame(data)

# Define a function to extract the year from the content of a document
def extract_year(content):
  """Return the year that appears in parentheses in a document's text.

  The text is scanned for parenthesised groups such as "(1863)". The first
  group whose content parses as an integer is returned. Groups holding other
  text, for example "(draft)", are skipped instead of aborting the extraction.

  Args:
    content: Plain-text content of one document.

  Returns:
    The year as an int.

  Raises:
    ValueError: If no parenthesised group contains an integer.
  """
  import re  # local import keeps the function usable without touching the import cell

  # Each match is one "(...)" group that contains no nested parentheses,
  # visited in order of appearance.
  for group in re.finditer(r'\(([^()]*)\)', content):
    try:
      # int() tolerates surrounding whitespace, e.g. "( 1863 )"
      return int(group.group(1))
    except ValueError:
      continue  # not a number; try the next parenthesised group

  # Only a short prefix is echoed so the message stays readable for long documents
  raise ValueError(
      f'No parenthesised year found in document starting with: {content[:60]!r}'
  )

# Derive a 'year' column by running extract_year on every document's text.
# extract_year raises ValueError when it cannot parse a year, so a single
# malformed document stops this cell.
df['year'] = df['content'].apply(extract_year)

# Display the resulting DataFrame (file path, content and extracted year per document)
df


## **Preprocessing Step**

In [10]:
# Import the necessary SpaCy components for natural language processing
import spacy
from spacy.tokens import Token

# Load spaCy's large English pipeline; it is used below to tokenize the documents
nlp = spacy.load('en_core_web_lg')

# Register a custom boolean attribute on Token, reachable as token._.is_archaic.
# It defaults to False; force=True lets this cell be re-run without an "already registered" error.
# NOTE(review): no visible cell reads or writes this attribute — confirm it is still needed.
Token.set_extension('is_archaic', default=False, force=True)

# Lower-case vocabulary of archaic words, listed alphabetically and without repeats
archaic_words = {
    "'tis", "'twas",
    "abide", "afore", "aforesaid", "art",
    "behold", "betwixt",
    "canst", "couldst",
    "dost", "doth",
    "ere",
    "fain", "forsooth",
    "hark", "hast", "hath", "hither",
    "nay",
    "shan't", "shouldst",
    "thee", "thereunto", "thine", "thither", "thou", "thy",
    "unto", "upon",
    "verily",
    "whence", "whereby", "wherefore", "wherein", "whereupon", "wherewith",
    "whosoever", "wilt", "wouldst",
    "yon", "yonder",
}

# Define a function to check if a word is an archaic word
def is_archaic(word):
  """Tell whether `word` is in the archaic_words vocabulary, ignoring case."""
  normalized = word.lower()
  return normalized in archaic_words

# Define the main preprocessing function to apply to the text
def preprocess(text):
  """Normalize one document into a space-separated string of tokens.

  The text is run through the spaCy pipeline `nlp` and each token is handled
  by the first rule that applies:

  * punctuation and whitespace-only tokens are dropped;
  * tokens that are part of a named entity are kept verbatim;
  * archaic words (see `is_archaic`) are kept verbatim with an "_archaic" suffix;
  * every other token is replaced by its lower-cased lemma.

  Args:
    text: Raw document text.

  Returns:
    The processed tokens joined by single spaces.
  """
  doc = nlp(text)

  processed_tokens = []

  for token in doc:
    # Drop punctuation, and also whitespace-only tokens (e.g. runs of newlines):
    # these would otherwise be emitted as tokens and leave stray line breaks
    # in the joined output.
    if token.is_punct or token.is_space:
      continue

    # A non-empty entity type means the token belongs to a named entity: keep it as written
    if token.ent_type_:
      processed_tokens.append(token.text)
      continue

    if is_archaic(token.text):
      # Tag archaic words so they stay distinguishable from modern vocabulary
      processed_tokens.append(token.text + "_archaic")
    else:
      # Ordinary word: reduce to its lemma and lower-case it
      processed_tokens.append(token.lemma_.lower())

  return " ".join(processed_tokens)

# Run preprocess on every document's text and store the normalized string in a new
# 'processed_content' column; the original 'content' column is left untouched.
df['processed_content'] = df['content'].apply(preprocess)

## **Text Analysis and Feature Engineering Step**

In [None]:
# Import necessary libraries for text analysis and feature engineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd

# Advanced sentiment analysis for further iterations
# Define a function to perform LDA topic modeling, focused on media perceptions
def lda_topic_modeling(data, n_topics=5, n_words=10):
    """Fit an LDA topic model on a list of documents and print the top words per topic.

    Args:
        data: Iterable of document strings (here: the preprocessed contents).
        n_topics: Number of topics to fit.
        n_words: Number of top-weighted words printed for each topic.

    Returns:
        A tuple (lda, tfidf_vect) holding the fitted LatentDirichletAllocation
        model and the fitted TfidfVectorizer. Both are needed to assign topics
        to documents afterwards. Earlier versions returned nothing, so callers
        that ignore the return value keep working.
    """
    # Uni- to tri-grams, English stop words removed; terms appearing in more than
    # 95% of the documents or in fewer than 2 documents are discarded.
    # NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer is
    # imported above but unused) — confirm TF-IDF weights are intended here.
    tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', ngram_range=(1, 3))
    dtm = tfidf_vect.fit_transform(data)

    # Fixed random_state keeps the fitted topics reproducible between runs
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(dtm)

    # Look the vocabulary up once instead of rebuilding it for every topic
    feature_names = tfidf_vect.get_feature_names_out()

    # argsort is ascending, so the last n_words indices are the highest-weighted
    # terms; they are printed from lowest to highest weight.
    for index, topic in enumerate(lda.components_):
        print(f'The top {n_words} words for topic #{index}')
        print([feature_names[i] for i in topic.argsort()[-n_words:]])
        print('\n')

    return lda, tfidf_vect

# Fit the topic model on the processed content and keep the fitted model and
# vectorizer at module level: assign_topic below reads both of them.
lda, tfidf_vect = lda_topic_modeling(df['processed_content'].tolist())

# Function to assign the dominant topic to each document
def assign_topic(row):
  """Return the index of the most probable topic for one processed document string.

  Uses the module-level `lda` and `tfidf_vect` fitted above.
  """
  # transform yields one row of topic weights for the single document; take the largest
  topic = np.argmax(lda.transform(tfidf_vect.transform([row]))[0])
  return topic

# Assign the dominant topic to each document
df['topic'] = df['processed_content'].apply(assign_topic)


In [None]:
# Import necessary libraries for time-series analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure the 'year' column is treated as an integer for proper sorting and grouping
df['year'] = df['year'].astype(int)

# The sentiment plot needs a 'sentiment' column, which no earlier cell in this
# notebook creates. Guard the section so that its absence is reported clearly
# and does not abort the cell before the topic plot is drawn.
if 'sentiment' in df.columns:
  # Average sentiment of all documents published in the same year
  yearly_sentiment = df.groupby('year')['sentiment'].mean().reset_index()

  # Plot the average sentiment over time
  plt.figure(figsize=(10, 6))
  sns.lineplot(x='year', y='sentiment', data=yearly_sentiment)
  plt.title('Average Sentiment Over Time')
  plt.xlabel('Year')
  plt.ylabel('Average Sentiment')
  plt.show()
else:
  print("Skipping the sentiment plot: df has no 'sentiment' column "
        "(no sentiment scoring step has been run yet).")

# Count the documents per (year, topic) pair to see the distribution of topics over time
topic_distribution = df.groupby(['year', 'topic']).size().reset_index(name='count')

# Reshape to years as rows and topics as columns; combinations with no documents become 0
topic_distribution_pivot = topic_distribution.pivot(index='year', columns='topic', values='count').fillna(0)

# Stacked bars: one bar per year, one coloured segment per topic
topic_distribution_pivot.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Topic Distribution Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Documents')
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Topic', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
