# Imports and Configurations 

In [1]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Data directories. The defaults are the original local folders; set the
# FINANCE_INPUT_PATH / FINANCE_OUTPUT_PATH environment variables to point the
# notebook at other folders without editing this cell.
# os.path.join(<dir>, "") guarantees a trailing separator (and leaves a value
# that already ends in one untouched), because file paths are built from these
# constants by plain string concatenation.
INPUT_PATH = os.path.join(
    os.environ.get("FINANCE_INPUT_PATH", "C:/Users/mushj/Downloads/RAW FINANCE DATA/"),
    "",
)
OUTPUT_PATH = os.path.join(
    os.environ.get("FINANCE_OUTPUT_PATH", "C:/Users/mushj/Downloads/PROCESSED FINANCE DATA/"),
    "",
)

In [3]:
# Load the raw NVDA file from the input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer=INPUT_PATH + "FNSPID_NVDA.csv")

# Text preprocessing

In [4]:
# Fetch the NLTK data this notebook needs. Re-running is cheap: packages that
# are already present are reported as up to date rather than fetched again.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up in recent NLTK
# releases (3.9+); 'punkt' is kept for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared resources used by the preprocessing function below.
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests per token
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mushj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mushj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mushj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mushj\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
def preprocess_text_vectorized(text_series: pd.Series) -> pd.Series:
    """Clean a Series of raw text and return the cleaned Series.

    Steps, in order:

    1. lowercase the text;
    2. delete URLs (anything starting with ``http`` or ``www`` up to the next
       whitespace);
    3. delete HTML tags (``<...>``);
    4. delete every character that is not an ASCII letter or whitespace;
    5. per row: tokenize with ``word_tokenize``, drop tokens found in the
       notebook-level ``stop_words`` set, lemmatize the rest with the
       notebook-level ``lemmatizer`` and join them with single spaces.

    Parameters
    ----------
    text_series : pd.Series
        Series of strings. Missing values are allowed.

    Returns
    -------
    pd.Series
        Cleaned text on the same index as the input. Missing values stay
        missing instead of raising inside the tokenizer.
    """
    # Lowercase all text
    cleaned = text_series.str.lower()

    # Remove URLs
    cleaned = cleaned.str.replace(r"http\S+|www\S+|https\S+", '', regex=True)

    # Remove HTML tags
    cleaned = cleaned.str.replace(r'<.*?>', '', regex=True)

    # Remove special characters and numbers.
    # NOTE(review): characters are deleted, not replaced by a space, so the
    # pieces around an apostrophe or hyphen fuse into one token
    # (e.g. "openai's" -> "openais"). Left unchanged to keep existing outputs
    # stable; confirm whether that is intended.
    cleaned = cleaned.str.replace(r"[^a-zA-Z\s]", '', regex=True)

    def _lemmatize_row(text):
        # Tokenize, drop stop words, lemmatize, and re-join one document.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in word_tokenize(text)
            if word not in stop_words
        )

    # na_action='ignore' skips missing rows: the .str steps above pass NaN
    # through untouched, and word_tokenize cannot handle a non-string value.
    return cleaned.map(_lemmatize_row, na_action='ignore')

In [6]:
# Keep only the date and LSA-summary columns, for rows that have a summary.
# The explicit .copy() makes sub_df an independent frame: a new column is
# assigned onto it afterwards, and without the copy that assignment stays free
# of pandas' SettingWithCopyWarning only because the intermediate dropna()
# result happens to be garbage-collected.
sub_df = (
    df
    .dropna(subset=['Lsa_summary'])
    [['Date', 'Lsa_summary']]
    .copy()
)

In [7]:
%%time
# Store the cleaned text in a new column next to the raw summaries; the %%time
# cell magic above reports how long the preprocessing call takes.
sub_df['Lsa_summary_cleaned'] = preprocess_text_vectorized(sub_df['Lsa_summary'])

CPU times: total: 5.78 s
Wall time: 5.87 s


In [8]:
# Show the first five summaries, each followed by its cleaned counterpart.
preview = sub_df.iloc[:5]
for raw_text, cleaned_text in zip(preview['Lsa_summary'], preview['Lsa_summary_cleaned']):
    print(raw_text, '\n')
    print(cleaned_text, '\n\n-----')

Stock splits, meanwhile, continue to get attention from investors after nearly every big tech stock split its shares in 2021 and 2022, including Tesla, Apple, Alphabet, Amazon, Nvidia (NASDAQ: NVDA), and Shopify. The launch of OpenAI's ChatGPT in late 2022 set off a new race to harness generative AI technologies, which some tech CEOs think could be as transformative as the internet has been over the past three decades. Its graphics processing units (GPUs) and accelerators have been in high demand from cloud infrastructure companies and others looking to scale up and build capacity for AI applications. 

stock split meanwhile continue get attention investor nearly every big tech stock split share including tesla apple alphabet amazon nvidia nasdaq nvda shopify launch openais chatgpt late set new race harness generative ai technology tech ceo think could transformative internet past three decade graphic processing unit gpus accelerator high demand cloud infrastructure company others look

In [9]:
# Write the raw and cleaned summaries to the output directory, omitting the index column.
sub_df.to_csv(
    path_or_buf=OUTPUT_PATH + 'FNSPID_NVDA_cleaned.csv',
    index=False,
)