In [1]:
pip install pandas nltk

Note: you may need to restart the kernel to use updated packages.




In [2]:
pip install textblob

Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.




In [1]:
import os
import pandas as pd
from textblob import TextBlob
import re

# Function to extract stock code and year from the filename
def extract_stock_code_and_year(filename):
    """Pull a stock code and a four-digit year out of a file name.

    Only the last path component is inspected (both ``\\`` and ``/`` are
    treated as separators) and its extension is ignored, so drive letters,
    directory names and an upper-case extension are never mistaken for a
    stock code.

    Args:
        filename: A bare file name or a full path.

    Returns:
        A ``(stock_code, year)`` tuple. ``stock_code`` is the first run of
        capital letters that is not part of a longer alphabetic word, or
        ``None`` if there is none. ``year`` is the first standalone run of
        exactly four digits as an ``int``, or ``None`` if there is none.
    """
    # Keep only the final path component; handle Windows and POSIX separators
    # regardless of the platform the notebook runs on.
    name = filename.replace('\\', '/').rsplit('/', 1)[-1]

    # Drop the extension (if any) so something like ".TXT" cannot match below.
    stem, dot, _extension = name.rpartition('.')
    if dot and stem:
        name = stem

    stock_code, year = None, None

    # Lookarounds instead of \b: "_" and digits count as word characters, so
    # \b would fail to delimit "AAPL" in "AAPL_2020" or "AAPL2020".
    match = re.search(r'(?<![A-Za-z])[A-Z]+(?![A-Za-z])', name)
    if match:
        stock_code = match.group()

    # Exactly four digits, not embedded in a longer digit run.
    match = re.search(r'(?<!\d)\d{4}(?!\d)', name)
    if match:
        year = int(match.group())

    return stock_code, year

# Function to clean up the data
def clean_text(text):
    """Strip all-capital words and numeric tokens from ``text``.

    The patterns are applied in order, each match being replaced by the empty
    string; surrounding whitespace is left as-is except that the final result
    is trimmed at both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    patterns_to_drop = (
        r'\b[A-Z]+\b',           # whole words made only of capital letters
        r'\b\d+(?:\.\d+)?\b',    # integers and simple decimals
    )
    result = text
    for pattern in patterns_to_drop:
        result = re.sub(pattern, '', result)
    return result.strip()

# Function to preprocess the text using TextBlob
def preprocess_text(text):
    """Reduce ``text`` to the noun phrases TextBlob finds in it.

    Args:
        text: The string to analyse.

    Returns:
        The noun phrases reported by ``TextBlob(text).noun_phrases``, joined
        into one string with single spaces.

    Note:
        A later cell in this notebook defines another function with the same
        name; once that cell has run, this definition is replaced in the kernel.
        NOTE(review): TextBlob's noun-phrase extraction presumably needs its
        NLTK corpora installed (``python -m textblob.download_corpora``) --
        confirm on a fresh environment.
    """
    noun_phrases = TextBlob(text).noun_phrases
    return " ".join(noun_phrases)

# Main function to process the dataset
def process_dataset(file_path):
    """Turn one text file into a single-row DataFrame.

    Args:
        file_path: Path of the UTF-8 text file to read. The same string is
            also handed to ``extract_stock_code_and_year``.

    Returns:
        A ``pandas.DataFrame`` with exactly one row and the columns
        ``'Stock Code'``, ``'Year'`` and ``'Preprocessed Text'``.

    Errors raised while opening or reading the file propagate unchanged.
    """
    stock_code, year = extract_stock_code_and_year(file_path)

    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Clean first, then run the keyword extraction on the cleaned text.
    keywords_text = preprocess_text(clean_text(raw_text))

    return pd.DataFrame({
        'Stock Code': [stock_code],
        'Year': [year],
        'Preprocessed Text': [keywords_text],
    })

# Input files to process.
# NOTE(review): these are absolute, machine-specific Windows paths; the cell
# will not run on another machine without editing them.
file_paths = [
    r"C:\Users\This PC\Downloads\science and technology.txt",
    r"C:\Users\This PC\Downloads\Oak Technology.txt"
]

# For every input file, build its one-row DataFrame and write it out twice:
# once as .xlsx and once as .csv.
for file_path in file_paths:
    df = process_dataset(file_path)
    
    # Base name of the input without directory or extension; reused as the
    # stem of both output files.
    filename_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    
    # Excel output. The name is relative, so the file lands in the current
    # working directory, not next to the input file.
    output_excel_path = f"{filename_without_extension}.xlsx"
    df.to_excel(output_excel_path, index=False)
    
    # CSV output, same stem and same (working) directory; the index column is
    # omitted in both formats.
    output_csv_path = f"{filename_without_extension}.csv"
    df.to_csv(output_csv_path, index=False)


In [4]:
import nltk
# Fetch NLTK's "stopwords" package. The saved output of this cell shows it
# being unzipped into the user's nltk_data folder and the call returning True.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\This
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources this cell needs (safe to re-run: packages that
# are already present are reported as up-to-date). Besides the stop-word list,
# word_tokenize() needs the Punkt tokenizer models; newer NLTK releases look
# for them under the name "punkt_tab", so both ids are requested.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Function to preprocess the text data
def preprocess_text(text):
    """Normalise ``text`` into a space-joined string of non-stop-word tokens.

    Steps, in order:
      1. Remove whole words made of four or more capital letters and/or digits.
      2. Remove any remaining standalone runs of digits.
      3. Lower-case the result and split it with ``word_tokenize``.
      4. Discard tokens found in NLTK's English stop-word list.

    No other filtering is applied to the tokens.

    Args:
        text: The raw string to process.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Inner call strips the long upper-case/digit codes, outer call the
    # leftover integers -- same order as applying them one after the other.
    stripped = re.sub(r"\b\d+\b", "", re.sub(r"\b[A-Z0-9]{4,}\b", "", text))

    words = word_tokenize(stripped.lower())

    english_stop_words = set(stopwords.words("english"))
    return " ".join(w for w in words if w not in english_stop_words)

# Function to process a given file and generate DataFrame
def process_file(file_path):
    """Read one text file and return its metadata plus preprocessed text.

    Args:
        file_path: Path of the UTF-8 text file. Windows (``\\``) and POSIX
            (``/``) separators, as well as a bare file name, are accepted
            when deriving the stock code and year.

    Returns:
        A dict with the keys ``"Stock Code"``, ``"Year"`` and
        ``"Preprocessed Text"``. ``"Stock Code"`` is the alphanumeric name
        directly in front of ``.txt`` or ``"Unknown"``. ``"Year"`` is a
        four-digit string or ``"Unknown"``.

    Errors raised while opening or reading the file propagate unchanged.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # The name must be purely alphanumeric and start right after a path
    # separator or at the very beginning of the string. The original pattern
    # only accepted a backslash, so "/data/AAPL.txt" and "AAPL.txt" were
    # reported as "Unknown".
    stock_code_match = re.search(r"(?:^|[\\/])([A-Za-z0-9]+)\.txt", file_path)
    stock_code = stock_code_match.group(1) if stock_code_match else "Unknown"

    # Look for the year in the file name first, so digits in a directory or
    # user name do not win over it; fall back to the whole path so a year
    # encoded in a folder name is still found. The lookarounds require exactly
    # four digits rather than the first four of a longer number.
    file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    standalone_year = r"(?<!\d)\d{4}(?!\d)"
    year_match = (re.search(standalone_year, file_name)
                  or re.search(standalone_year, file_path))
    year = year_match.group() if year_match else "Unknown"

    # Preprocess the text
    preprocessed_text = preprocess_text(text)

    return {"Stock Code": stock_code, "Year": year, "Preprocessed Text": preprocessed_text}

# Input files to process.
# NOTE(review): absolute, machine-specific Windows paths; edit before running
# on another machine.
file_paths = ["C:\\Users\\This PC\\Downloads\\science and technology.txt",
              "C:\\Users\\This PC\\Downloads\\Oak Technology.txt"]

# One dict per input file, in the same order as file_paths.
data = [process_file(file_path) for file_path in file_paths]

# Build a single DataFrame with one row per file (this rebinds the name `df`
# used by the earlier per-file loop).
df = pd.DataFrame(data)

# Write the combined table to the current working directory in both formats,
# without the index column.
df.to_excel("preprocessed_data.xlsx", index=False)
df.to_csv("preprocessed_data.csv", index=False)


[nltk_data] Downloading package stopwords to C:\Users\This
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
