2. Preprocess Text Data

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm import tqdm  # Progress bar
import scipy.sparse as sp

# spaCy English pipeline used by preprocess_text below
nlp = spacy.load('en_core_web_sm')

# Read the cleaned dataset in one pass (low_memory=False avoids chunked dtype inference)
cleaned_file_path = 'cleaned_medicine_dataset.csv'
data = pd.read_csv(filepath_or_buffer=cleaned_file_path, low_memory=False)

def preprocess_text(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Tokens flagged as stop words or punctuation are dropped; the lemmas of
    the remaining tokens are returned as one space-separated string.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no content for the downstream vectorizer
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

# Combine relevant columns into a single 'text' column for processing.
# Missing values are blanked first: str(NaN) is the literal 'nan', which would
# otherwise be lemmatized and show up as a spurious token for every affected row.
use_texts = data[['use0', 'use1']].fillna('').astype(str)

# preprocess_text depends only on its input string, so each distinct string is
# run through spaCy once and the result is reused for every row that repeats it.
lemma_cache = {}
combined_texts = []
for use_values in tqdm(use_texts.itertuples(index=False, name=None),
                       total=use_texts.shape[0], desc="Processing text"):
    processed_parts = []
    for raw_text in use_values:
        if raw_text not in lemma_cache:
            lemma_cache[raw_text] = preprocess_text(raw_text)
        processed_parts.append(lemma_cache[raw_text])
    # Drop empty parts so blank columns do not leave stray separators
    combined_texts.append(' '.join(part for part in processed_parts if part))

data['combined_text'] = combined_texts

# Output locations for the feature matrix and the label column
X_file_path = 'X_tfidf_vectors.npz'
y_file_path = 'y_labels.csv'

# Labels come straight from the 'name' column
y = data['name']

# Fit a default TF-IDF vocabulary on the combined text and build the sparse feature matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['combined_text'])

# Persist both artifacts: sparse matrix as .npz, labels as a headered CSV without the index
sp.save_npz(file=X_file_path, matrix=X)
y.to_csv(path_or_buf=y_file_path, index=False)

print("Text data preprocessed and saved successfully.")

Processing text: 100%|██████████| 222975/222975 [51:06<00:00, 72.71it/s] 


Text data preprocessed and saved successfully.
