<a href="https://colab.research.google.com/github/isaacbull/NLP-TASKS/blob/main/NLP_preprocessing_and_training_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building a preprocessing and training pipeline for an NLP model

### Using a gradient boosting model and GridSearchCV for hyperparameter search

In [None]:
import re

import nlpaug.augmenter.word as naw
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer

# Load the shared NLP resources. TextPreprocessor reads these three names as
# module-level globals (lemmatizer, tokenizer and stemmer respectively), so
# they have to exist before any text is transformed.
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
stemmer = PorterStemmer()

# Load the dataset (the file has no header row, so columns get integer names)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Define the preprocessing class
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that cleans and normalizes raw text.

    Each document goes through, in order: HTML stripping, URL removal,
    date/number normalization, tokenization, stop-word removal, stemming,
    lemmatization and (optionally) synonym augmentation. The resulting pieces
    are joined back into one space-separated string.

    Tokenization, stemming and lemmatization use the module-level
    ``tokenizer``, ``stemmer`` and ``nlp`` objects, which must be defined
    before ``transform`` is called.

    Parameters
    ----------
    augment : bool, default=True
        Whether to run WordNet synonym augmentation as the last step. The
        default keeps the historical behavior. Set it to ``False`` when the
        transformed text must not be rewritten (for example when scoring
        held-out data).
    """

    # Built once for the class instead of once per remove_stopwords() call.
    _STOP_WORDS = frozenset([
        "this", "is", "a", "using", "and", "be", "can", "for", "have", "not", "with",
    ])

    def __init__(self, augment=True):
        # scikit-learn convention: __init__ only stores its parameters.
        self.augment = augment

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with HTML markup removed."""
        return BeautifulSoup(text, "html.parser").get_text()

    def remove_urls(self, text):
        """Delete ``http://``/``https://`` links and ``www.`` addresses."""
        return re.sub(r'http[s]?://\S+|www\.\S+', '', text)

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without stop words (compared case-insensitively)."""
        return [token for token in tokens if token.lower() not in self._STOP_WORDS]

    def stem_text(self, tokens):
        """Stem every token with the module-level ``stemmer``."""
        return [stemmer.stem(token) for token in tokens]

    def lemmatize_text(self, tokens):
        """Lemmatize ``tokens`` with the module-level spaCy pipeline ``nlp``.

        The tokens are joined with spaces and re-parsed by spaCy, so the
        returned list follows spaCy's own tokenization and may differ in
        length from the input.
        """
        return [token.lemma_ for token in nlp(' '.join(tokens))]

    def augment_text(self, text, augmenter=None):
        """Run WordNet synonym augmentation on ``text``.

        Parameters
        ----------
        text : str or list of str
            Input handed to the augmenter's ``augment`` method unchanged
            (``transform`` passes the list of lemmas).
        augmenter : object with an ``augment`` method, optional
            Augmenter to reuse. When omitted a new
            ``naw.SynonymAug(aug_src='wordnet')`` is created for this call.

        Returns
        -------
        Whatever ``augmenter.augment(text)`` returns.
        """
        if augmenter is None:
            augmenter = naw.SynonymAug(aug_src='wordnet')
        return augmenter.augment(text)

    def normalize_text(self, text):
        """Rewrite dates and the phrase "one hundred" into a canonical form.

        Dates such as ``5th January 2023`` become ``2023-January-5`` (year,
        month name, day). The lower-case phrase ``one hundred`` becomes
        ``100``.
        """
        date_pattern = r'(\d{1,2})(st|nd|rd|th)?\s(January|February|March|April|May|June|July|August|September|October|November|December)\s(\d{4})'
        date_replacement = r'\4-\3-\1'
        text = re.sub(date_pattern, date_replacement, text)
        number_pattern = r'\bone hundred\b'
        text = re.sub(number_pattern, '100', text)
        return text

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def _preprocess_document(self, text, augmenter):
        """Run the full cleaning chain on one document and return a string."""
        # String-level cleaning happens before tokenization.
        cleaned = self.remove_html_tags(text)
        cleaned = self.remove_urls(cleaned)
        cleaned = self.normalize_text(cleaned)

        # Token-level steps.
        tokens = tokenizer.tokenize(cleaned)
        tokens = self.remove_stopwords(tokens)
        tokens = self.stem_text(tokens)
        tokens = self.lemmatize_text(tokens)
        if augmenter is not None:
            tokens = self.augment_text(tokens, augmenter)
        return ' '.join(tokens)

    def transform(self, X):
        """Preprocess every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of str
            One space-joined, preprocessed string per input document.
        """
        # One augmenter is shared by all documents of this call instead of
        # building a new one per document.
        augmenter = naw.SynonymAug(aug_src='wordnet') if self.augment else None
        return [self._preprocess_document(text, augmenter) for text in X]

# Sample dataset: the feature columns of each row are cast to str and joined
# into one space-separated pseudo-document; the last column is the label.
texts = data.iloc[:, :-1].astype(str).apply(lambda x: ' '.join(x), axis=1)
labels = data.iloc[:, -1]
y = labels.values

# Split the RAW texts. The pipeline below already contains the preprocessing
# and TF-IDF steps, so it must be fed unprocessed strings; handing it a
# pre-vectorized matrix would push TF-IDF rows through the text preprocessor.
# Splitting before any fitting also means the TF-IDF vocabulary and IDF
# weights are learned from training data only.
X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', GradientBoostingClassifier())
])

# Define parameter grid for GridSearch
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.1, 0.01],
    'classifier__max_depth': [3, 4, 5]
}

# Grid Search
# NOTE: the whole pipeline (text preprocessing included) is refit for every
# parameter combination and fold. Pipeline's `memory` argument can cache the
# transformer outputs if this becomes too slow.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model summary
best_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)
print("Best model score: ", grid_search.best_score_)

# Evaluate the model on the held-out raw texts; the fitted pipeline applies
# the same preprocessing and vectorization it learned during training.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

### Using a neural network and Keras Tuner for neural architecture search (NAS)

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
import spacy
import nlpaug.augmenter.word as naw
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras_tuner import HyperModel, RandomSearch

# Shared NLP resources, read as module-level globals by TextPreprocessor:
# the Porter stemmer, the BERT word-piece tokenizer and the spaCy pipeline
# used for lemmatization.
stemmer = PorterStemmer()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that cleans and normalizes raw text.

    Each document goes through, in order: HTML stripping, URL removal,
    date/number normalization, tokenization, stop-word removal, stemming,
    lemmatization and (optionally) synonym augmentation. The resulting pieces
    are joined back into one space-separated string.

    Tokenization, stemming and lemmatization use the module-level
    ``tokenizer``, ``stemmer`` and ``nlp`` objects, which must be defined
    before ``transform`` is called.

    Parameters
    ----------
    augment : bool, default=True
        Whether to run WordNet synonym augmentation as the last step. The
        default keeps the historical behavior. Set it to ``False`` when the
        transformed text must not be rewritten (for example when scoring
        held-out data).
    """

    # Built once for the class instead of once per remove_stopwords() call.
    _STOP_WORDS = frozenset([
        "this", "is", "a", "using", "and", "be", "can", "for", "have", "not", "with",
    ])

    def __init__(self, augment=True):
        # scikit-learn convention: __init__ only stores its parameters.
        self.augment = augment

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with HTML markup removed."""
        return BeautifulSoup(text, "html.parser").get_text()

    def remove_urls(self, text):
        """Delete ``http://``/``https://`` links and ``www.`` addresses."""
        return re.sub(r'http[s]?://\S+|www\.\S+', '', text)

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without stop words (compared case-insensitively)."""
        return [token for token in tokens if token.lower() not in self._STOP_WORDS]

    def stem_text(self, tokens):
        """Stem every token with the module-level ``stemmer``."""
        return [stemmer.stem(token) for token in tokens]

    def lemmatize_text(self, tokens):
        """Lemmatize ``tokens`` with the module-level spaCy pipeline ``nlp``.

        The tokens are joined with spaces and re-parsed by spaCy, so the
        returned list follows spaCy's own tokenization and may differ in
        length from the input.
        """
        return [token.lemma_ for token in nlp(' '.join(tokens))]

    def augment_text(self, text, augmenter=None):
        """Run WordNet synonym augmentation on ``text``.

        Parameters
        ----------
        text : str or list of str
            Input handed to the augmenter's ``augment`` method unchanged
            (``transform`` passes the list of lemmas).
        augmenter : object with an ``augment`` method, optional
            Augmenter to reuse. When omitted a new
            ``naw.SynonymAug(aug_src='wordnet')`` is created for this call.

        Returns
        -------
        Whatever ``augmenter.augment(text)`` returns.
        """
        if augmenter is None:
            augmenter = naw.SynonymAug(aug_src='wordnet')
        return augmenter.augment(text)

    def normalize_text(self, text):
        """Rewrite dates and the phrase "one hundred" into a canonical form.

        Dates such as ``5th January 2023`` become ``2023-January-5`` (year,
        month name, day). The lower-case phrase ``one hundred`` becomes
        ``100``.
        """
        date_pattern = r'(\d{1,2})(st|nd|rd|th)?\s(January|February|March|April|May|June|July|August|September|October|November|December)\s(\d{4})'
        date_replacement = r'\4-\3-\1'
        text = re.sub(date_pattern, date_replacement, text)
        number_pattern = r'\bone hundred\b'
        text = re.sub(number_pattern, '100', text)
        return text

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def _preprocess_document(self, text, augmenter):
        """Run the full cleaning chain on one document and return a string."""
        # String-level cleaning happens before tokenization.
        cleaned = self.remove_html_tags(text)
        cleaned = self.remove_urls(cleaned)
        cleaned = self.normalize_text(cleaned)

        # Token-level steps.
        tokens = tokenizer.tokenize(cleaned)
        tokens = self.remove_stopwords(tokens)
        tokens = self.stem_text(tokens)
        tokens = self.lemmatize_text(tokens)
        if augmenter is not None:
            tokens = self.augment_text(tokens, augmenter)
        return ' '.join(tokens)

    def transform(self, X):
        """Preprocess every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of str
            One space-joined, preprocessed string per input document.
        """
        # One augmenter is shared by all documents of this call instead of
        # building a new one per document.
        augmenter = naw.SynonymAug(aug_src='wordnet') if self.augment else None
        return [self._preprocess_document(text, augmenter) for text in X]

class MyHyperModel(HyperModel):
    """Keras Tuner hypermodel for a one-hidden-layer binary classifier.

    The search space covers the width of the hidden layer and the dropout
    rate that follows it.

    Parameters
    ----------
    input_dim : int
        Number of input features; used as the hidden layer's input shape.
    name : str, optional
        Forwarded to ``HyperModel.__init__``.
    tunable : bool, default=True
        Forwarded to ``HyperModel.__init__``.
    """

    def __init__(self, input_dim, name=None, tunable=True):
        # Initialise the HyperModel base class before adding our own state.
        super().__init__(name=name, tunable=tunable)
        self.input_dim = input_dim

    def build(self, hp):
        """Build and compile one candidate model.

        Parameters
        ----------
        hp : keras_tuner.HyperParameters
            Source of the hyperparameter values for this trial.

        Returns
        -------
        A compiled ``Sequential`` model with a single sigmoid output unit.
        """
        # Search space: hidden width 32..512 in steps of 32, dropout rate
        # 0.1..0.5 in steps of 0.1.
        hidden_units = hp.Int('units', min_value=32, max_value=512, step=32)
        dropout_rate = hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)

        model = Sequential()
        model.add(Dense(units=hidden_units, activation='relu', input_shape=(self.input_dim,)))
        model.add(Dropout(rate=dropout_rate))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

# Load the dataset (the file has no header row, so columns get integer names)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Separate features and labels: the feature columns of each row are cast to
# str and joined into one space-separated pseudo-document; the last column is
# the label.
texts = data.iloc[:, :-1].astype(str).apply(lambda x: ' '.join(x), axis=1)
labels = data.iloc[:, -1]
y = labels.values

# Split the RAW texts before anything is fitted, so the TF-IDF vocabulary and
# IDF weights are learned from the training portion only.
texts_train, texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),  # Preprocessing step
    ('vectorizer', TfidfVectorizer()),  # Vectorization step
])

# Preprocess and vectorize exactly once: fit on the training texts, then reuse
# the fitted pipeline for the test texts so both share one vocabulary.
X_train_transformed = pipeline.fit_transform(texts_train)
X_train = X_train_transformed.toarray()  # dense arrays for Keras
X_test = pipeline.transform(texts_test).toarray()

# Take the input width from the matrix that is actually used for training.
input_dim = X_train.shape[1]

# Hyperparameter tuning for neural network
tuner = RandomSearch(MyHyperModel(input_dim=input_dim), objective='val_accuracy', max_trials=5)
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

# Best model summary
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

# Evaluate the model
# The sigmoid output is thresholded at 0.5 and flattened from (n, 1) to (n,).
y_pred = (best_model.predict(X_test) > 0.5).astype("int32").ravel()
print(classification_report(y_test, y_pred))