<a href="https://colab.research.google.com/github/ibrahimr/NLP-intro/blob/main/NEW_TOPIC_CLASSIFICATION_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Download dataset:
AG News Classification Dataset
The AG's news topic classification dataset is constructed by choosing the 4 largest classes
from the original corpus. Each class contains 30,000 training samples and 1,900 testing
samples, so there are 120,000 training samples and 7,600 testing samples in total.
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Upload the AG News Classification Dataset files from the local machine
# (later cells read "train.csv" and "test.csv" by relative path).
# NOTE(review): google.colab is only importable inside a Colab runtime — this
# cell will fail elsewhere; confirm the notebook is Colab-only.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the training split from the CSV in the working directory.
# NOTE(review): the file is decoded as ISO-8859-1; if the CSV is actually
# UTF-8, non-ASCII characters will be mis-decoded — confirm the encoding.
training = pd.read_csv("train.csv",encoding="ISO-8859-1")
# Preview the first rows (rendered as the cell output).
training.head()

In [None]:
# Dataset size and class balance. Only the last expression of a cell is
# rendered, so the row count is printed explicitly instead of being discarded.
print(f"Training rows: {len(training)}")
training['Class Index'].value_counts()

In [None]:
# Down-sample to a class-balanced subset: the same number of rows is drawn at
# random from every 'Class Index' group.
# DataFrameGroupBy.sample is used instead of groupby(...).apply(lambda ...):
# applying a function to groups that still contain the grouping column is
# deprecated in recent pandas, and 'Class Index' must survive because later
# cells read it as the target. The random draw may differ from the one the
# apply-based version produced for the same seed.
# reset_index(drop=True) replaces the scattered original row labels with a
# fresh 0..n-1 index.
TRAIN_SAMPLES_PER_CLASS = 1000  # rows kept per class
training = (
    training
    .groupby('Class Index')
    .sample(n=TRAIN_SAMPLES_PER_CLASS, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Read the testing split from the CSV in the working directory.
# NOTE(review): decoded as ISO-8859-1, same as the training file — confirm
# this matches the file's real encoding.
testing = pd.read_csv("test.csv",encoding="ISO-8859-1")
# Preview the first rows (rendered as the cell output).
testing.head()

In [None]:
# Number of rows in the testing frame (rendered as the cell output).
len(testing)

In [None]:
# Down-sample the test split to the same number of rows per class.
# DataFrameGroupBy.sample replaces groupby(...).apply(lambda ...): applying a
# function to groups that still contain the grouping column is deprecated in
# recent pandas, and 'Class Index' must survive because it is the target
# column. The random draw may differ from the one the apply-based version
# produced for the same seed.
TEST_SAMPLES_PER_CLASS = 1000  # rows kept per class
testing = (
    testing
    .groupby('Class Index')
    .sample(n=TEST_SAMPLES_PER_CLASS, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Give each split a single free-text field: headline, one space, then the
# article description. A row with a missing Title or Description gets a
# missing 'text' value.
for split in (training, testing):
    split['text'] = split['Title'] + ' ' + split['Description']

In [None]:
# Preview the training frame, now including the combined 'text' column.
training.head()


## Cleaning the dataset

In [None]:
# Dependencies for the text-cleaning step.
import re
# NOTE(review): spacy is not used by any cell visible in this part of the
# notebook — confirm whether the import is still needed.
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
# Fetch the NLTK data packages at run time (requires network access).
# 'punkt_tab' is presumably the tokenizer data needed by word_tokenize;
# 'stopwords' backs stopwords.words('english') in the cleaning function.
nltk.download('punkt_tab')
nltk.download('stopwords')
# NOTE(review): the visible cleaning code stems with PorterStemmer and uses no
# WordNet-based component — confirm whether 'wordnet' is required.
nltk.download('wordnet')

In [None]:
# NOTE(review): a later cell in this notebook defines preprocess_text again and
# that later definition is the one in effect; keep a single definition.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip URLs, strip HTML-like tags, replace runs
    of non-word characters with a space, drop every remaining character that is
    not an ASCII letter or whitespace, tokenize, remove English stop words and
    apply Porter stemming.

    Args:
        text: Raw text. Any non-string value (e.g. NaN produced by a missing
            title or description) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # The dot after "www" is escaped so that only literal "www." prefixes match.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Turn punctuation into spaces first; deleting it outright would glue
    # neighbouring words together ("oil-price" -> "oilprice").
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stemmed_tokens)

In [None]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
  """Normalise one raw text into a space-separated string of stemmed tokens.

  Steps, in order: lowercase, strip URLs, strip HTML-like tags, replace runs
  of non-word characters with a space, drop every remaining character that is
  not an ASCII letter or whitespace, tokenize, remove English stop words and
  apply Porter stemming.

  Args:
    text: Raw text. Any non-string value (e.g. NaN produced by a missing
      title or description) is treated as empty.

  Returns:
    The cleaned text, or ``''`` when nothing survives the filtering.
  """
  if not isinstance(text, str):
    return ''
  text = text.lower()  # Lowercase the text
  # Remove URLs; the dot after "www" is escaped so only literal "www." matches.
  text = re.sub(r'http\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
  # Punctuation becomes a space so neighbouring words are not glued together.
  text = re.sub(r'\W+', ' ', text)
  text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove digits and other non-letters
  tokens = word_tokenize(text)  # Tokenization
  # Remove stop words, then stem what is left.
  stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
  return ' '.join(stemmed_tokens)  # Reconstruct the text

In [None]:
# Discard incomplete rows first, then derive the normalised text column from
# the combined title/description field.
training = training.dropna().assign(
    clean_text=lambda frame: frame["text"].apply(preprocess_text)
)


In [None]:
# Drop incomplete rows BEFORE cleaning, mirroring the training cell: a missing
# Title or Description leaves a non-string value in 'text', and the cleaning
# function is written for strings.
testing = testing.dropna()
testing["clean_text"] = testing["text"].apply(preprocess_text)

## Determining X_train, y_train, X_test, y_test

In [None]:
# Features are the cleaned texts; targets come from the 'Class Index' column.
X_train, y_train = training['clean_text'].values, training['Class Index'].values
X_test, y_test = testing['clean_text'].values, testing['Class Index'].values

## Feature representation
TF-IDF with unigrams, bigrams, and trigrams.

Three types of n-grams:
- unigrams (one word): "machine", "learning", "fun"
- bigrams (two words): "machine learning", "deep learning"
- trigrams (three words): "machine learning is", "deep learning is"

`ngram_range=(min_n, max_n)` selects which n-gram sizes the vectorizer extracts:

- Unigrams: (1, 1)
- Bigrams: (2, 2)
- Unigrams + bigrams: (1, 2)
- Unigrams + bigrams + trigrams: (1, 3)

## Unigrams
N-grams: These are contiguous sequences of words or characters.

A 1-gram (or unigram) is simply a single word.

A 2-gram (or bigram) would be two consecutive words, and a 3-gram would be three consecutive words, and so on.

In [None]:
# Unigram TF-IDF: vocabulary and IDF weights are learned from the training
# texts only; the test texts are projected with the already-fitted vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_Unigram, X_test_Unigram = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Display the cleaned training texts as an array (rendered as the cell output).
# NOTE(review): consider a short preview such as .head() instead of the full array.
training['clean_text'].values

## Model Evaluation

In [None]:
def evaluate_model(model_name, y_true, y_pred):
  """Score a classifier's predictions and plot its confusion matrix.

  Args:
    model_name: Label used in the returned dict and in the plot title.
    y_true: Ground-truth class labels.
    y_pred: Predicted class labels, aligned element-wise with ``y_true``.

  Returns:
    dict with the keys 'Model Name', 'Accuracy', 'Precision', 'Recall',
    'F1 Score' (precision/recall/F1 are weighted averages over classes) and
    'Classification Report' (the text report from classification_report).
  """
  # Calculate metrics
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred, average='weighted')
  recall = recall_score(y_true, y_pred, average='weighted')
  f1 = f1_score(y_true, y_pred, average='weighted')
  cm = confusion_matrix(y_true, y_pred)
  # confusion_matrix orders rows/columns by the sorted labels seen in either
  # array, so the tick labels must be built from both, not from y_true alone.
  labels = np.unique(np.concatenate((np.asarray(y_true), np.asarray(y_pred))))
  # Create a report
  report = classification_report(y_true, y_pred)
  # Collect results
  metrics = {'Model Name': model_name,'Accuracy': accuracy,'Precision': precision,'Recall': recall,'F1 Score': f1,'Classification Report': report}
  # Plot Confusion Matrix
  fig, ax = plt.subplots(figsize=(4, 4))
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=labels, yticklabels=labels, ax=ax)
  ax.set_title(f'Confusion Matrix for {model_name}')
  ax.set_xlabel('Predicted Label')
  ax.set_ylabel('True Label')
  plt.show()
  # The original returned the undefined name `metric`, raising NameError.
  return metrics

### Random Forest Classifier

- Random Forest is an ensemble algorithm: it combines several models, of the same or of different kinds, to classify objects.

- A Random Forest classifier builds a set of decision trees, each from a randomly selected subset of the training set. It then aggregates the votes of the individual trees to decide the final class of a test object.
- Basic parameters: the total number of trees to generate, plus decision-tree parameters such as the minimum number of samples required to split a node and the split criterion.


In [None]:
# Define the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
# Define the hyperparameters for grid search
# (3 * 4 * 3 * 3 = 108 candidates, each fitted once per CV fold).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Initialize GridSearchCV
# NOTE(review): the original line was cut off after "sc"; scoring='accuracy'
# is assumed, and n_jobs=-1 is added to use all cores — confirm both.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
# Fit the grid search on the TF-IDF matrix; X_train holds raw strings, which
# the classifier cannot consume.
grid_search.fit(X_train_Unigram, y_train)
# Get the best estimator from grid search
best_rf_classifier = grid_search.best_estimator_
# Make predictions using the best model (same TF-IDF feature space as training)
y_pred = best_rf_classifier.predict(X_test_Unigram)
# Evaluate the model
evaluation_results = evaluate_model('RandomForestClassifier', y_test, y_pred)
# Print the evaluation results
for key, value in evaluation_results.items():
    if key == 'Classification Report':
        print(value)  # Print report separately for better readability
    else:
        print(f"{key}: {value:.4f}" if isinstance(value, float) else f"{key}: \n{value}")
# Print the best parameters found by grid search
print("\nBest hyperparameters found by GridSearchCV:")
print(grid_search.best_params_)