# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [45]:
from pprint import pprint
from time import time
import logging
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

### Choose a few categories fro the entire 20 categories

In [46]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [47]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories

In [48]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



### Define a pipeline combining a text feature extractor with a simple classifier

In [49]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(tol=1e-3)),
])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [50]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [51]:
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                           n_jobs=-1, verbose=1)

### Start the grid search

In [52]:
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


### Best Score

In [53]:
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.952


### Best Parameter

In [54]:
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 1)


### Choose the best model

### Use the model to classify a piece of text

In [55]:
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [56]:


from sklearn.feature_extraction.text import CountVectorizer

# Define a custom text preprocessing function
def clean(text):
    # Remove non-alphabetic characters using regular expression
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    return cleaned_text






In [57]:
#Step 2-3: Pre-process and Vectorize train and test data
vect = CountVectorizer(preprocessor=clean)

#clean 1s a custom defined function for pre-processing
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
print(X_train_dtm.shape, X_test_dtm.shape)

(642, 14140) (215, 14140)


In [58]:
from gensim.models import Word2Vec
import numpy as np
def embedding_feats(list_of_lists,w2v_model):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        for token in tokens:
            if token in w2v_model.wv:
                feat_for_this += w2v_model.wv[token]
                count_for_this +=1    
            if count_for_this > 0:
                feats.append(feat_for_this / count_for_this)
        else:
            feats.append(zero_vector)  # Handle the case where no valid tokens are found
    return feats   
texts_processed_train = [clean(text) for text in X_train]
texts_processed = [text.split() for text in texts_processed_train]

w2v_model = Word2Vec(sentences=texts_processed, vector_size=300, window=5, min_count=1, sg=0)

# To get the average word embedding for a document, use your embedding_feats function
train_vectors_word2vec = embedding_feats(texts_processed, w2v_model)

print('done loading Word2Vec')
len(train_vectors_word2vec)







done loading Word2Vec


229081

In [59]:
#Prepare training data in doc2vec format:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

d2vtrain = [TaggedDocument((d),tags=[str(i)]) for i, d in enumerate(X_train)]
#Train a doc2vec model to learn tweet representations. Use only training data!!
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=10, dm =1, epochs=100)
model. build_vocab(d2vtrain)

model.train(d2vtrain, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")

print("Model Saved")

Model Saved


In [60]:
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Prepare training data in Doc2Vec format:


#d2vtrain = [TaggedDocument(d, [str(i)]) for i, d in enumerate(train_data)]

# Create a Doc2Vec model

#model = Doc2Vec(vector_size=50, alpha=0.025, min_count=10, dm=1, epochs=100)

# Build the vocabulary
#model.build_vocab(d2vtrain)

# Train the model
#model.train(d2vtrain, total_examples=model.corpus_count, epochs=model.epochs)

# Save the model
#model.save("d2v.model")
#print("Model Saved")

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
#instantiate a Multinomial Nalve Bayes classifier
nb = MultinomialNB()
#train the model
nb.fit(X_train_dtm, y_train)

#make class predictions for test data
y_pred_class = nb.predict(X_test_dtm)

#Multinomial Naive Bayes
vect = CountVectorizer(preprocessor=clean)
#vect1= Doc2Vec(preprocessor=clean)

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

nb = MultinomialNB()
nb. fit(X_train_dtm, y_train)

y_pred_class = nb.predict(X_test_dtm)

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred_class))

Accuracy:  0.9441860465116279


In [62]:
#Logistic Regression

from sklearn. linear_model import LogisticRegression

logreg = LogisticRegression(class_weight="balanced")

logreg. fit(X_train_dtm, y_train)

y_pred_class = logreg.predict(X_test_dtm)

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred_class))

Accuracy:  0.958139534883721


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
#Support Vector Machine

from sklearn.svm import LinearSVC

vect = CountVectorizer(preprocessor=clean, max_features=1000)
X_train_dtm = vect.fit_transform(X_train)

X_test_dtm = vect.transform(X_test)

classifier = LinearSVC(class_weight='balanced')
classifier.fit(X_train_dtm, y_train)

y_pred_class = classifier.predict(X_test_dtm)

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred_class))



Accuracy:  0.8744186046511628




In [64]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate a Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Train the model using your training data (X_train_dtm, y_train)
dt_classifier.fit(X_train_dtm, y_train)

# Make predictions on the test data
y_pred_class = dt_classifier.predict(X_test_dtm)

# Calculate and print the accuracy of the model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred_class))

Accuracy:  0.7906976744186046


In [65]:
from sklearn.metrics import accuracy_score

# Define the configurations to evaluate
configurations = [
    ('Multinomial Naive Bayes', CountVectorizer(), MultinomialNB()),
    ('Logistic Regression', CountVectorizer(), LogisticRegression(class_weight="balanced")),
    ('SVM', CountVectorizer(), LinearSVC(class_weight='balanced')),
    ('Decision Tree', CountVectorizer(), DecisionTreeClassifier()),
    ('Word2Vec', None, LinearSVC(class_weight='balanced')),  
    ('Doc2Vec', None, LinearSVC(class_weight='balanced')), 
]

results = []

# Evaluate each configuration
for name, vectorizer, classifier in configurations:
    if vectorizer is not None:
        X_train_dtm = vectorizer.fit_transform(X_train)
        X_test_dtm = vectorizer.transform(X_test)
    else:
        # Use your Word2Vec or Doc2Vec model to get document embeddings here
        pass
    
    classifier.fit(X_train_dtm, y_train)
    y_pred_class = classifier.predict(X_test_dtm)
    accuracy = accuracy_score(y_test, y_pred_class)
    results.append((name, accuracy))

# Sort the results by accuracy in descending order
results.sort(key=lambda x: x[1], reverse=True)

# Write results to a tabular format in a text file
with open('simran_results.txt', 'w') as file:
    file.write('Algorithm\tAccuracy\n')
    for name, accuracy in results:
        file.write(f'{name}\t{accuracy:.2f}\n')

print('Benchmark results written to simran_results.txt')



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Benchmark results written to simran_results.txt


