<a href="https://colab.research.google.com/github/itz-aniket-akm/LABS/blob/main/Module3_Lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

# NLTK data used by the cells below: the stop-word list, the tokenizer behind
# word_tokenize, the tagger behind pos_tag, and WordNet (+ OMW) for the lemmatizer.
# Recent NLTK releases (3.9+) load `punkt_tab` and `averaged_perceptron_tagger_eng`
# instead of the older pickled `punkt` / `averaged_perceptron_tagger` packages, so
# both generations are fetched to keep the notebook runnable on either.
NLTK_RESOURCES = (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import re
import numpy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bs4 import BeautifulSoup

def cleanText(text, lemmatize, stemmer):
    """Normalize one raw document into a list of lowercase word tokens.

    The input is coerced to ``str``, HTML markup is stripped with BeautifulSoup,
    every character outside ``A-Za-z`` is replaced by a space (which removes
    digits and punctuation), and the result is lower-cased and tokenized.
    The tokens are then lemmatized, stemmed, or returned unchanged.

    Args:
        text: The raw document. ``bytes``/``bytearray`` values are decoded;
            any other non-string value (e.g. a float NaN or a numpy integer
            coming out of a pandas column) is converted with ``str()``.
        lemmatize: If true, POS-tag the tokens and lemmatize each one with
            WordNet. Takes precedence over ``stemmer``.
        stemmer: If true (and ``lemmatize`` is false), stem each token with
            the English Snowball stemmer.

    Returns:
        list[str]: The processed tokens. When both flags are false the cleaned
        tokens are returned as-is (previously the function fell through and
        returned ``None``, which crashed callers that join the result).
    """
    if isinstance(text, (bytes, bytearray)):
        text = text.decode()
    elif not isinstance(text, str):
        text = str(text)

    text = BeautifulSoup(text, "lxml").get_text()
    text = re.sub(r"[^A-Za-z]", " ", text).lower()
    tokens = word_tokenize(text)

    if lemmatize:
        # Map the first letter of the Penn Treebank tag to a WordNet POS.
        # Tags outside these four families fall back to NOUN, which is the
        # lemmatizer's own default; passing '' (as before) raised a KeyError
        # that had to be swallowed by a bare `except`.
        wordnet_pos_by_prefix = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        wordnet_lemmatizer = WordNetLemmatizer()
        return [
            wordnet_lemmatizer.lemmatize(word, wordnet_pos_by_prefix.get(tag[:1], wordnet.NOUN))
            for word, tag in pos_tag(tokens)
        ]

    if stemmer:
        snowball_stemmer = SnowballStemmer('english')
        return [snowball_stemmer.stem(word) for word in tokens]

    return tokens

In [None]:
# Quick sanity check of cleanText on a single word: first the stemming path,
# then the lemmatizing path. Each result is joined back into one string.
sample_text = "Playing"

sample_text_result = " ".join(map(str, cleanText(sample_text, lemmatize=False, stemmer=True)))
print(sample_text, sample_text_result, sep="\n")

sample_text_result = " ".join(map(str, cleanText(sample_text, lemmatize=True, stemmer=False)))
print(sample_text_result)

Playing
play
play


In [None]:
# Functions to convert document(s) to a list of words, with the option of removing stopwords. Returns document-term matrix.

def createBagOfWords(train, test, remove_stopwords, lemmatize, stemmer):
    """Turn train/test documents into dense bag-of-words count matrices.

    Every document is cleaned with ``cleanText`` and re-joined into a single
    space-separated string. A ``CountVectorizer`` is fitted on the cleaned
    training documents only and then applied to the test documents, so both
    matrices share the training vocabulary.

    Args:
        train: Iterable of raw training documents.
        test: Iterable of raw test documents.
        remove_stopwords: If true, NLTK's English stop words are dropped by
            the vectorizer.
        lemmatize: Forwarded to ``cleanText``.
        stemmer: Forwarded to ``cleanText``.

    Returns:
        tuple: ``(bag_of_words_train, bag_of_words_test)`` as dense arrays.
    """
    def _clean_all(documents):
        # cleanText yields tokens; the vectorizer expects one string per document.
        return [
            " ".join(str(token) for token in cleanText(document, lemmatize, stemmer))
            for document in documents
        ]

    vectorizer_options = {'analyzer': 'word', 'input': 'content'}
    if remove_stopwords:
        vectorizer_options['stop_words'] = stopwords.words('english')
    vectorizer = CountVectorizer(**vectorizer_options)

    train_documents = _clean_all(train)
    test_documents = _clean_all(test)

    train_matrix = vectorizer.fit_transform(train_documents).toarray()
    test_matrix = vectorizer.transform(test_documents).toarray()
    return train_matrix, test_matrix


In [None]:
def createTFIDF(train, test, remove_stopwords, lemmatize, stemmer):
    """Turn train/test documents into dense TF-IDF matrices.

    Each document goes through ``cleanText`` and is joined back into one
    space-separated string. The ``TfidfVectorizer`` learns its vocabulary and
    IDF weights from the training documents only; the test documents are
    transformed with that fitted vectorizer.

    Args:
        train: Iterable of raw training documents.
        test: Iterable of raw test documents.
        remove_stopwords: If true, NLTK's English stop words are dropped by
            the vectorizer.
        lemmatize: Forwarded to ``cleanText``.
        stemmer: Forwarded to ``cleanText``.

    Returns:
        tuple: ``(tfidf_train, tfidf_test)`` as dense arrays.
    """
    # stop_words=None is the vectorizer's default, i.e. "keep every word".
    stop_word_list = stopwords.words('english') if remove_stopwords else None
    vectorizer = TfidfVectorizer(analyzer='word', input='content', stop_words=stop_word_list)

    cleaned = {}
    for split_name, documents in (('train', train), ('test', test)):
        cleaned[split_name] = [
            " ".join(map(str, cleanText(document, lemmatize, stemmer)))
            for document in documents
        ]

    train_matrix = vectorizer.fit_transform(cleaned['train']).toarray()
    test_matrix = vectorizer.transform(cleaned['test']).toarray()
    return train_matrix, test_matrix

In [None]:
# Upload the Reviews CSV file that has been shared with you.
# Run this cell, click on the 'Choose files' button and upload the file.
from google.colab import files
# Opens Colab's file-upload widget. Per the recorded cell output the chosen file
# is saved into the working directory, which is where the later
# pd.read_csv('reviews.csv') calls look for it.
uploaded = files.upload()

Saving reviews.csv to reviews.csv


In [None]:
import pandas as pd
# Load the uploaded reviews file from the working directory into a DataFrame.
df = pd.read_csv('reviews.csv')

In [None]:
# Write the frame back to reviews.csv without the index column.
# NOTE(review): no visible cell modifies `df` between the read and this write,
# so this looks like a no-op round trip — confirm before removing.
df.to_csv('reviews.csv', index=False)

In [None]:
from sklearn import metrics, neighbors
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

## TASK - 1: Tweak the models below and see results with different parameters and distance metrics.

def bow_knn(data_path='reviews.csv', n_neighbors=5, weights='uniform', metric='euclidean', cv=3):
    """Train and evaluate a K-Nearest-Neighbour classifier on bag-of-words features.

    Reads the reviews CSV, holds out 20% of the rows as a test set
    (``random_state=5``), vectorizes the "sentence" column with
    ``createBagOfWords`` (stop words removed, lemmatized), fits the classifier
    on the training part and prints the held-out accuracy followed by the
    cross-validation accuracy measured on the training part.

    The keyword arguments expose the settings the exercise asks you to vary;
    the defaults reproduce the original hard-coded configuration.

    Args:
        data_path: CSV file with "sentence" and "sentiment" columns.
        n_neighbors: Number of neighbours that vote on each prediction.
        weights: Neighbour weighting scheme passed to ``KNeighborsClassifier``.
        metric: Distance metric passed to ``KNeighborsClassifier``.
        cv: Number of cross-validation folds.

    Returns:
        tuple: ``(predicted, y_test)`` — predictions for the held-out rows and
        their true labels.
    """
    reviews = pd.read_csv(data_path)
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        reviews["sentence"], reviews["sentiment"], test_size=0.2, random_state=5)
    X_train, X_test = createBagOfWords(
        sentences_train, sentences_test, remove_stopwords=True, lemmatize=True, stemmer=False)

    knn = neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors, weights=weights, algorithm='auto', leaf_size=30, p=2,
        metric=metric, metric_params=None, n_jobs=1)
    knn.fit(X_train, y_train)

    predicted = knn.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print('KNN with BOW accuracy = ' + str(acc * 100) + '%')

    scores = cross_val_score(knn, X_train, y_train, cv=cv)
    print("Cross Validation Accuracy: %0.2f" % (scores.mean()))
    print(scores)
    print('\n')
    return predicted, y_test


def tfidf_knn(data_path='reviews.csv', n_neighbors=5, weights='distance', metric='cosine', cv=3):
    """Train and evaluate a K-Nearest-Neighbour classifier on TF-IDF features.

    Reads the reviews CSV, holds out 20% of the rows as a test set
    (``random_state=5``), vectorizes the "sentence" column with
    ``createTFIDF`` (stop words removed, lemmatized), fits a brute-force KNN
    classifier on the training part and prints the held-out accuracy followed
    by the cross-validation accuracy measured on the training part.

    The keyword arguments expose the settings the exercise asks you to vary;
    the defaults reproduce the original hard-coded configuration.

    Args:
        data_path: CSV file with "sentence" and "sentiment" columns.
        n_neighbors: Number of neighbours that vote on each prediction.
        weights: Neighbour weighting scheme passed to ``KNeighborsClassifier``.
        metric: Distance metric passed to ``KNeighborsClassifier``.
        cv: Number of cross-validation folds.

    Returns:
        tuple: ``(predicted, y_test)`` — predictions for the held-out rows and
        their true labels.
    """
    reviews = pd.read_csv(data_path)
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        reviews["sentence"], reviews["sentiment"], test_size=0.2, random_state=5)
    X_train, X_test = createTFIDF(
        sentences_train, sentences_test, remove_stopwords=True, lemmatize=True, stemmer=False)

    knn = neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors, weights=weights, algorithm='brute', leaf_size=30, p=2,
        metric=metric, metric_params=None, n_jobs=1)
    knn.fit(X_train, y_train)

    predicted = knn.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print('KNN with TFIDF accuracy = ' + str(acc * 100) + '%')

    scores = cross_val_score(knn, X_train, y_train, cv=cv)
    print("Cross Validation Accuracy: %0.2f" % (scores.mean()))
    print(scores)
    return predicted, y_test


In [None]:
## KNN accuracy after using BoW (reviews dataset)
# Prints the held-out and cross-validation accuracy; keeps the predictions and true labels.
predicted, y_test = bow_knn()

  soup = BeautifulSoup(text, "lxml")


KNN with BOW accuracy = 62.30366492146597%




Cross Validation Accuracy: 0.62
[0.60784314 0.58431373 0.66141732]




In [None]:
## KNN accuracy after using TFIDF (reviews dataset)
# Overwrites `predicted` / `y_test` from the BoW run with the TF-IDF results.
predicted, y_test = tfidf_knn()

  soup = BeautifulSoup(text, "lxml")


KNN with TFIDF accuracy = 70.15706806282722%
Cross Validation Accuracy: 0.73
[0.7254902  0.74117647 0.72834646]




In [None]:
# Upload the spam text data CSV file that has been shared with you. You can also download the file from https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
# Run this cell, click on the 'Choose files' button and upload the file.
from google.colab import files
# Opens Colab's file-upload widget. Per the recorded cell output the chosen file
# is saved as spam.csv in the working directory, where the later
# pd.read_csv('spam.csv') calls look for it.
uploaded = files.upload()

Saving spam.csv to spam.csv


In [None]:
import pandas as pd
# `error_bad_lines` is deprecated since pandas 1.3 (see the warning this cell used
# to emit) and removed in pandas 2.0. `on_bad_lines='warn'` is its replacement
# with the same effect: malformed rows are reported and skipped instead of
# aborting the read.
df = pd.read_csv('spam.csv', on_bad_lines='warn')
df



  df = pd.read_csv('spam.csv', error_bad_lines=False)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Encode the label column as integers (ham -> 0, spam -> 1). The identity entries
# make the cell safe to re-run: with only the string keys, a second run would map
# the already-encoded 0/1 values to NaN.
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [None]:
# Preview the first five rows to confirm the Category column is now numeric.
df.head(5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Number of messages (rows) in the spam dataset.
len(df)

5572

In [None]:
from sklearn import metrics, neighbors
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

## TASK - 2: Tweak the models below and see results with different parameters and distance metrics.

def bow_knn(data_path='spam.csv', n_neighbors=5, weights='uniform', metric='euclidean', cv=3):
    """Train and evaluate a K-Nearest-Neighbour spam classifier on bag-of-words features.

    Note: this definition replaces the earlier ``bow_knn`` that worked on the
    reviews dataset; after this cell runs, ``bow_knn()`` means the spam version.

    Reads the spam CSV, encodes "Category" as ham -> 0 / spam -> 1, holds out
    20% of the rows as a test set (``random_state=5``), vectorizes the
    "Message" column with ``createBagOfWords`` (stop words removed,
    lemmatized), fits the classifier on the training part and prints the
    held-out accuracy followed by the cross-validation accuracy measured on
    the training part.

    The keyword arguments expose the settings the exercise asks you to vary;
    the defaults reproduce the original hard-coded configuration.

    Args:
        data_path: CSV file with "Message" and "Category" columns.
        n_neighbors: Number of neighbours that vote on each prediction.
        weights: Neighbour weighting scheme passed to ``KNeighborsClassifier``.
        metric: Distance metric passed to ``KNeighborsClassifier``.
        cv: Number of cross-validation folds.

    Returns:
        tuple: ``(predicted, y_test)`` — predictions for the held-out rows and
        their true labels.
    """
    messages = pd.read_csv(data_path)
    messages['Category'] = messages['Category'].map({'ham': 0, 'spam': 1})
    text_train, text_test, y_train, y_test = train_test_split(
        messages["Message"], messages["Category"], test_size=0.2, random_state=5)
    X_train, X_test = createBagOfWords(
        text_train, text_test, remove_stopwords=True, lemmatize=True, stemmer=False)

    knn = neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors, weights=weights, algorithm='auto', leaf_size=30, p=2,
        metric=metric, metric_params=None, n_jobs=1)
    knn.fit(X_train, y_train)

    predicted = knn.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print('KNN with BOW accuracy = ' + str(acc * 100) + '%')

    scores = cross_val_score(knn, X_train, y_train, cv=cv)
    print("Cross Validation Accuracy: %0.2f" % (scores.mean()))
    print(scores)
    print('\n')
    return predicted, y_test


def tfidf_knn(data_path='spam.csv', n_neighbors=5, weights='distance', metric='cosine', cv=3):
    """Train and evaluate a K-Nearest-Neighbour spam classifier on TF-IDF features.

    Note: this definition replaces the earlier ``tfidf_knn`` that worked on the
    reviews dataset; after this cell runs, ``tfidf_knn()`` means the spam version.

    Reads the spam CSV, encodes "Category" as ham -> 0 / spam -> 1, holds out
    20% of the rows as a test set (``random_state=5``), vectorizes the
    "Message" column with ``createTFIDF`` (stop words removed, lemmatized),
    fits a brute-force KNN classifier on the training part and prints the
    held-out accuracy followed by the cross-validation accuracy measured on
    the training part.

    The keyword arguments expose the settings the exercise asks you to vary;
    the defaults reproduce the original hard-coded configuration.

    Args:
        data_path: CSV file with "Message" and "Category" columns.
        n_neighbors: Number of neighbours that vote on each prediction.
        weights: Neighbour weighting scheme passed to ``KNeighborsClassifier``.
        metric: Distance metric passed to ``KNeighborsClassifier``.
        cv: Number of cross-validation folds.

    Returns:
        tuple: ``(predicted, y_test)`` — predictions for the held-out rows and
        their true labels.
    """
    messages = pd.read_csv(data_path)
    messages['Category'] = messages['Category'].map({'ham': 0, 'spam': 1})
    text_train, text_test, y_train, y_test = train_test_split(
        messages["Message"], messages["Category"], test_size=0.2, random_state=5)
    X_train, X_test = createTFIDF(
        text_train, text_test, remove_stopwords=True, lemmatize=True, stemmer=False)

    knn = neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors, weights=weights, algorithm='brute', leaf_size=30, p=2,
        metric=metric, metric_params=None, n_jobs=1)
    knn.fit(X_train, y_train)

    predicted = knn.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print('KNN with TFIDF accuracy = ' + str(acc * 100) + '%')

    scores = cross_val_score(knn, X_train, y_train, cv=cv)
    print("Cross Validation Accuracy: %0.2f" % (scores.mean()))
    print(scores)
    return predicted, y_test

In [None]:
# KNN accuracy with BoW on the spam dataset. `bow_knn` here is the spam version
# defined in the preceding cell, which replaced the reviews version of the same name.
predicted, y_test = bow_knn()

  soup = BeautifulSoup(text, "lxml")


KNN with BOW accuracy = 92.19730941704036%
Cross Validation Accuracy: 0.91
[0.90713324 0.90040377 0.91245791]




In [None]:
# KNN accuracy with TF-IDF on the spam dataset. `tfidf_knn` here is the spam version
# defined earlier, which replaced the reviews version of the same name.
predicted, y_test = tfidf_knn()

  soup = BeautifulSoup(text, "lxml")


KNN with TFIDF accuracy = 98.56502242152466%
Cross Validation Accuracy: 0.97
[0.96837147 0.96769852 0.96363636]


### **Questions to Think About and Answer**
## 1. Why does the TF-IDF approach generally result in better accuracy than Bag-of-Words?
## ans:-
TF-IDF (Term Frequency-Inverse Document Frequency) and Bag-of-Words are both popular approaches in natural language processing, but they have different characteristics that make one potentially more effective than the other in certain scenarios.

1. **Consideration of Word Importance:**
   - **Bag-of-Words (BoW):** BoW represents a document as an unordered collection of words, ignoring the order and structure of words in the document. Each feature is simply the raw count of a word in that document, so every occurrence of every word carries the same weight.
   - **TF-IDF:** TF-IDF starts from the same counts but also takes into account how informative each word is. It weights a word in proportion to its frequency in the document (Term Frequency) and in inverse proportion to the number of documents in the dataset that contain it (Inverse Document Frequency).

2. **Handling of Common Words:**
   - **Bag-of-Words (BoW):** BoW tends to give equal importance to all words, including common words that may not carry much semantic meaning (e.g., "the," "and," "is").
   - **TF-IDF:** TF-IDF helps in mitigating the impact of common words by assigning lower weights to them. Words that are common across many documents have lower IDF values, resulting in lower overall importance.

3. **Normalization:**
   - **TF-IDF:** TF-IDF normalizes the importance of words by considering the term frequency and inverse document frequency. This normalization helps in handling variations in document lengths and overall term frequencies.

4. **Contextual Information:**
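   - **TF-IDF:** TF-IDF captures corpus-level information: the weight of a word depends not only on the document it appears in but also on how widely it is used across the entire dataset. (Like BoW, it still ignores word order, so this is not context in the sense of neighbouring words.)
   - **Bag-of-Words (BoW):** BoW lacks this corpus-level information, as it counts the words of each document independently of all other documents.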

5. **Sparse Representation:**
   - **TF-IDF:** The TF-IDF matrix is typically sparse, which means it is efficient in terms of memory usage and can handle high-dimensional data well.
   - **Bag-of-Words (BoW):** BoW representations can also be sparse, but TF-IDF tends to be more effective in capturing the discriminatory power of words.

While TF-IDF often outperforms Bag-of-Words in capturing word importance and providing a more nuanced representation of documents, the effectiveness of these methods depends on the specific task and dataset. In some cases, simpler representations like Bag-of-Words might be sufficient, especially when the focus is on capturing the overall frequency of words rather than their nuanced importance. Additionally, more advanced models like word embeddings and transformer-based models have gained popularity for their ability to capture semantic relationships and contextual information.

## 2. Can you think of techniques that are better than both BoW and TF-IDF?
## ans:-
Certainly! Advanced techniques that often outperform Bag-of-Words (BoW) and TF-IDF include:

1. **Word Embeddings:** Represent words as dense vectors capturing semantic relationships.
2. **Transformer-Based Models (e.g., BERT, GPT):** Leverage attention mechanisms for contextual understanding.
3. **Doc2Vec:** Represents entire documents with continuous vector embeddings.
4. **ELMo (Embeddings from Language Models):** Provides deep contextualized word representations.
5. **ULMFiT (Universal Language Model Fine-tuning):** Utilizes transfer learning for NLP tasks.
6. **Attention-Based Models:** Focus on different parts of the input sequence for improved performance.

## **3. Read about Stemming and Lemmatization from the resources given below. Think about the pros/cons of each.**
## ans:-
### Stemming and Lemmatization: Pros and Cons

**Stemming:**
- **Pros:**
  1. **Simplicity:** Stemming is a simpler process, stripping affixes to reduce words to their root form.
  2. **Computational Efficiency:** Stemming is computationally less intensive compared to lemmatization, making it faster.
  3. **Reduced Vocabulary Size:** Stemming can help in reducing the vocabulary size, potentially improving the efficiency of downstream NLP tasks.

- **Cons:**
  1. **Over-Stemming:** It may lead to over-stemming, where unrelated words are reduced to the same root, impacting semantic meaning.
  2. **Loss of Meaning:** Stemming can result in the loss of the original word's meaning, affecting the interpretability of the text.
  3. **Language-Specific:** Stemming algorithms are language-specific, and the effectiveness may vary across languages.

**Lemmatization:**
- **Pros:**
  1. **Preservation of Meaning:** Lemmatization retains the base or dictionary form of words, preserving their semantic meaning.
  2. **Better Accuracy:** Lemmatization generally produces more accurate results than stemming, especially in tasks requiring a deeper understanding of language.
  3. **Improved Interpretability:** Lemmatized text is often more interpretable as the words are mapped to their dictionary form.

- **Cons:**
  1. **Computational Complexity:** Lemmatization is computationally more intensive than stemming, potentially impacting processing time.
  2. **Increased Vocabulary Size:** Lemmatization may result in a larger vocabulary size compared to stemming, potentially affecting memory and computational requirements.
  3. **Language-Specific:** Like stemming, lemmatization can also be language-dependent and may require language-specific resources.

**General Considerations:**
- **Task Dependency:** The choice between stemming and lemmatization depends on the specific NLP task. For information retrieval tasks, where speed is crucial, stemming might be preferred. For tasks requiring semantic accuracy, lemmatization is often more suitable.
- **Language and Domain:** The effectiveness of both techniques can vary based on the language of the text and the specific domain of the data. Some languages or domains may benefit more from one approach over the other.

### **Questions to Think About and Answer**
## 1. Why does the TF-IDF approach generally result in better accuracy than Bag-of-Words?
## ans:-
TF-IDF (Term Frequency-Inverse Document Frequency) and Bag-of-Words are both popular approaches in natural language processing, but they have different characteristics that make one potentially more effective than the other in certain scenarios.

1. **Consideration of Word Importance:**
   - **Bag-of-Words (BoW):** BoW represents a document as an unordered collection of words, ignoring the order and structure of words in the document. Each feature is simply the raw count of a word in that document, so every occurrence of every word carries the same weight.
   - **TF-IDF:** TF-IDF starts from the same counts but also takes into account how informative each word is. It weights a word in proportion to its frequency in the document (Term Frequency) and in inverse proportion to the number of documents in the dataset that contain it (Inverse Document Frequency).

2. **Handling of Common Words:**
   - **Bag-of-Words (BoW):** BoW tends to give equal importance to all words, including common words that may not carry much semantic meaning (e.g., "the," "and," "is").
   - **TF-IDF:** TF-IDF helps in mitigating the impact of common words by assigning lower weights to them. Words that are common across many documents have lower IDF values, resulting in lower overall importance.

3. **Normalization:**
   - **TF-IDF:** TF-IDF normalizes the importance of words by considering the term frequency and inverse document frequency. This normalization helps in handling variations in document lengths and overall term frequencies.

4. **Contextual Information:**
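   - **TF-IDF:** TF-IDF captures corpus-level information: the weight of a word depends not only on the document it appears in but also on how widely it is used across the entire dataset. (Like BoW, it still ignores word order, so this is not context in the sense of neighbouring words.)
   - **Bag-of-Words (BoW):** BoW lacks this corpus-level information, as it counts the words of each document independently of all other documents.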

5. **Sparse Representation:**
   - **TF-IDF:** The TF-IDF matrix is typically sparse, which means it is efficient in terms of memory usage and can handle high-dimensional data well.
   - **Bag-of-Words (BoW):** BoW representations can also be sparse, but TF-IDF tends to be more effective in capturing the discriminatory power of words.

While TF-IDF often outperforms Bag-of-Words in capturing word importance and providing a more nuanced representation of documents, the effectiveness of these methods depends on the specific task and dataset. In some cases, simpler representations like Bag-of-Words might be sufficient, especially when the focus is on capturing the overall frequency of words rather than their nuanced importance. Additionally, more advanced models like word embeddings and transformer-based models have gained popularity for their ability to capture semantic relationships and contextual information.

## 2. Can you think of techniques that are better than both BoW and TF-IDF?
## ans:-
Certainly! Advanced techniques that often outperform Bag-of-Words (BoW) and TF-IDF include:

1. **Word Embeddings:** Represent words as dense vectors capturing semantic relationships.
2. **Transformer-Based Models (e.g., BERT, GPT):** Leverage attention mechanisms for contextual understanding.
3. **Doc2Vec:** Represents entire documents with continuous vector embeddings.
4. **ELMo (Embeddings from Language Models):** Provides deep contextualized word representations.
5. **ULMFiT (Universal Language Model Fine-tuning):** Utilizes transfer learning for NLP tasks.
6. **Attention-Based Models:** Focus on different parts of the input sequence for improved performance.

## **3. Read about Stemming and Lemmatization from the resources given below. Think about the pros/cons of each.**
## ans:-
### Stemming and Lemmatization: Pros and Cons

**Stemming:**
- **Pros:**
  1. **Simplicity:** Stemming is a simpler process, stripping affixes to reduce words to their root form.
  2. **Computational Efficiency:** Stemming is computationally less intensive compared to lemmatization, making it faster.
  3. **Reduced Vocabulary Size:** Stemming can help in reducing the vocabulary size, potentially improving the efficiency of downstream NLP tasks.

- **Cons:**
  1. **Over-Stemming:** It may lead to over-stemming, where unrelated words are reduced to the same root, impacting semantic meaning.
  2. **Loss of Meaning:** Stemming can result in the loss of the original word's meaning, affecting the interpretability of the text.
  3. **Language-Specific:** Stemming algorithms are language-specific, and the effectiveness may vary across languages.

**Lemmatization:**
- **Pros:**
  1. **Preservation of Meaning:** Lemmatization retains the base or dictionary form of words, preserving their semantic meaning.
  2. **Better Accuracy:** Lemmatization generally produces more accurate results than stemming, especially in tasks requiring a deeper understanding of language.
  3. **Improved Interpretability:** Lemmatized text is often more interpretable as the words are mapped to their dictionary form.

- **Cons:**
  1. **Computational Complexity:** Lemmatization is computationally more intensive than stemming, potentially impacting processing time.
  2. **Increased Vocabulary Size:** Lemmatization may result in a larger vocabulary size compared to stemming, potentially affecting memory and computational requirements.
  3. **Language-Specific:** Like stemming, lemmatization can also be language-dependent and may require language-specific resources.

**General Considerations:**
- **Task Dependency:** The choice between stemming and lemmatization depends on the specific NLP task. For information retrieval tasks, where speed is crucial, stemming might be preferred. For tasks requiring semantic accuracy, lemmatization is often more suitable.
- **Language and Domain:** The effectiveness of both techniques can vary based on the language of the text and the specific domain of the data. Some languages or domains may benefit more from one approach over the other.