<a href="https://colab.research.google.com/github/jyotidabass/NLP-Projects/blob/main/NLP_Projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project 1: Text Classification**

In [2]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load data
train_data = ["This is a positive review.", "This is a negative review."]
train_labels = [1, 0]

# Tokenize text
nltk.download('punkt')
tokenized_train_data = [word_tokenize(text) for text in train_data]

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, train_labels, test_size=0.2, random_state=42)

# Train Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Make predictions on test data
y_pred = clf.predict(X_test)

# Evaluate model
print("Accuracy:", clf.score(X_test, y_test))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Accuracy: 0.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Project 2: Sentiment Analysis**

In [4]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load data
text = "I love this product! It's amazing."

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Analyze sentiment
sentiment = sia.polarity_scores(text)

# Print sentiment scores
print("Positive sentiment:", sentiment['pos'])
print("Negative sentiment:", sentiment['neg'])
print("Neutral sentiment:", sentiment['neu'])
print("Compound sentiment:", sentiment['compound'])

Positive sentiment: 0.734
Negative sentiment: 0.0
Neutral sentiment: 0.266
Compound sentiment: 0.8516


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# **Project 3: Named Entity Recognition**

In [5]:
import spacy

# Load data
text = "Apple is a technology company based in California."

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Process text
doc = nlp(text)

# Extract named entities
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Print entities
print("Named entities:", entities)

Named entities: [('Apple', 'ORG'), ('California', 'GPE')]


**Project 4: Language Modeling**

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

# Load data
text = "This is a sample text."

# Tokenize text
tokens = text.split()

# Create vocabulary mapping
word_to_index = {token: index for index, token in enumerate(set(tokens))}
# Map tokens to indices
indexed_tokens = [word_to_index[token] for token in tokens]

# Create language model
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.fc(output[:, -1, :])
        return output

# Initialize model and optimizer
model = LanguageModel(len(word_to_index), 128, 128)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train model
for epoch in range(10):
    optimizer.zero_grad()
    # Pass indexed tokens as input to the model
    output = model(torch.tensor([indexed_tokens]))
    # Use the index of the last token as the target
    loss = criterion(output, torch.tensor([indexed_tokens[-1]]))
    loss.backward()
    optimizer.step()
    print("Epoch:", epoch, "Loss:", loss.item())

Epoch: 0 Loss: 1.6580555438995361
Epoch: 1 Loss: 0.9252477288246155
Epoch: 2 Loss: 0.4701130986213684
Epoch: 3 Loss: 0.23719075322151184
Epoch: 4 Loss: 0.12714453041553497
Epoch: 5 Loss: 0.07412440329790115
Epoch: 6 Loss: 0.0468980111181736
Epoch: 7 Loss: 0.031855370849370956
Epoch: 8 Loss: 0.02295481227338314
Epoch: 9 Loss: 0.01736520044505596


# **Project 5: Text Generation**

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

# Load data
text = "This is a sample text."

# Tokenize text
tokens = text.split()

# Create a mapping from words to indices and vice-versa
word_to_index = {token: index for index, token in enumerate(tokens)}
index_to_word = {index: token for token, index in word_to_index.items()}

# Create text generator
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TextGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.fc(output[:, -1, :])
        return output

# Initialize model and optimizer
vocab_size = len(tokens)  # or len(word_to_index), they are the same
model = TextGenerator(vocab_size, 128, 128)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train model
for epoch in range(10):
    optimizer.zero_grad()

    # Convert tokens to numerical indices before feeding to the model
    input_indices = torch.tensor([[word_to_index[token] for token in tokens]])

    output = model(input_indices)

    # Use the index of the last token as the target
    target_index = torch.tensor([word_to_index[tokens[-1]]])

    loss = criterion(output, target_index)
    loss.backward()
    optimizer.step()
    print("Epoch:", epoch, "Loss:", loss.item())

# Generate text
def generate_text(model, start_token, max_length):
    # Get the index of the start token
    start_token_index = word_to_index[start_token]

    tokens_indices = [start_token_index]
    for i in range(max_length):
        output = model(torch.tensor([tokens_indices]))
        token_index = torch.argmax(output).item()
        tokens_indices.append(token_index)

    # Convert indices back to words
    generated_tokens = [index_to_word[index] for index in tokens_indices]

    return generated_tokens

print("Generated text:", generate_text(model, tokens[0], 10))

Epoch: 0 Loss: 1.68317711353302
Epoch: 1 Loss: 1.013279676437378
Epoch: 2 Loss: 0.5608100295066833
Epoch: 3 Loss: 0.30185121297836304
Epoch: 4 Loss: 0.16795217990875244
Epoch: 5 Loss: 0.09989055246114731
Epoch: 6 Loss: 0.0639154389500618
Epoch: 7 Loss: 0.04370548203587532
Epoch: 8 Loss: 0.03161435201764107
Epoch: 9 Loss: 0.023955313488841057
Generated text: ['This', 'sample', 'text.', 'text.', 'text.', 'text.', 'text.', 'text.', 'text.', 'text.', 'text.']
