    SENTIMENT ANALYSIS USING NLTK

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon only when it is missing from the local NLTK data
# path, so repeated runs do not invoke the downloader again.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')


# Data cleaning and preprocessing
# NOTE(review): `df` is not created anywhere in the visible code; presumably
# it is loaded by an earlier cell -- confirm before running this cell alone.
df['Text'] = df['Text'].str.lower()  # Convert text to lowercase
# Remove punctuation (anything that is neither a word character nor
# whitespace). `regex=True` is spelled out because pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text
# unchanged; the raw string avoids invalid-escape warnings for \w and \s.
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)
df = df[['Text', 'Language']].dropna()  # Keep only relevant columns and drop rows with missing values

# Sentiment analysis using NLTK's SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()


def get_sentiment(text, threshold=0.0):
    """Label ``text`` as 'positive', 'negative' or 'neutral'.

    The label is derived from the 'compound' entry of
    ``sid.polarity_scores(text)``.

    Args:
        text: String to score with the module-level analyzer ``sid``.
        threshold: Half-width of the neutral band around zero. Scores
            strictly above ``threshold`` are 'positive', scores strictly
            below ``-threshold`` are 'negative', everything else is
            'neutral'. The default of 0.0 keeps the original behaviour
            (only an exactly-zero score is neutral). The VADER authors
            suggest a band of 0.05.

    Returns:
        One of the strings 'positive', 'negative' or 'neutral'.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound > threshold:
        return 'positive'
    if compound < -threshold:
        return 'negative'
    return 'neutral'


# NOTE(review): VADER uses capitalisation and punctuation (e.g. "!") as
# intensity cues, but `df['Text']` has already been lowercased by the cleaning
# step by the time it is scored here -- confirm that scoring the cleaned text
# rather than the raw text is intended.
df['Sentiment'] = df['Text'].apply(get_sentiment)

# Features are the text column; the target is the language label.
X, y = df['Text'], df['Language']

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Integer-encode the labels. The encoder is fitted on the train labels
# followed by the test labels, so it knows every label and the encoded array
# can be cut back apart at the train length.
le = LabelEncoder()
combined_labels = pd.concat([y_train, y_test])
combined_labels_encoded = le.fit_transform(combined_labels)
n_train = len(y_train)
y_train_encoded, y_test_encoded = (
    combined_labels_encoded[:n_train],
    combined_labels_encoded[n_train:],
)

# TF-IDF features: the vectorizer is fitted on the training text only and
# then reused, unchanged, on the test text.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit a multinomial Naive Bayes model on the encoded training labels.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train_encoded)

# Predict on the held-out rows and map the integer predictions back to the
# original label values.
predictions = nb_classifier.predict(X_test_tfidf)
predictions_decoded = le.inverse_transform(predictions)

# Per-class precision / recall / F1 against the original (unencoded) labels.
print(classification_report(y_true=y_test, y_pred=predictions_decoded))

    OUTPUT  

- Accuracy: 0.021897810218978103
- Precision: 0.022483650272244447
- Recall: 0.021897810218978103
- F1-Score: 0.02039687434090701
- Average Silhouette Score: 0.004

---