    SENTIMENT ANALYSIS USING NLTK

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon that SentimentIntensityAnalyzer needs. nltk leaves
# an up-to-date package in place; quiet=True keeps the download log out of the
# cell output.
nltk.download('vader_lexicon', quiet=True)


# Data cleaning and preprocessing
# NOTE(review): `df` is not created in this cell - presumably an earlier cell
# loads it; confirm the notebook still works after Restart Kernel -> Run All.
df['Text'] = df['Text'].str.lower()  # Convert text to lowercase
# Remove punctuation: drop every character that is neither a word character nor
# whitespace. The pattern is a raw string and regex=True is passed explicitly
# because newer pandas versions default to regex=False, which would match the
# pattern literally and silently leave the punctuation in place.
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)
# Keep only relevant columns and drop rows with missing values. .copy() makes
# the result an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[['Text', 'Language']].dropna().copy()

# Sentiment analysis using NLTK's SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Classify `text` as 'positive', 'negative' or 'neutral'.

    The label is decided by the 'compound' entry of the scores returned by
    `sid.polarity_scores(text)`: a value above zero gives 'positive', a value
    below zero gives 'negative', and anything else gives 'neutral'.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    return 'neutral'

# Label every row with the polarity class returned by get_sentiment.
# NOTE(review): the 'Sentiment' column is not used by the model below, which
# predicts 'Language' - confirm which target is intended.
df['Sentiment'] = df['Text'].apply(get_sentiment)

# Select the feature (text) and target (language) columns
X = df['Text']
y = df['Language']

# Split the dataset into a training set and a testing set (20% held out);
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine the training and test sets for label encoding, so that a label which
# occurs in only one of the two splits still receives an integer code.
combined_labels = pd.concat([y_train, y_test])
le = LabelEncoder()
combined_labels_encoded = le.fit_transform(combined_labels)

# Split the label-encoded data back into training and test sets. The slices
# rely on the concat order above: training labels first, then test labels.
y_train_encoded = combined_labels_encoded[:len(y_train)]
y_test_encoded = combined_labels_encoded[len(y_train):]

# Vectorize the text data using TfidfVectorizer. The vectorizer is fitted on
# the training texts only and then applied unchanged to the test texts.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train_encoded)

# Make predictions on the testing set
predictions = nb_classifier.predict(X_test_tfidf)

# Decode the predictions back to the original language labels
predictions_decoded = le.inverse_transform(predictions)

# Evaluate the model. zero_division=0 reports 0.0 for labels that are never
# predicted - the same value the default produces - without emitting an
# UndefinedMetricWarning for each such label.
print(classification_report(y_test, predictions_decoded, zero_division=0))

    OUTPUT  

Accuracy: 0.021897810218978103
Precision: 0.022483650272244447
Recall: 0.021897810218978103
F1-Score: 0.02039687434090701
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

KMeans
KMeans(n_clusters=5, random_state=42)
Silhouette Score: 0.0037966887340400513
<Figure size 640x480 with 1 Axes>
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
<Figure size 1000x500 with 2 Axes>
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
Cluster 0:
22       @paulgp leverage chatgpt…you will be surprised...
150      saved from chatgpt... before it changes writin...
166      can you take new ai tech #chatgpt seriously if...
194      @shillin_villian yo, have you tried @quickieai...
242      @openai using chatgpt feels like you are chatt...
                               ...                        
49788    @zomato \ncontent for you ..\n\nme: best place...
49875    content for you @zomato \n\nme: best place to ...
49878    @darma15206038 @budgieofdespair @brianferoldi ...
49921    @jobartleynz suggest you get chatgpt to write ...
49967    @robhope you know you've made it when chatgpt ...
Name: Text, Length: 1487, dtype: object

Cluster 1:
65       @shillin_villian learn chat gpt means learn te...
92                 @shillin_villian chat gpt is the future
213            @kantrowitz he's being replaced by chat gpt
227        @jiei_yushi chat gpt is not that intelligent :p
237      @chatgptbot @iivdfo ليه بدون كومنت ؟\nناقص تقي...
                               ...                        
49847    @gloomydays1984 به تو علاقه داره 🤗\nchat gpt ر...
49872    @mededdoc @bretweinstein @elonmusk crap attrac...
49926    he : chalo sat me assignment karenge romantic ...
49945    @ashutosh83b i'd prefer chat gpt writing your ...
...
49994    @fedemoctezuma 5 cosas que puedes hacer  con c...
Name: Text, Length: 1433, dtype: object

Average Silhouette Score: 0.004
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/preprocessing/_encoders.py:868: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
  warnings.warn(
Accuracy:  0.5428457154284572
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/student/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
/var/folders/j_/pkfzpv5x4xv9mb1gxwdtxsrr0000gn/T/ipykernel_1510/4255330702.py:16: FutureWarning: The default value of regex will change from True to False in a future version.
  df['Text'] = df['Text'].str.replace('[^\w\s]', '')  # Remove punctuation
              precision    recall  f1-score   support

          ar       1.00      0.52      0.69        82
          bn       0.00      0.00      0.00         1
          ca       0.00      0.00      0.00        29
          cs       0.00      0.00      0.00         8
          cy       0.00      0.00      0.00        16
          da       0.00      0.00      0.00        10
          de       1.00      0.85      0.92       234
          el       0.00      0.00      0.00        11
          en       0.79      1.00      0.88      6428
          es       0.92      0.96      0.94       666
          et       0.00      0.00      0.00         4
          fa       1.00      0.55      0.71        29
          fi       0.00      0.00      0.00        17
          fr       0.99      0.89      0.94       491
          hi       0.00      0.00      0.00        13
          ht       0.00      0.00      0.00         4
          hu       0.00      0.00      0.00         4
          in       0.00      0.00      0.00        48
          it       1.00      0.36      0.53        99
          iw       1.00      0.03      0.06        31
          ja       0.90      0.18      0.30       989
          ko       0.00      0.00      0.00        27
          lt       0.00      0.00      0.00         1
...
    accuracy                           0.82     10001
   macro avg       0.24      0.14      0.16     10001
weighted avg       0.79      0.82      0.77     10001

Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/student/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))