In [5]:
pip install pandas numpy nltk scikit-learn seaborn matplotlib



In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [10]:
# Fetch the NLTK resources used later in the notebook: the stop-word lists and
# the Punkt tokenizer models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from the separate 'punkt_tab'
# resource; without it word_tokenize raises LookupError on a fresh install.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Read the tweets CSV into a DataFrame and print its first five rows as a quick
# sanity check of the columns that were loaded.
file_path = "/content/Tweets.csv"
data = pd.read_csv(filepath_or_buffer=file_path)
print(data.head(n=5))

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence  ...  \
0            NaN                        NaN  ...   
1            NaN                     0.0000  ...   
2            NaN                        NaN  ...   
3     Bad Flight                     0.7033  ...   
4     Can't Tell                     1.0000  ...   

                                                text tweet_coord  \
0                @VirginAmerica What @dhepburn said.         NaN   
1  @VirginAmerica plus you've added commercials t...         NaN   
2  @VirginAmerica I didn't today... Must mea

In [12]:
# Cache of stop-word sets keyed by language name, filled on first use.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def preprocess_text(text):
    """Normalize one piece of text for bag-of-words vectorization.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric, or that are English stop words, are dropped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).
    """
    # Look the stop words up once per call from the cached set, instead of
    # re-reading the corpus list for every token as a list-membership test.
    stop_words = _stopword_set('english')
    tokens = word_tokenize(text.lower())
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Overwrite the 'text' column with its cleaned form (one preprocess_text call
# per row), then print the first five rows to inspect the result.
data['text'] = data['text'].map(preprocess_text)
print(data.head(n=5))

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence  ...  \
0            NaN                        NaN  ...   
1            NaN                     0.0000  ...   
2            NaN                        NaN  ...   
3     Bad Flight                     0.7033  ...   
4     Can't Tell                     1.0000  ...   

                                                text tweet_coord  \
0                        virginamerica dhepburn said         NaN   
1  virginamerica plus added commercials experienc...         NaN   
2  virginamerica today must mean need take a

In [13]:
# Features are the tweet text column; the target is the sentiment label column.
X, y = data['text'], data['airline_sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training text only, then encode both splits
# as token-count matrices using that same vocabulary.
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [14]:
# Train a multinomial Naive Bayes classifier on the vectorized training tweets.
model = MultinomialNB()
model.fit(X=X_train_vect, y=y_train)

In [15]:
# Predict a sentiment label for every tweet in the held-out split.
y_pred = model.predict(X=X_test_vect)

In [16]:
# Report overall accuracy, per-class precision/recall/F1, and the raw
# confusion matrix for the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.7482250136537412
Classification Report:
               precision    recall  f1-score   support

    negative       0.75      0.94      0.84      1026
     neutral       0.71      0.42      0.53       455
    positive       0.76      0.62      0.68       350

    accuracy                           0.75      1831
   macro avg       0.74      0.66      0.68      1831
weighted avg       0.74      0.75      0.73      1831

Confusion Matrix:
 [[961  44  21]
 [213 193  49]
 [101  33 216]]


In [18]:
# Plot the confusion matrix as an annotated heatmap.
# The matrix is built with an explicit label order taken from the fitted model,
# so its rows/columns are guaranteed to line up with the tick labels below
# (otherwise the order is inferred from the labels present in y_test/y_pred).
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Draw on an explicit Axes rather than the implicit pyplot "current" figure.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,   # write the count inside each cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()