In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used later for tokenizing, lemmatizing and
# stop-word filtering (same packages, same order as before).
for resource in ("punkt", "wordnet", "stopwords"):
    nltk.download(resource)

# Load the airline tweets, keeping only the sentiment label and the tweet text.
df = pd.read_csv(
    "/kaggle/input/twitter-airline-sentiment/Tweets.csv"
)[['airline_sentiment', 'text']]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Module-level helpers shared by clean_tweet: the English stop-word list
# (held in a set for fast membership tests) and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_tweet(text):
    """Normalize a raw tweet into a list of lemmatized content tokens.

    The tweet is lower-cased, then URLs (``http...``), @-mentions, hashtags
    and every character that is not ``a-z`` or whitespace are deleted (they
    are removed outright, not replaced by a space). The remainder is split
    with ``word_tokenize``; stop words and non-alphabetic tokens are dropped
    and each surviving token is lemmatized.

    Args:
        text: the tweet as a string.

    Returns:
        A list of cleaned tokens in their original order (possibly empty).
    """
    lowered = text.lower()
    stripped = re.sub(r"http\S+|@\S+|#\S+|[^a-z\s]", "", lowered)

    kept = []
    for token in word_tokenize(stripped):
        # The stop-word test is applied to the raw token, before lemmatizing.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Load the word2vec embeddings stored in binary format at this path.
w2v = KeyedVectors.load_word2vec_format(
    "/kaggle/input/gnewsvector/GoogleNews-vectors-negative300.bin",
    binary=True,
)

def vectorize_tweet(text):
    """Embed a tweet as the mean of its in-vocabulary word vectors.

    The tweet is cleaned with ``clean_tweet``; every resulting token that is
    present in the module-level ``w2v`` model contributes its vector, and the
    element-wise mean of those vectors is returned.

    Args:
        text: the raw tweet string.

    Returns:
        A 1-D numpy array of length ``w2v.vector_size``. When no token of the
        tweet is in the vocabulary, an all-zero vector is returned instead.
    """
    words = clean_tweet(text)
    vectors = [w2v[w] for w in words if w in w2v]
    if not vectors:
        # Size the fallback from the loaded model rather than a literal 300,
        # so a different embedding file cannot yield rows of mismatched length.
        return np.zeros(w2v.vector_size)
    return np.mean(vectors, axis=0)

# Feature matrix: one averaged embedding per tweet, in dataframe row order.
X = np.array(list(map(vectorize_tweet, df['text'])))

# Integer-encode the sentiment labels for the classifier.
y = (
    df['airline_sentiment']
    .map({'negative': 0, 'neutral': 1, 'positive': 2})
    .values
)

In [3]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the default lbfgs solver a target
# with three classes is already fitted as a multinomial model, so the
# resulting classifier is the same without the deprecation warning.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Report plain accuracy on the held-out tweets.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.7783469945355191


In [4]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of a single raw tweet.

    The tweet is cleaned with ``clean_tweet``, embedded as the mean of the
    vectors of its tokens found in ``w2v_model``, and passed to
    ``model.predict`` as a single-row 2-D array.

    Args:
        model: fitted classifier exposing ``predict`` and returning the
            integer codes 0, 1 or 2.
        w2v_model: word-vector lookup supporting ``word in w2v_model`` and
            ``w2v_model[word]``.
        tweet: the raw tweet string.

    Returns:
        ``'negative'``, ``'neutral'`` or ``'positive'``.

    Raises:
        KeyError: if the model predicts a code other than 0, 1 or 2.
    """
    words = clean_tweet(tweet)
    vectors = [w2v_model[w] for w in words if w in w2v_model]
    if vectors:
        vec = np.mean(vectors, axis=0)
    else:
        # No known word: fall back to a zero vector whose length follows the
        # embedding model actually passed in. Objects without a `vector_size`
        # attribute keep the previous default of 300.
        vec = np.zeros(getattr(w2v_model, "vector_size", 300))
    label = model.predict(vec.reshape(1, -1))[0]
    return {0: 'negative', 1: 'neutral', 2: 'positive'}[label]