In [2]:
!pip install nltk gensim scikit-learn contractions


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim.downloader as api
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fetch the NLTK data packages used below (tokenizer models, WordNet, stop-word list).
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Read the tweets, keep only the label and text columns, and give them shorter names.
df = pd.read_csv("Tweets.csv")[['airline_sentiment', 'text']].rename(
    columns={'airline_sentiment': 'sentiment', 'text': 'tweet'}
)

# Shared helpers for the text-cleaning step.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def clean_text(text):
    """Normalize a raw tweet and return its list of lemmatized tokens.

    Steps, in order: lowercase, expand contractions, strip URLs and
    @mentions, drop the ``#`` sign (the hashtag word itself is kept), remove
    every character that is not an ASCII letter or whitespace, tokenize,
    discard English stop words, and lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example ``None`` or the
        NaN that stands in for a missing cell) is treated as an empty tweet.

    Returns
    -------
    list of str
        Cleaned tokens; empty when nothing usable is left.
    """
    # Guard against missing values: calling .lower() on a float NaN would raise.
    if not isinstance(text, str):
        return []

    # Lowercase
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Remove URLs ("http\S+" already covers https links), mentions and '#' signs,
    # then everything that is not a letter or whitespace (digits, punctuation, emojis).
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, remove stopwords, and lemmatize
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Tokenize every tweet; each row of the new column holds a list of cleaned tokens.
df['tokens'] = df['tweet'].map(clean_text)


In [1]:
import gensim.downloader as api

# Pre-trained Google News Word2Vec embeddings; the first call downloads them
# and later calls reuse the cached copy.
W2V_MODEL_NAME = "word2vec-google-news-300"
model = api.load(W2V_MODEL_NAME)



In [5]:
def vectorize(tokens, model, dim=None):
    """Average the embeddings of ``tokens`` into a single fixed-size vector.

    Parameters
    ----------
    tokens : iterable of str
        Words to look up; words missing from ``model`` are skipped.
    model : mapping-like
        Embedding lookup supporting ``word in model`` and ``model[word]``.
    dim : int, optional
        Length of the zero vector returned when no token is found in
        ``model``. When omitted it is taken from ``model.vector_size`` if the
        model exposes that attribute, otherwise it falls back to 100 (the
        previous default).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found word vectors, or ``np.zeros(dim)``
        when none of the tokens is in the model.
    """
    # Infer the fallback size from the model so the zero vector always matches
    # the real embeddings, whichever pre-trained model is loaded.
    if dim is None:
        dim = getattr(model, "vector_size", 100)

    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Embed every tweet. The fallback size is read from the loaded model instead of
# being hard-coded, so it stays correct if a different embedding model is used.
df['vector'] = df['tokens'].apply(lambda x: vectorize(x, model, dim=model.vector_size))

# Stack the per-tweet vectors into one 2-D feature matrix; labels are the sentiment column.
X = np.vstack(df['vector'].values)
y = df['sentiment']


In [10]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the default already fits a multinomial model for
# a multi-class target with the default solver, so the fit is unchanged.
clf = LogisticRegression(max_iter=100)
clf.fit(X_train, y_train)

# Report accuracy on the held-out tweets.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")




Test Accuracy: 0.7753


In [7]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of a single raw tweet.

    Parameters
    ----------
    model : fitted classifier
        Object with a ``predict`` method, trained on averaged word vectors.
    w2v_model : mapping-like
        Word-embedding lookup passed on to ``vectorize``.
    tweet : str
        Raw tweet text.

    Returns
    -------
    The first (and only) element of ``model.predict`` for this tweet.
    """
    tokens = clean_text(tweet)

    # Pass the embedding size explicitly. Otherwise a tweet with no in-vocabulary
    # token would get a zero vector of vectorize's default length, which may not
    # match the number of features the classifier was trained on.
    dim = getattr(w2v_model, "vector_size", None)
    if dim is None:
        dim = getattr(model, "n_features_in_", 100)

    vector = vectorize(tokens, w2v_model, dim=dim).reshape(1, -1)
    return model.predict(vector)[0]


In [8]:
# Quick sanity check of the whole pipeline on a hand-written tweet.
sample_tweet = "I had a great experience with Southwest Airlines today!"
predicted_sentiment = predict_tweet_sentiment(clf, model, sample_tweet)
print("Predicted Sentiment:", predicted_sentiment)


Predicted Sentiment: positive
