## Problem 1

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Fetch the NLTK English stop-word corpus that preprocess() reads via
# stopwords.words('english'); NLTK reports "already up-to-date" when it is cached.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\TUF
[nltk_data]     GAMING\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Preprocessing function using RegexpTokenizer to avoid punkt error
tokenizer = RegexpTokenizer(r'\w+')

# Built once at definition time: reading the stop-word corpus inside preprocess()
# would repeat the same work for every single message.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Lower-case a message and return its content tokens.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Tokens matched by ``\\w+`` on the lower-cased text, keeping only those
        that are purely alphabetic and are not English stop words.
    """
    tokens = tokenizer.tokenize(text.lower())
    # isalpha() drops tokens containing digits or underscores, which \w+ lets through.
    return [tok for tok in tokens if tok.isalpha() and tok not in STOP_WORDS]

In [3]:
# Load Google News Word2Vec via gensim-data
# api.load() looks the pretrained vectors up by name; the progress messages
# bracket the call because it may take a while.
print("Loading Word2Vec model (this may take a while)...")
w2v_model = api.load("word2vec-google-news-300")
# Embedding dimensionality; passed to vectorize_sentence() so that messages with
# no known words get a zero vector of matching length.
vector_size = w2v_model.vector_size
print("Word2Vec model loaded.")

Loading Word2Vec model (this may take a while)...
Word2Vec model loaded.


In [4]:
# Convert tokens to mean vector
def vectorize_sentence(tokens, model, vector_size=300):
    """Average the embeddings of the tokens that the model knows.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.
    model : mapping-like
        Supports ``token in model`` and ``model[token]`` (returns a vector).
    vector_size : int, default 300
        Length of the zero vector returned when no token is in the model.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found vectors, or zeros of length
        ``vector_size`` when none were found.
    """
    embeddings = []
    for word in tokens:
        if word in model:
            embeddings.append(model[word])

    # Nothing recognised: fall back to an all-zero vector.
    if len(embeddings) == 0:
        return np.zeros(vector_size)

    return np.mean(embeddings, axis=0)

In [5]:
# Read the SMS Spam dataset from spam.csv (expected in the same directory).
# Only columns 'v1' and 'v2' are kept, renamed to 'Label' and 'Message'.
print("Loading dataset...")
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)
print(f"Loaded {len(df)} messages.")


Loading dataset...
Loaded 5572 messages.


In [6]:
# Build the feature matrix X (one averaged embedding per message, stacked row-wise)
# and the target vector y (ham -> 0, spam -> 1).
print("Preprocessing & vectorizing...")
X = np.vstack([
    vectorize_sentence(preprocess(message), w2v_model, vector_size)
    for message in df['Message']
])
y = df['Label'].map({'ham': 0, 'spam': 1}).to_numpy()

Preprocessing & vectorizing...


In [7]:
# Hold out 20% of the messages for testing; stratify so that both splits keep
# the same ham/spam proportions, and fix the seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

acc = accuracy_score(y_test, clf.predict(X_test))
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 0.9444


In [8]:
def predict_message_class(classifier, w2v_model, message):
    """Classify a single message as 'spam' or 'ham'.

    Parameters
    ----------
    classifier : fitted estimator
        Object with a ``predict`` method, trained on averaged embeddings with
        the encoding ham=0 / spam=1.
    w2v_model : embedding model
        Provides ``vector_size`` and token lookup for ``vectorize_sentence``.
    message : str
        Raw message text.

    Returns
    -------
    str
        ``'spam'`` when the predicted class is 1, otherwise ``'ham'``.
    """
    tokens = preprocess(message)
    # Take the dimensionality from the model that was passed in instead of the
    # notebook-level `vector_size`, so the zero-vector fallback always matches
    # the embeddings actually being used.
    vec = vectorize_sentence(tokens, w2v_model, w2v_model.vector_size)
    # predict() expects a 2-D array: one row, one column per embedding dimension.
    pred = classifier.predict(vec.reshape(1, -1))[0]
    return 'spam' if pred == 1 else 'ham'

In [9]:
# Demo: classify one hand-written message with the trained model
if __name__ == '__main__':
    msg = "Congratulations! You've won a free ticket. Reply now!"
    verdict = predict_message_class(clf, w2v_model, msg)
    print(msg, "->", verdict)

Congratulations! You've won a free ticket. Reply now! -> spam


## Problem 2

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import contractions

# NLTK data needed below: the stop-word list plus the WordNet corpora
# requested for the lemmatizer.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared preprocessing helpers used by preprocess_tweet()
tokenizer = RegexpTokenizer(r'\w+')                 # splits text into runs of word characters
lemmatizer = WordNetLemmatizer()                    # lemmatizes each kept token
stop_words = set(stopwords.words('english'))        # set gives fast membership tests

def preprocess_tweet(text):
    """Normalise a tweet and return its lemmatized content tokens.

    Steps, in order: lower-case, expand contractions, strip URLs, strip
    @mentions and #hashtags, replace every remaining non-letter character with
    a space, tokenize, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lemmatized tokens that are not in ``stop_words``.
    """
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)  # remove mentions and hashtags
    # Replace punctuation/digits/symbols with a space rather than deleting them:
    # deletion fuses neighbours that were separated only by punctuation
    # ("great,job" -> "greatjob"), producing tokens the embedding model cannot match.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = tokenizer.tokenize(text)
    # NOTE(review): the stop-word filter also removes negations such as "not";
    # confirm that this is intended for a sentiment task.
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]

# Load Word2Vec model
# When the notebook is run top to bottom, Problem 1 has already loaded the same
# pretrained vectors into `w2v_model`; reuse them instead of loading the large
# model a second time. On a fresh kernel (or as a standalone script) the name
# does not exist yet, so the model is loaded exactly as before.
if 'w2v_model' in globals():
    print("Reusing Word2Vec model already loaded in this session.")
else:
    print("Loading Word2Vec model (this may take a while)...")
    w2v_model = api.load("word2vec-google-news-300")
    print("Word2Vec model loaded.")
# Embedding dimensionality, used for the zero-vector fallback in vectorize_sentence().
vector_size = w2v_model.vector_size

def vectorize_sentence(tokens, model, vector_size=300):
    """Return the mean embedding of the in-vocabulary tokens.

    Parameters
    ----------
    tokens : list of str
        Tokens of one tweet.
    model : mapping-like
        Supports ``token in model`` and ``model[token]`` (returns a vector).
    vector_size : int, default 300
        Length of the zero vector used when no token is found in the model.

    Returns
    -------
    numpy.ndarray
        Mean of the matched vectors along axis 0, or ``np.zeros(vector_size)``
        if nothing matched.
    """
    known = [tok for tok in tokens if tok in model]
    if known:
        return np.mean([model[tok] for tok in known], axis=0)
    # No token had an embedding.
    return np.zeros(vector_size)

# Read the tweets file (expected in the same directory) and report the raw row count
print("Loading dataset...")
df = pd.read_csv("Tweets.csv")
print(f"Loaded {len(df)} tweets.")

# Keep just the label and text columns, discarding rows where either is missing
df = df.loc[:, ['airline_sentiment', 'text']].dropna(axis=0, how='any')

# Feature matrix X: one averaged embedding per tweet, stacked row-wise.
# Target y: negative -> 0, neutral -> 1, positive -> 2.
print("Preprocessing & vectorizing tweets...")
X = np.vstack([
    vectorize_sentence(preprocess_tweet(tweet), w2v_model, vector_size)
    for tweet in df['text']
])
y = (
    df['airline_sentiment']
    .map({"negative": 0, "neutral": 1, "positive": 2})
    .to_numpy()
)

# 80/20 split, stratified on the label so both parts share the class balance;
# the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit a logistic-regression classifier on the training embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Score the held-out tweets
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")

def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of a single tweet.

    Parameters
    ----------
    model : fitted estimator
        Object with a ``predict`` method, trained on averaged embeddings with
        the encoding negative=0 / neutral=1 / positive=2.
    w2v_model : embedding model
        Provides ``vector_size`` and token lookup for ``vectorize_sentence``.
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        One of ``"negative"``, ``"neutral"`` or ``"positive"``.
    """
    tokens = preprocess_tweet(tweet)
    # Use the passed-in model's own dimensionality rather than the notebook-level
    # `vector_size`, so the zero-vector fallback cannot disagree with the model.
    vec = vectorize_sentence(tokens, w2v_model, w2v_model.vector_size)
    # predict() expects a 2-D array: one row, one column per embedding dimension.
    pred = model.predict(vec.reshape(1, -1))[0]
    return {0: "negative", 1: "neutral", 2: "positive"}[pred]

# Demo: run the trained classifier on one hand-written tweet
if __name__ == '__main__':
    example_tweet = "@AmericanAir I love how you handle customer service, great job!"
    sentiment = predict_tweet_sentiment(clf, w2v_model, example_tweet)
    print(example_tweet, "->", sentiment)

Loading Word2Vec model (this may take a while)...


[nltk_data] Downloading package stopwords to C:\Users\TUF
[nltk_data]     GAMING\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\TUF
[nltk_data]     GAMING\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\TUF
[nltk_data]     GAMING\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Word2Vec model loaded.
Loading dataset...
Loaded 14640 tweets.
Preprocessing & vectorizing tweets...
