# Problem 1

In [None]:
import pandas as pd
import numpy as np
import re
import gensim.downloader as api
import nltk
import contractions

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


from tqdm import tqdm


In [4]:
print("Loading Word2Vec model...")
w2v_model = api.load("word2vec-google-news-300")

Loading Word2Vec model...


In [5]:
def vectorize_text(text, w2v_model):
    tokens = [word for word in word_tokenize(text.lower()) if word.isalpha()]
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

In [26]:
df = pd.read_csv("spam_dataset.csv", encoding='latin-1')
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
df.rename(columns={'v1':'label','v2':'text'},inplace=True)
df.sample(5)

Unnamed: 0,label,text
5057,0,"Geeeee ... Your internet is really bad today, ..."
5438,0,"\What are youdoing later? Sar xxx\"""""
5059,0,I think i am disturbing her da
4614,1,Sunshine Quiz! Win a super Sony DVD recorder i...
798,0,Ok i msg u b4 i leave my house.


In [39]:
ps = PorterStemmer()

def transform_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    stemmed_tokens = [ps.stem(token) for token in tokens]
    
    return " ".join(stemmed_tokens)

nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [40]:
transform_text("Hi, I am Zeeshan Ahmad. I love working on Machine Learning projects and studying ML.")

'hi zeeshan ahmad love work machin learn project studi ml'

In [42]:

df['transformed_text']=df['text'].apply(transform_text)
df.sample(5)

Unnamed: 0,label,text,transformed_text
5155,0,MY NEW YEARS EVE WAS OK. I WENT TO A PARTY WIT...,new year eve went parti boyfriend si hey
970,0,"Yar lor he wan 2 go c horse racing today mah, ...",yar lor wan 2 go c hors race today mah eat ear...
326,0,Hi da:)how is the todays class?,hi da today class
3425,0,Sure but since my parents will be working on T...,sure sinc parent work tuesday realli need cove...
3133,0,Ok going to sleep. Hope i can meet her.,ok go sleep hope meet


In [44]:
X = np.vstack(df['transformed_text'].apply(lambda x: vectorize_text(x, w2v_model)))
y = df['label']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("Accuracy:", model.score(X_test, y_test))

Accuracy: 0.9381165919282511


In [51]:
def predict_message_class(model, w2v_model, message):
    vec = vectorize_text(message, w2v_model).reshape(1, -1)
    predicted = model.predict(vec)[0]
    if predicted==0:
        return "ham"
    return "spam"

In [52]:
predict_message_class(model, w2v_model, message="Nah I don't think he goes to usf, he lives around here though")

'ham'

# Problem 2

In [53]:
df2 = pd.read_csv("Tweets_dataset.csv")
df2 = df2[['airline_sentiment', 'text']]

In [60]:
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", tweet)
    tweet = re.sub(r"[^\w\s]", "", tweet)
    tweet = contractions.fix(tweet)
    tokens = word_tokenize(tweet)
    return ' '.join([lemmatizer.lemmatize(w) for w in tokens if w not in stopwords.words('english') and w.isalpha()])


In [61]:
df2['Processed'] = df2['text'].apply(preprocess_tweet)

In [62]:
X2 = np.vstack(df2['Processed'].apply(lambda x: vectorize_text(x, w2v_model)))
y2 = df2['airline_sentiment']

In [63]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [64]:
model2 = LogisticRegression(max_iter=1000)
model2.fit(X2_train, y2_train)
print("Accuracy:", model2.score(X2_test, y2_test))

Accuracy: 0.7711748633879781


In [65]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    vec = vectorize_text(tweet, w2v_model).reshape(1, -1)
    return model.predict(vec)[0]
