In [None]:
#Name :- Devashish Mayur Potnis
#Roll No :- 43557
#Practical No :- 8

In [None]:
# Importing necessary libraries
import re    # for regular expressions
import nltk  # for text manipulation
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from nltk.stem.porter import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec
from tqdm import tqdm

In [None]:
# Notebook-wide configuration:
#   * show up to 200 characters per column so whole tweets are visible
#   * silence DeprecationWarning noise from the libraries in use
pd.options.display.max_colwidth = 200
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
%matplotlib inline



In [None]:
# Read the labelled training tweets and the unlabelled test tweets
# from CSV files located in the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test_tweets_anuFYb8.csv')



In [None]:
# Preview the first rows of each class: label 0 first, then label 1,
# separated by a dashed rule.
label_zero_rows = train[train['label'] == 0]
label_one_rows = train[train['label'] == 1]
print(label_zero_rows.head())
print('--------------------------------------')
print(label_one_rows.head())



   id  label  \
0   1      0   
1   2      0   
2   3      0   
3   4      0   
4   5      0   

                                                                                                                        tweet  
0                       @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run  
1  @user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked  
2                                                                                                         bihday your majesty  
3                                      #model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦    
4                                                                                      factsguide: society now    #motivation  
--------------------------------------
    id  label  \
13  14      1   
14  15      1   
17  18      1   
23  24      1   
34  35     

In [None]:
# Stack train on top of test (fresh 0..n-1 index) so that every
# preprocessing step below is applied to both sets in one pass.
combi = pd.concat(objs=[train, test], axis=0, ignore_index=True)




In [None]:
# Function to remove unwanted patterns from the tweets
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` deleted.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the fragments to strip.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` removed.

    Notes
    -----
    The substitution is done with ``pattern`` itself. Re-using each matched
    substring as a new regular expression (the previous approach) breaks as
    soon as a match contains a regex metacharacter such as ``(`` or ``+``.
    """
    return re.sub(pattern, '', input_txt)



In [None]:
# Removing Twitter handles (@user)
# The pattern is a raw string so that `\w` reaches the regex engine verbatim
# instead of being an invalid string escape.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")


In [None]:

# Removing special characters, numbers, punctuations
# Everything that is not a letter or '#' becomes a space. `regex=True` is
# required: without it the character class is matched as literal text.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)



In [None]:
# Drop short words: only whitespace-separated words of 4+ characters survive
combi['tidy_tweet'] = combi['tidy_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) >= 4)
)



In [None]:
# Split every cleaned tweet on whitespace into a list of tokens
tokenized_tweet = combi['tidy_tweet'].apply(str.split)



In [None]:
# Reduce every token to its Porter stem, tweet by tweet
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)



In [None]:
# Detokenizing the tokens
# Join each tweet's stemmed tokens back into one space-separated string.
# This is done in a single vectorised pass and leaves `tokenized_tweet`
# (the token lists) untouched, so re-running the cell gives the same result.
combi['tidy_tweet'] = tokenized_tweet.apply(' '.join)

# Bag-of-Words features: raw term counts over the cleaned tweets
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
bow = bow_vectorizer.fit_transform(combi['tidy_tweet'])



In [None]:
# TF-IDF features, built with the same vocabulary limits as the BoW features
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
tfidf = tfidf_vectorizer.fit_transform(combi['tidy_tweet'])

# Splitting data into training and validation sets
# `combi` was built as train rows followed by test rows, so the first
# len(train) rows of each feature matrix belong to the training set.
n_train = len(train)

train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['label'], random_state=42, test_size=0.3)

train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]
# Reuse the row labels chosen by the BoW split so that both feature sets
# share exactly the same train/validation partition.
xtrain_tfidf = train_tfidf[ytrain.index]
xvalid_tfidf = train_tfidf[yvalid.index]



In [None]:
# Function to train model and evaluate F1 score
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` and return its F1 score on the validation features.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train : array-like
        Training features passed to ``classifier.fit``.
    label : array-like
        Training labels passed to ``classifier.fit``.
    feature_vector_valid : array-like
        Validation features passed to ``classifier.predict``.
    is_neural_net : bool, optional
        Unused; kept so existing calls keep working.
    label_valid : array-like, optional
        True labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``yvalid`` is used, which matches the previous
        behaviour of this function.

    Returns
    -------
    float
        ``f1_score(label_valid, predictions)``.
    """
    if label_valid is None:
        # Backward-compatible fallback for callers that pass no labels.
        label_valid = yvalid

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    # f1_score expects (y_true, y_pred) in that order
    return f1_score(label_valid, predictions)

# Baseline: logistic regression (default settings) on Bag-of-Words features
lreg = LogisticRegression()
f1_bow = train_model(
    classifier=lreg,
    feature_vector_train=xtrain_bow,
    label=ytrain,
    feature_vector_valid=xvalid_bow,
)
print(f"F1 Score (Bag-of-Words): {f1_bow}")



F1 Score (Bag-of-Words): 0.48770894788593905


In [None]:
# Logistic regression again, this time refit on the TF-IDF features
f1_tfidf = train_model(
    classifier=lreg,
    feature_vector_train=xtrain_tfidf,
    label=ytrain,
    feature_vector_valid=xvalid_tfidf,
)
print(f"F1 Score (TF-IDF): {f1_tfidf}")

# Random forest (400 trees, fixed seed) on Bag-of-Words features
rf = RandomForestClassifier(random_state=11, n_estimators=400)
f1_rf_bow = train_model(
    classifier=rf,
    feature_vector_train=xtrain_bow,
    label=ytrain,
    feature_vector_valid=xvalid_bow,
)
print(f"F1 Score (Random Forest, Bag-of-Words): {f1_rf_bow}")

# Same random forest object, refit on the TF-IDF features
f1_rf_tfidf = train_model(
    classifier=rf,
    feature_vector_train=xtrain_tfidf,
    label=ytrain,
    feature_vector_valid=xvalid_tfidf,
)
print(f"F1 Score (Random Forest, TF-IDF): {f1_rf_tfidf}")



F1 Score (TF-IDF): 0.46282722513089003
F1 Score (Random Forest, Bag-of-Words): 0.5205905205905206
F1 Score (Random Forest, TF-IDF): 0.5402405180388529


In [None]:
# SVM (default settings) on Bag-of-Words features
svc = SVC()
f1_svc_bow = train_model(
    classifier=svc,
    feature_vector_train=xtrain_bow,
    label=ytrain,
    feature_vector_valid=xvalid_bow,
)
print(f"F1 Score (SVM, Bag-of-Words): {f1_svc_bow}")

# Same SVM object, refit on the TF-IDF features
f1_svc_tfidf = train_model(
    classifier=svc,
    feature_vector_train=xtrain_tfidf,
    label=ytrain,
    feature_vector_valid=xvalid_tfidf,
)
print(f"F1 Score (SVM, TF-IDF): {f1_svc_tfidf}")

# XGBoost (100 estimators, fixed seed) on Bag-of-Words features
xgb = XGBClassifier(random_state=11, n_estimators=100)
f1_xgb_bow = train_model(
    classifier=xgb,
    feature_vector_train=xtrain_bow,
    label=ytrain,
    feature_vector_valid=xvalid_bow,
)
print(f"F1 Score (XGBoost, Bag-of-Words): {f1_xgb_bow}")



F1 Score (SVM, Bag-of-Words): 0.46680497925311204
F1 Score (SVM, TF-IDF): 0.48925281473899696
F1 Score (XGBoost, Bag-of-Words): 0.46502057613168724


In [None]:
# Same XGBoost object, refit on the TF-IDF features
f1_xgb_tfidf = train_model(
    classifier=xgb,
    feature_vector_train=xtrain_tfidf,
    label=ytrain,
    feature_vector_valid=xvalid_tfidf,
)
print(f"F1 Score (XGBoost, TF-IDF): {f1_xgb_tfidf}")



F1 Score (XGBoost, TF-IDF): 0.4564994882292733


In [None]:
# Word2Vec embeddings: re-tokenise the cleaned tweets, construct the model
# on that corpus, then run an explicit 20-epoch training pass over it.
tokenized_tweet = combi['tidy_tweet'].apply(str.split)
model_w2v = Word2Vec(
    tokenized_tweet,
    vector_size=200,
    window=5,
    min_count=2,
    sg=1,
    hs=0,
    negative=10,
    workers=2,
    seed=34,
)
model_w2v.train(
    tokenized_tweet,
    total_examples=len(combi['tidy_tweet']),
    epochs=20,
)

# Function to compute average Word2Vec for each tweet
def word_vector(tokens, size):
    """Return the mean of the ``model_w2v`` vectors of the known ``tokens``.

    Tokens that are not in ``model_w2v.wv`` are skipped. If none of the
    tokens is known (or ``tokens`` is empty), a zero vector of length
    ``size`` is returned.
    """
    total = np.zeros(size)
    known = 0
    for token in tokens:
        if token not in model_w2v.wv:
            continue
        total += model_w2v.wv[token]
        known += 1
    if known:
        total /= known
    return total

