In [37]:
#Importing all the necessary packages

import pandas as pd # For data reading
import chardet # For detecting the correct encoding of the data
import re # For preprocessing
from nltk.corpus import stopwords # For removing stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from gensim.models import Word2Vec # For Skip-gram and CBOW
import torch
import torch.nn as nn # for perceptron classifier
import torch.optim as optim # to optimise the loss function


# Downloading
import nltk
nltk.download('punkt') #tokenizer
nltk.download('stopwords') # stopwords
nltk.download('wordnet')
nltk.download('punkt_tab') # required by sent_tokenize which in turn is used by word_tokenize to split text into sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [38]:
# Loading the data (Note that the file was not encoded in uft8 hence why we need to find the encoding and set it to that when using pd.read_csv)

# First we detect encoding using chardet
with open("Semeval.txt", "rb") as f:
    result = chardet.detect(f.read())
Encoding = result['encoding'] # store the coding into the variable Encoding
print(f"Detected encoding: {Encoding}") # It was encoded in Windows-1252

# Next we read the file with the correct encoding
data = pd.read_csv("Semeval.txt", delimiter="\t", encoding=Encoding)

# Let's display the top rows.
print(data.head())



Detected encoding: Windows-1252
    ID   Target                                              Tweet   Stance
0  101  Atheism  dear lord thank u for all of ur blessings forg...  AGAINST
1  102  Atheism  Blessed are the peacemakers, for they shall be...  AGAINST
2  103  Atheism  I am not conformed to this world. I am transfo...  AGAINST
3  104  Atheism  Salah should be prayed with #focus and #unders...  AGAINST
4  105  Atheism  And stay in your houses and do not display you...  AGAINST


In [39]:
"""
Preprocessing is a must because we are dealing with tweets which are informal in many ways. So we will
1) Remove unnecessary characters like mentions and punctuations (there are no links because they discarded any tweets with links)
2) Remove stopwords by using nltk
"""

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess tweets
def preprocess_tweet(tweet):
    # Lowercase
    tweet = tweet.lower()

    # Remove mentions
    tweet = re.sub(r"@\w+", "", tweet)

    # Remove hashtags
    tweet = re.sub(r"#\w+", "", tweet)

    # Remove punctuation
    tweet = re.sub(r"[^\w\s]", "", tweet)

    # Remove numbers
    tweet = re.sub(r"\d+", "", tweet)

    # Tokenize
    words = word_tokenize(tweet)

    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join tokens back into a string
    tweet = " ".join(tokens)

    return tweet

# We apply preprocessing to the 'Tweet' column
data['Tweet'] = data['Tweet'].apply(preprocess_tweet)

# Display the first few rows
print(data.head())

#Identifying the columns

Id = data['ID']
Target = data['Target']
Tweet = data['Tweet']
Stance = data['Stance']


    ID   Target                                              Tweet   Stance
0  101  Atheism  dear lord thank u ur blessing forgive sin lord...  AGAINST
1  102  Atheism  blessed peacemaker shall called child god matthew  AGAINST
2  103  Atheism          conformed world transformed renewing mind  AGAINST
3  104  Atheism       salah prayed warns prayer done surah almaoon  AGAINST
4  105  Atheism       stay house display like time ignorance quran  AGAINST


In [40]:
# Calculating how many in each class
from collections import Counter
class_distribution = Counter(Stance)
print(class_distribution)



Counter({'AGAINST': 1342, 'NONE': 741, 'FAVOR': 731})


In [41]:
# Loading the test data
"""
We will load the test data and we preprocess it.
"""

# Detecing the code for the test data
with open("Testdata.txt", "rb") as f:
    result = chardet.detect(f.read())
Encoding = result['encoding'] # store the coding into the variable Encoding
print(f"Detected encoding: {Encoding}") # It was encoded in Windows-1252

# Reading the test data with the right encoding
Test_data = pd.read_csv("Testdata.txt", delimiter="\t", encoding=Encoding)

# Let's display the top rows.
print(Test_data.head())


# Preprocessing
Test_data['Tweet'] = Test_data['Tweet'].apply(preprocess_tweet)


#Identifying the columns for the test data

Id_test = Test_data['ID']
Target_test = Test_data['Target']
Tweet_test = Test_data['Tweet']
Stance_test = Test_data['Stance']



Detected encoding: ascii
      ID   Target                                              Tweet   Stance
0  10001  Atheism  He who exalts himself shall      be humbled; a...  AGAINST
1  10002  Atheism  RT @prayerbullets: I remove Nehushtan -previou...  AGAINST
2  10003  Atheism  @Brainman365 @heidtjj @BenjaminLives I have so...  AGAINST
3  10004  Atheism  #God is utterly powerless without Human interv...  AGAINST
4  10005  Atheism  @David_Cameron   Miracles of #Multiculturalism...  AGAINST


In [53]:
#Word embedding GLOVE
"""
Since we be using Glove, we download it. We will be using the wikipidia one with 6B tokens with 300d. We then define a function to get the sentence embedding.
"""

glove_embeddings = {} # An empty dictionary where we will store word-vector pairs

with open("glove.6B.300d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector




# Function to get sentence embeddings
def get_sentence_embedding_Glove(sentence, embeddings):
    words = sentence.lower().split()
    vectors = [embeddings[word] for word in words if word in embeddings]
    if len(vectors) == 0:
        return np.zeros(300)  # Return zero vector if no words are found
    return np.mean(vectors, axis=0)

"""
# Sometimes we run into the problem of the array having different dimension hence we can't take the mean. This function is to tackle that problem. It deos the same as getting sentence embedding


def get_sentence_embedding_Glove(sentence, embeddings):
    words = sentence.lower().split()
    vectors = []
    for word in words:
        if word in embeddings:
            # If the word has multiple embeddings, take the first one
            if isinstance(embeddings[word], list):
                vectors.append(embeddings[word][0])  # Take the first embedding
            else:
                vectors.append(embeddings[word])
    if len(vectors) == 0:
        return np.zeros(300)  # Return zero vector if no words are found
    # Stack vectors into a 2D array before computing the mean
    return np.mean(np.vstack(vectors), axis=0)


"""



# Example
sentence = Tweet [1]
sentence_embedding = get_sentence_embedding_Glove(sentence, glove_embeddings)
print(sentence, "Sentence embedding:", sentence_embedding)


blessed peacemaker shall called child god matthew Sentence embedding: [-1.62994578e-01 -1.96897149e-01 -1.77997440e-01 -1.90302849e-01
 -9.27521512e-02  1.50632858e-01 -4.69469950e-02  1.70879111e-01
 -7.55877048e-02 -7.65702546e-01  3.39621842e-01 -5.76807261e-02
 -1.16539426e-01 -1.41729444e-01 -6.20109215e-02  1.74758703e-01
 -1.21041432e-01 -3.46834272e-01 -4.63918559e-02  6.23211376e-02
 -3.45137656e-01  3.62692028e-01  1.10021448e-02 -3.75642092e-03
 -1.57148577e-02  2.11083010e-01 -7.10457042e-02 -2.38870427e-01
  1.19302854e-01  9.62345675e-02  1.53267458e-01  1.54039994e-01
 -2.72348616e-02  4.15671468e-02 -5.39141059e-01  1.07810304e-01
  8.60812888e-02  7.05508664e-02 -9.25327092e-02  7.50798583e-02
  1.30589992e-01 -7.51765594e-02 -1.60334721e-01 -1.03309579e-01
 -9.16528236e-03 -7.87018538e-02  1.07409000e-01  3.20013463e-01
  1.70773551e-01 -2.60613978e-01  1.49713978e-01 -4.33692820e-02
 -7.97282904e-02  2.94151723e-01 -4.22611445e-01  2.48506796e-02
 -4.15757038e-02  3.

In [54]:
#Word2Vec
"""
Another method we can use to turn words into a vector is using Word2Vec, which we will do here with the same dimension 300.
"""

# First we tokenize the tweets
tokenized_tweets = [word_tokenize(tweet.lower()) for tweet in Tweet]

# We will use skip-gram
word2vec_model_SkipGram = Word2Vec(
    sentences=tokenized_tweets,  # List of tokenized sentences
    vector_size=300,             # Dimensionality of word embeddings (same as Glove)
    window=5,                    # Context window size
    min_count=1,
    sg=1                         # 1 since we are using skip-gram (0 is for CBOW)
)

# Function to get the embedding for a sentence
def get_sentence_embedding_word2vec(sentence, model):
    words = word_tokenize(sentence.lower())
    vectors = [model.wv[word] for word in words if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)  # Return zero vector if no words are found
    return np.mean(vectors, axis=0)


# Example
sentence = Tweet [1]
sentence_embedding = get_sentence_embedding_word2vec(sentence, word2vec_model_SkipGram)
print(sentence, "Sentence embedding:", sentence_embedding)



blessed peacemaker shall called child god matthew Sentence embedding: [-8.03718064e-03  4.33953814e-02  2.78786686e-03  3.51471454e-02
  1.05961217e-02 -6.25265464e-02  4.02857065e-02  1.40310034e-01
  2.87497067e-03 -2.64463462e-02 -1.20979790e-02 -5.58552481e-02
  1.05174666e-03 -8.85106530e-03 -3.88267227e-02 -2.16841456e-02
  1.23427557e-02  4.80516395e-03  2.81267036e-02 -9.72439069e-03
 -2.31212992e-02 -3.03364899e-02  3.82888354e-02 -4.62116936e-04
  5.97012416e-02 -7.07284454e-03 -6.38193712e-02  5.26329596e-03
 -4.02161144e-02 -4.02505510e-02  1.14009576e-02 -3.19272727e-02
 -1.37367006e-02  4.61821957e-03  1.41318340e-03  1.89662687e-02
  2.48746779e-02 -6.43830672e-02 -1.38941011e-03 -1.33168306e-02
 -1.88871976e-02  7.77876843e-03 -9.81551688e-03 -3.71559300e-02
  1.16333272e-02  5.13161123e-02  1.64397471e-02  4.00561392e-02
 -2.12975051e-02  4.49087359e-02  1.50973191e-02  9.32910573e-03
 -3.97390090e-02  3.07353791e-02 -2.32112575e-02  7.74729922e-02
  6.77889306e-03  2.

In [55]:
#CBOW

word2vec_model_CBOW = Word2Vec(
    sentences=tokenized_tweets,  # List of tokenized sentences
    vector_size=300,             # Dimensionality of word embeddings (same as Glove)
    window=5,                    # Context window size
    min_count=1,
    sg=0                         # 0 for CBOW
)

# Example
sentence = Tweet [1]
sentence_embedding = get_sentence_embedding_word2vec(sentence, word2vec_model_CBOW)
print(sentence, "Sentence embedding:", sentence_embedding)


blessed peacemaker shall called child god matthew Sentence embedding: [ 1.20047480e-03  1.60158321e-03 -6.86998595e-04  1.14370533e-03
  9.99160460e-04 -2.34421715e-03  2.46528583e-03  5.66589180e-03
  1.07552922e-04 -9.59662138e-04 -5.54121914e-04 -2.21568113e-03
  4.90455364e-04 -2.85762973e-04 -2.49680015e-03 -8.49443197e-04
  8.27419921e-04  1.03148748e-03  1.91152038e-03  6.44701824e-04
 -1.55117153e-03 -1.18158590e-04  2.80664535e-03  6.48041780e-04
  1.80233561e-03  1.16908726e-04 -3.62628000e-03  9.01288411e-04
 -1.53766270e-03 -2.24982179e-03  1.44706003e-03 -1.01849402e-03
  2.11424922e-04 -7.48511811e-04  3.23263666e-04  1.28261547e-03
  8.32915597e-04 -2.32302514e-03  7.04523060e-04  1.34790404e-04
 -1.66442525e-03 -8.54351907e-04  3.29805393e-04 -1.18393847e-03
  2.23014009e-04  1.47570588e-03  8.22412781e-04  2.51717330e-03
 -4.18630661e-04  1.45952415e-03 -6.31384974e-05 -4.38501796e-04
 -7.59532035e-04  6.17473037e-04 -2.47908116e-04  3.31335468e-03
  1.19512354e-03  1.

In [56]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(Tweet)
print(X_train_tfidf)

  (0, 4587)	0.14814727000362926
  (0, 11900)	0.29218047222408566
  (0, 19813)	0.13996326252995958
  (0, 21040)	0.154150985775193
  (0, 2103)	0.18377393191228916
  (0, 7451)	0.1941940625335076
  (0, 18313)	0.16819672410550798
  (0, 8085)	0.13633765482086307
  (0, 19167)	0.18845945083578208
  (0, 5928)	0.17981238210928824
  (0, 2517)	0.17638073157917766
  (0, 4488)	0.11955933102559835
  (0, 482)	0.1941940625335076
  (0, 4593)	0.21200739348783754
  (0, 11923)	0.21200739348783754
  (0, 19839)	0.21200739348783754
  (0, 21042)	0.21200739348783754
  (0, 2104)	0.21200739348783754
  (0, 7452)	0.20158726286661913
  (0, 18319)	0.21200739348783754
  (0, 11908)	0.21200739348783754
  (0, 8116)	0.21200739348783754
  (0, 19168)	0.21200739348783754
  (0, 5929)	0.21200739348783754
  (0, 2519)	0.21200739348783754
  :	:
  (2812, 6047)	0.23871107212935166
  (2812, 16900)	0.3124000531979786
  (2812, 2256)	0.3124000531979786
  (2812, 21671)	0.3124000531979786
  (2812, 16981)	0.3124000531979786
  (2812, 22739

In [85]:
# Perceptron classifier

"""
Next we will define a perceptron model and train it using the training data. But first we need to convert the data to PyTorch tensors.
So we need to define the X and Y, with X being the tweet and Y being the stance. And since we have 3 types of word embeddings, we will do them sepereately.
"""

X_train_Glove = np.array([get_sentence_embedding_Glove(tweet, glove_embeddings) for tweet in Tweet])
X_train_word2vec_SkipGram = np.array([get_sentence_embedding_word2vec(tweet, word2vec_model_SkipGram) for tweet in Tweet])
X_train_word2vec_CBOW = np.array([get_sentence_embedding_word2vec(tweet, word2vec_model_CBOW) for tweet in Tweet])
X_train_tfidf = vectorizer.transform(Tweet)
X_train_tfidf_dense = X_train_tfidf.toarray() #TfidfVectorizer returns a sparse matrix and pytorch tensor requires an array



# Converting data to PyTorch tensors
X_train_tensor_Glove = torch.tensor(X_train_Glove, dtype=torch.float32)
X_train_tensor_word2vec_SkipGram = torch.tensor(X_train_word2vec_SkipGram, dtype=torch.float32)
X_train_tensor_word2vec_CBOW = torch.tensor(X_train_word2vec_CBOW, dtype=torch.float32)
X_train_tensor_tfidf = torch.tensor(X_train_tfidf_dense, dtype=torch.float32)

y_train_tensor = torch.tensor(Stance.factorize()[0], dtype=torch.long)



# Define the Perceptron model
class Perceptron(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Perceptron, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

# Initialize the model
input_dim = 300  # Word2Vec and Glove embedding size
input_dim_tfidf = X_train_tensor_tfidf.shape[1]  # Number of features (TF-IDF dimensions)
output_dim = 3   # Number of stance classes (FAVOR, AGAINST, NONE)
model_Glove = Perceptron(input_dim, output_dim)
model_word2vec_SkipGram = Perceptron(input_dim, output_dim)
model_word2vec_CBOW = Perceptron(input_dim, output_dim)
model_tfidf = Perceptron(input_dim_tfidf, output_dim)


# Adjusting weights
weights = torch.tensor([1/0.48, 1/0.26, 1/0.26]) # AGAINST : 0.48, NONE : 0.26, FAVOR : 0.26



# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(weights) # Used for multi-class classification

#Glove optimiser SGD
optimizer_Glove = optim.SGD(model_Glove.parameters(), lr=0.01)

#Skipgram optimiser SGD
optimizer_SkipGram = optim.SGD(model_word2vec_SkipGram.parameters(), lr=0.01)

#CBOW optimiser SGD
optimizer_CBOW = optim.SGD(model_word2vec_CBOW.parameters(), lr=0.01)

# TF-IDF optimiser SGD
optimizer_tfidf = optim.SGD(model_tfidf.parameters(), lr=0.01)



"""

#Glove optimiser adam
optimizer_Glove = optim.Adam(model_Glove.parameters(), lr=0.001)

#Skipgram optimiser adam
optimizer_SkipGram = optim.Adam(model_word2vec_SkipGram.parameters(), lr=0.001)

#CBOW opitmiser adam
optimizer_CBOW = optim.Adam(model_word2vec_CBOW.parameters(), lr=0.001)

#TF-IDF optimiser adam
optimizer_tfidif = optim.Adam(model_tfidf.parameters(), lr=0.001)

"""


# Train the model for Glove
for epoch in range(10):  # Number of epochs
    optimizer_Glove.zero_grad() # optimise for gradient 0
    outputs = model_Glove(X_train_tensor_Glove) # predict the output based on our model
    loss = criterion(outputs, y_train_tensor) # calculate the loss
    loss.backward() # Compute the gradient
    optimizer_Glove.step() # update the weights
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Train the model for SkipGram
for epoch in range(10):  # Number of epochs
    optimizer_SkipGram.zero_grad() # optimise for gradient 0
    outputs = model_word2vec_SkipGram(X_train_tensor_word2vec_SkipGram) # predict the output based on our model
    loss = criterion(outputs, y_train_tensor) # calculate the loss
    loss.backward() # Compute the gradient
    optimizer_SkipGram.step() # update the weights
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Train the model for CBOW
for epoch in range(10):  # Number of epochs
    optimizer_CBOW.zero_grad() # optimise for gradient 0
    outputs = model_word2vec_CBOW(X_train_tensor_word2vec_CBOW) # predict the output based on our model
    loss = criterion(outputs, y_train_tensor) # calculate the loss
    loss.backward() # Compute the gradient
    optimizer_CBOW.step() # update the weights
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Train the model for TF-IDF
for epoch in range(10):  # Number of epochs
    optimizer_tfidf.zero_grad() # optimise for gradient 0
    outputs = model_tfidf(X_train_tensor_tfidf) # predict the output based on our model
    loss = criterion(outputs, y_train_tensor) # calculate the loss
    loss.backward() # Compute the gradient
    optimizer_tfidf.step() # update the weights
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")





Epoch 1, Loss: 1.105629324913025
Epoch 2, Loss: 1.1053701639175415
Epoch 3, Loss: 1.1051167249679565
Epoch 4, Loss: 1.1048682928085327
Epoch 5, Loss: 1.1046251058578491
Epoch 6, Loss: 1.104386806488037
Epoch 7, Loss: 1.104153037071228
Epoch 8, Loss: 1.103924036026001
Epoch 9, Loss: 1.1036990880966187
Epoch 10, Loss: 1.1034783124923706
Epoch 1, Loss: 1.0985606908798218
Epoch 2, Loss: 1.0985573530197144
Epoch 3, Loss: 1.0985541343688965
Epoch 4, Loss: 1.0985511541366577
Epoch 5, Loss: 1.0985478162765503
Epoch 6, Loss: 1.0985445976257324
Epoch 7, Loss: 1.098541259765625
Epoch 8, Loss: 1.0985380411148071
Epoch 9, Loss: 1.0985349416732788
Epoch 10, Loss: 1.098531723022461
Epoch 1, Loss: 1.0991383790969849
Epoch 2, Loss: 1.0991345643997192
Epoch 3, Loss: 1.0991308689117432
Epoch 4, Loss: 1.0991270542144775
Epoch 5, Loss: 1.0991233587265015
Epoch 6, Loss: 1.0991196632385254
Epoch 7, Loss: 1.0991159677505493
Epoch 8, Loss: 1.0991121530532837
Epoch 9, Loss: 1.0991084575653076
Epoch 10, Loss: 1.

In [86]:
# Evaluation of Perceptron

"""
First we convert the data into word embedding and then convert it into tensors.
We then make predictions and then convert them into labels, i.e. stances.
We will then use Micro-average F1 score.
"""

X_test_Glove = np.array([get_sentence_embedding_Glove(tweet, glove_embeddings) for tweet in Tweet_test])
X_test_word2vec_SkipGram = np.array([get_sentence_embedding_word2vec(tweet, word2vec_model_SkipGram) for tweet in Tweet_test])
X_test_word2vec_CBOW = np.array([get_sentence_embedding_word2vec(tweet, word2vec_model_CBOW) for tweet in Tweet_test])
X_test_tfidf = vectorizer.transform(Tweet_test)
X_test_tfidf_dense = X_test_tfidf.toarray() #TfidfVectorizer returns a sparse matrix and pytorch tensor requires an array


# Converting data to PyTorch tensors
X_test_tensor_Glove = torch.tensor(X_test_Glove, dtype=torch.float32)
X_test_tensor_word2vec_SkipGram = torch.tensor(X_test_word2vec_SkipGram, dtype=torch.float32)
X_test_tensor_word2vec_CBOW = torch.tensor(X_test_word2vec_CBOW, dtype=torch.float32)
X_test_tensor_tfidf = torch.tensor(X_test_tfidf_dense, dtype=torch.float32)


y_test_tensor = torch.tensor(Stance_test.factorize()[0], dtype=torch.long)

# Make predictions
#Glove
with torch.no_grad():
    outputs = model_Glove(X_test_tensor_Glove)
    _, predicted_Glove = torch.max(outputs, 1)

#SkipGram
with torch.no_grad():
    outputs = model_word2vec_SkipGram(X_test_tensor_word2vec_SkipGram)
    _, predicted_SkipGram = torch.max(outputs, 1)


#CBOW
with torch.no_grad():
    outputs = model_word2vec_CBOW(X_test_tensor_word2vec_CBOW)
    _, predicted_CBOW = torch.max(outputs, 1)

#TF-IDF
with torch.no_grad():
    outputs = model_tfidf(X_test_tensor_tfidf)
    _, predicted_tfidf = torch.max(outputs, 1)


# Convert predictions to labels
predicted_labels_Glove = [["FAVOR", "AGAINST", "NONE"][i] for i in predicted_Glove]
predicted_labels_SkipGram = [["FAVOR", "AGAINST", "NONE"][i] for i in predicted_SkipGram]
predicted_labels_CBOW = [["FAVOR", "AGAINST", "NONE"][i] for i in predicted_CBOW]
predicted_labels_tfidf = [["FAVOR", "AGAINST", "NONE"][i] for i in predicted_tfidf]


print(Counter(predicted_labels_Glove))
print(Counter(predicted_labels_SkipGram))
print(Counter(predicted_labels_CBOW))
print(Counter(predicted_labels_tfidf))

# Calculate F1 score For each
from sklearn.metrics import classification_report, f1_score

print(classification_report(Stance_test, predicted_labels_Glove))
#print("Macro-average F1 score:", f1_score(Stance_test, predicted_labels_Glove, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, predicted_labels_Glove, average='micro'))
print(classification_report(Stance_test, predicted_labels_SkipGram))
#print("Macro-average F1 score:", f1_score(Stance_test, predicted_labels_SkipGram, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, predicted_labels_SkipGram, average='micro'))
print(classification_report(Stance_test, predicted_labels_CBOW))
#print("Macro-average F1 score:", f1_score(Stance_test, predicted_labels_CBOW, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, predicted_labels_CBOW, average='micro'))
print(classification_report(Stance_test, predicted_labels_tfidf))
#print("Macro-average F1 score:", f1_score(Stance_test, predicted_labels_tfidf, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, predicted_labels_tfidf, average='micro'))

Counter({'FAVOR': 579, 'NONE': 553, 'AGAINST': 117})
Counter({'AGAINST': 1249})
Counter({'FAVOR': 1249})
Counter({'NONE': 507, 'AGAINST': 492, 'FAVOR': 250})
              precision    recall  f1-score   support

     AGAINST       0.56      0.09      0.16       715
       FAVOR       0.27      0.52      0.36       304
        NONE       0.19      0.45      0.26       230

    accuracy                           0.26      1249
   macro avg       0.34      0.35      0.26      1249
weighted avg       0.42      0.26      0.23      1249

Micro-average F1 score: 0.2618094475580464
              precision    recall  f1-score   support

     AGAINST       0.57      1.00      0.73       715
       FAVOR       0.00      0.00      0.00       304
        NONE       0.00      0.00      0.00       230

    accuracy                           0.57      1249
   macro avg       0.19      0.33      0.24      1249
weighted avg       0.33      0.57      0.42      1249

Micro-average F1 score: 0.57245796637

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [87]:
# 2nd ML method: Naive Bayes classification

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.naive_bayes import GaussianNB


# Train Naive Bayes classifier for tfidf
nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, Stance)

# Glove
nb_classifier_Glove = GaussianNB()
nb_classifier_Glove.fit(X_train_Glove, Stance)

# Skipgram
nb_classifier_skipgram = GaussianNB()
nb_classifier_skipgram.fit(X_train_word2vec_SkipGram, Stance)

# CBOW
nb_classifier_cbow = GaussianNB()
nb_classifier_cbow.fit(X_train_word2vec_CBOW, Stance)

# Make predictions
y_pred_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)
y_pred_Glove = nb_classifier_Glove.predict(X_test_Glove)
y_pred_skipgram = nb_classifier_skipgram.predict(X_test_word2vec_SkipGram)
y_pred_cbow = nb_classifier_cbow.predict(X_test_word2vec_CBOW)


# Evaluate performance
print(classification_report(Stance_test, y_pred_Glove))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_Glove, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_Glove, average='micro'))
print(classification_report(Stance_test, y_pred_skipgram))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_skipgram, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_skipgram, average='micro'))
print(classification_report(Stance_test, y_pred_cbow))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_cbow, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_cbow, average='micro'))
print(classification_report(Stance_test, y_pred_tfidf))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_tfidf, average='micro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_tfidf, average='micro'))

              precision    recall  f1-score   support

     AGAINST       0.74      0.55      0.63       715
       FAVOR       0.37      0.49      0.42       304
        NONE       0.31      0.43      0.36       230

    accuracy                           0.51      1249
   macro avg       0.47      0.49      0.47      1249
weighted avg       0.57      0.51      0.53      1249

Micro-average F1 score: 0.510808646917534
              precision    recall  f1-score   support

     AGAINST       0.57      0.23      0.33       715
       FAVOR       0.24      0.38      0.29       304
        NONE       0.22      0.47      0.30       230

    accuracy                           0.31      1249
   macro avg       0.35      0.36      0.31      1249
weighted avg       0.43      0.31      0.31      1249

Micro-average F1 score: 0.3098478783026421
              precision    recall  f1-score   support

     AGAINST       0.60      0.32      0.42       715
       FAVOR       0.24      0.33      0.28 

In [61]:
#svm

from sklearn.svm import SVC


# Train SVM classifier

#TF-IDF
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, Stance)

#Glove
svm_classifier_Glove = SVC(kernel='linear')
svm_classifier_Glove.fit(X_train_Glove, Stance)

#Skipgram
svm_classifier_skipgram = SVC(kernel='linear')
svm_classifier_skipgram.fit(X_train_word2vec_SkipGram, Stance)

#CBOW
svm_classifier_cbow = SVC(kernel='linear')
svm_classifier_cbow.fit(X_train_word2vec_CBOW, Stance)


# Make predictions
y_pred_tfidf_svm = svm_classifier.predict(X_test_tfidf)
y_pred_Glove_svm = svm_classifier_Glove.predict(X_test_Glove)
y_pred_skipgram_svm = svm_classifier_skipgram.predict(X_test_word2vec_SkipGram)
y_pred_cbow_svm = svm_classifier_cbow.predict(X_test_word2vec_CBOW)

# Evaluate performance
print(classification_report(Stance_test, y_pred_Glove_svm))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_Glove_svm, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_Glove_svm, average='micro'))
print(classification_report(Stance_test, y_pred_skipgram_svm))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_skipgram_svm, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_skipgram_svm, average='micro'))
print(classification_report(Stance_test, y_pred_cbow_svm))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_cbow_svm, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_cbow_svm, average='micro'))
print(classification_report(Stance_test, y_pred_tfidf_svm))
#print("Macro-average F1 score:", f1_score(Stance_test, y_pred_tfidf_svm, average='macro'))
print("Micro-average F1 score:", f1_score(Stance_test, y_pred_tfidf_svm, average='micro'))

              precision    recall  f1-score   support

     AGAINST       0.70      0.73      0.71       715
       FAVOR       0.49      0.40      0.44       304
        NONE       0.35      0.40      0.37       230

    accuracy                           0.59      1249
   macro avg       0.51      0.51      0.51      1249
weighted avg       0.59      0.59      0.58      1249

Micro-average F1 score: 0.5860688550840673
              precision    recall  f1-score   support

     AGAINST       0.57      1.00      0.73       715
       FAVOR       0.00      0.00      0.00       304
        NONE       0.00      0.00      0.00       230

    accuracy                           0.57      1249
   macro avg       0.19      0.33      0.24      1249
weighted avg       0.33      0.57      0.42      1249

Micro-average F1 score: 0.5724579663730984
              precision    recall  f1-score   support

     AGAINST       0.57      1.00      0.73       715
       FAVOR       0.00      0.00      0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
