## Imports

In [1]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import Word2Vec
import ast

# Pre-trained GloVe vectors

Remember to download the pre-trained GloVe vectors (Twitter database) from the following link and use the file with 100 dimensions (100d) and then create and store it in the "Data" folder : https://nlp.stanford.edu/projects/glove/

In [7]:
glove_file = datapath('Data/glove.twitter.27B.100d.txt')
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)

  _ = glove2word2vec(glove_file, tmp_file)


## Word-2-vec model (basic implementation)

In [8]:
# Loading the entire dataset for creating the word embeddings
df = pd.read_csv("Data/tokenized_eng.csv")

In [9]:
# Extracting only the token column as a pandas series
documents = df['eng_tokens']

# Convert the string representations of lists to actual lists using ast.literal_eval
sentences = [ast.literal_eval(sentence) for sentence in documents]

In [10]:
# build a word2vec model on your dataset
base_model = Word2Vec(vector_size=100, min_count=10, sg=1) # skip gram model
base_model.build_vocab(sentences)
total_examples = base_model.corpus_count

### Adding GloVe weights and retraining

In [11]:
# Update the vocabulary of the base_model with the keys from glove_vectors
base_model.build_vocab([list(glove_vectors.index_to_key)], update=True)

# train on your data
base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs)
base_model_wv = base_model.wv

## Creating a Feature matrix (with document embeddings)

In [12]:
# Loading the balanced dataset
df_balanced = pd.read_csv("Data/Undersampled_balanced_data.csv")

In [13]:
# Initialize empty lists to store data
author_ids = []
avg_word_length = []
avg_sent_length = []
ttr = []
nr_unique_words = []
nr_chars = []
nr_contradictions = []
subjectivity = []
feature_vectors = []
labels = []
nr_period = []
nr_comma = []
nr_question = []
nr_exclamation = []

# Iterate through each row in the DataFrame
for index, row in df_balanced.iterrows():
    sentence = ast.literal_eval(row['eng_tokens'])
    # Get the sentence vector by taking average of all word embeddings in sentence
    word_embeddings = [base_model_wv[token] for token in sentence if token in base_model_wv]

    # Ensure all word embeddings have the same length
    # Sometimes they have varying lengths because not all words are embedded in Word2Vec
    # LIMITATION of Word2Vec!!!
    if word_embeddings:
        sentence_vector = np.mean(word_embeddings, axis=0)
        feature_vectors.append(sentence_vector)
    else:
        # Handle the case where no embeddings are available
        feature_vectors.append(np.zeros(base_model_wv.vector_size))

    # Append author_id, feature_vector, and label, etc. to respective lists
    author_ids.append(row['auhtor_ID'])
    avg_word_length.append(row['average_word_length'])
    avg_sent_length.append(row['avg_sentence_length'])
    ttr.append(row['ttr'])
    nr_unique_words.append(row['nr_unique_words'])
    nr_chars.append(row['nr_chars'])
    nr_contradictions.append(row['nr_contradictions'])
    subjectivity.append(row['subjectivity'])
    nr_period.append(row['norm_.'])
    nr_comma.append(row['norm_,'])
    nr_question.append(row['norm_?'])
    nr_exclamation.append(row['norm_!'])

    labels.append(row['Poles'])

# Create a new DataFrame with author IDs, feature vectors, and labels, etc.
columns = [f'feature_{i}' for i in range(len(feature_vectors[0]))]
additional_columns = ['average_word_length', 'avg_sentence_length', 'ttr', 'nr_unique_words', 'nr_chars', 'nr_contradictions', 'subjectivity','nr_period', 'nr_comma', 'nr_question', 'nr_exclamation']
data = np.concatenate([np.array(author_ids).reshape(-1, 1), np.array(avg_word_length).reshape(-1, 1),
                       np.array(avg_sent_length).reshape(-1, 1), np.array(ttr).reshape(-1, 1),
                       np.array(nr_unique_words).reshape(-1, 1), np.array(nr_chars).reshape(-1, 1),
                       np.array(nr_contradictions).reshape(-1, 1), np.array(subjectivity).reshape(-1, 1),
                       np.array(nr_period).reshape(-1, 1), np.array(nr_comma).reshape(-1, 1),
                       np.array(nr_question).reshape(-1, 1), np.array(nr_exclamation).reshape(-1, 1),
                       np.array(feature_vectors).tolist(), np.array(labels).reshape(-1, 1)], axis=1)

new_df = pd.DataFrame(data, columns=['author_ID'] + additional_columns + columns + ['Poles'])

# Convert feature columns to numeric
new_df[columns] = new_df[columns].apply(pd.to_numeric)
new_df

Unnamed: 0,author_ID,average_word_length,avg_sentence_length,ttr,nr_unique_words,nr_chars,nr_contradictions,subjectivity,nr_period,nr_comma,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,Poles
0,t2_ffcfiueh,5.481442205726405,66.16483516483517,0.7073170731707317,667,6111,6,0.4854949874686716,0.0911983032873807,0.0477200424178154,...,-0.007784,0.031585,0.049738,0.100031,-0.031415,-0.071001,0.038708,0.066147,-0.077816,Western
1,t2_lfs48,6.285714285714286,85.01492537313433,0.718078381795196,568,5762,17,0.487030497888162,0.11378002528445,0.0783817951959544,...,0.032240,0.023061,0.054812,0.041297,-0.027160,-0.021973,0.006359,0.030645,-0.081342,Western
2,t2_zcj4y,5.580459770114943,34.13496932515337,0.7471264367816092,650,5724,18,0.5510049760644998,0.1747126436781609,0.0655172413793103,...,0.011856,-0.051878,0.068134,0.081694,0.018575,-0.040013,0.047502,0.069519,-0.064935,Western
3,t2_2xpu7n1c,5.4812967581047385,75.44117647058823,0.6408977556109726,514,5197,0,0.5143662238145,0.0735660847880299,0.112219451371571,...,0.018508,-0.034094,0.069904,0.126776,0.027026,-0.037627,0.040436,0.025213,-0.070703,Western
4,t2_3edl7,5.9168646080760094,60.96808510638298,0.7197149643705463,606,5823,11,0.4018541930046354,0.1223277909738717,0.0855106888361045,...,-0.009086,0.012244,0.106775,0.108287,-0.079001,-0.050491,0.046639,0.056835,-0.030492,Western
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30937,t2_8hacr7if,5.69746835443038,49.875,0.6835443037974683,540,5290,13,0.4372444684944683,0.1113924050632911,0.0379746835443038,...,0.000702,-0.014954,0.096263,0.116204,-0.052225,-0.058670,0.011701,0.083648,0.000851,Eastern
30938,t2_8hacr7if,5.7907253269916765,50.45045045045045,0.7443519619500595,626,5710,9,0.4364888583638583,0.1093935790725326,0.0487514863258026,...,-0.001142,0.008996,0.093147,0.125046,-0.044154,-0.022087,0.003849,0.087167,-0.020898,Eastern
30939,t2_8hacr7if,5.671834625322997,47.71698113207548,0.7002583979328165,542,5163,9,0.4665453055398707,0.1317829457364341,0.0478036175710594,...,-0.006764,-0.022218,0.129190,0.094650,-0.095174,-0.005999,-0.066520,0.104894,0.030986,Eastern
30940,t2_8hacr7if,5.74845869297164,47.86607142857143,0.7040690505548706,571,5472,7,0.4284130624726955,0.1146732429099876,0.0530209617755857,...,0.003105,-0.023637,0.137076,0.091326,-0.080962,-0.032644,-0.063697,0.111811,0.031217,Eastern


In [15]:
## Removing all the rows with 0 values (OOV words not captured!!)
new_df = new_df[new_df['feature_0'] != 0.0]

In [16]:
## Converting df into csv file
new_df.to_csv('Data/Word2Vec_feature_data.csv', index=False)