In [1]:
import numpy as np
import operator
import pandas as pd
import re
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel

tqdm.pandas()




In [2]:
# Read the train and test CSV files from the working directory.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Two separate statements: joining the prints with a comma builds a tuple,
# which makes the cell echo a spurious `(None, None)` as its result.
print(f'Train shape: {df.shape}')
print(f'Test shape: {df_test.shape}')

Train shape: (19579, 3)
Test shape: (8392, 2)


(None, None)

In [3]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=5)

# Only one train/validation split is used. The previous loop overwrote its
# result on every iteration, so the split that survived was the last of the
# five; select that one explicitly so the same rows are chosen.
splits = list(sss.split(df, df['author']))
_, test_indx = splits[-1]

# `test_indx` holds row positions, not index labels. Translate the positions
# to labels before dropping so the split stays correct even if `df` does not
# carry a default 0..n-1 index.
valid_df = df.iloc[test_indx]
train_df = df.drop(df.index[test_indx])

# Separate statements so the cell does not echo `(None, None)`.
print(len(valid_df))
print(len(train_df))

3916
15663


(None, None)

In [4]:
def build_vocab(sentences, verbose=True):
    """
    Count how often each word occurs across all sentences.

    sentences: iterable of sentences, each one a list of words
    verbose: show a tqdm progress bar when True
    return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            # First sighting starts from 0, later sightings add to the tally.
            counts[token] = counts.get(token, 0) + 1

    return counts

*Let's populate the dictionary and display the first 5 elements and their count*

In [5]:
# Split every row on whitespace, then count the resulting tokens.
sentences = df['text'].progress_apply(str.split).values

vocab = build_vocab(sentences)
# Show the first five (word, count) entries in insertion order.
print(dict(list(vocab.items())[:5]))

100%|██████████| 19579/19579 [00:00<00:00, 176288.08it/s]
100%|██████████| 19579/19579 [00:00<00:00, 142013.95it/s]

{'This': 391, 'process,': 5, 'however,': 290, 'afforded': 34, 'me': 2015}





In [6]:
# Load the vocabulary
# Read the vocabulary file: one entry per line.
vocab_path = 'bert_large_uncased_vocab.txt'  # Path to the vocab.txt file
with open(vocab_path, 'r', encoding='utf-8') as f:
    vocab_list = f.read().splitlines()

# Map every entry to its position in the list.
word_to_idx = dict(zip(vocab_list, range(len(vocab_list))))

In [7]:
def check_coverage(vocab, word_to_idx):
    """
    Report how much of a corpus vocabulary is present in a reference vocabulary.

    Prints two figures: the share of distinct words that are covered, and the
    share of all word occurrences that are covered.

    vocab: dict mapping each word to its count in the corpus
    word_to_idx: dict of known words (word -> index)
    return: list of (word, count) pairs for the words missing from
            `word_to_idx`, highest count first
    """
    in_vocab = {}
    oov = {}
    covered_count = 0
    oov_count = 0

    for word, count in tqdm(vocab.items()):
        # Explicit membership test instead of a bare `except:`, which would
        # also have hidden unrelated errors.
        if word in word_to_idx:
            in_vocab[word] = word_to_idx[word]
            covered_count += count
        else:
            oov[word] = count
            oov_count += count

    # Guard both ratios so an empty vocabulary reports 0% instead of raising.
    total_count = covered_count + oov_count
    vocab_share = len(in_vocab) / len(vocab) if vocab else 0.0
    text_share = covered_count / total_count if total_count else 0.0

    # `:.2%` already appends the percent sign; the extra literal `%` used to
    # print "75.57%%".
    print(f'Found embeddings for {vocab_share:.2%} of words in the vocab')
    print(f'Found embeddings for {text_share:.2%} of all text')

    # Ascending stable sort, then reversed: highest counts first.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [8]:
%%time

# Baseline: coverage of the whitespace-split text before any cleaning.
oov = check_coverage(vocab, word_to_idx)

100%|██████████| 47556/47556 [00:00<00:00, 952542.58it/s]

Found embeddings for 21.89% of words in the vocab
Found embeddings for 75.57%% of all text
CPU times: total: 0 ns
Wall time: 62.3 ms





In [9]:
# Ten highest-count out-of-vocabulary tokens (`oov` is sorted by count, highest first).
oov[:10]

[('I', 10382),
 ('The', 2121),
 ('It', 880),
 ('He', 863),
 ('But', 623),
 ('In', 592),
 ('me,', 470),
 ('And', 447),
 ('This', 391),
 ('We', 372)]

*It looks like capitalization and punctuation attached to words are the main reasons words are seen as out of vocabulary.*

In [20]:
# Check whether each punctuation mark is an entry of `word_to_idx`.
# The literal previously ended in a second, stray `]`, so that character was
# tested and reported twice.
for char in '&?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~':
    confirmation = char in word_to_idx
    print(f'The punctuation {char} is confirmed {confirmation}')

The punctuation & is confirmed True
The punctuation ? is confirmed True
The punctuation ! is confirmed True
The punctuation . is confirmed True
The punctuation , is confirmed True
The punctuation " is confirmed True
The punctuation # is confirmed True
The punctuation $ is confirmed True
The punctuation % is confirmed True
The punctuation ' is confirmed True
The punctuation ( is confirmed True
The punctuation ) is confirmed True
The punctuation * is confirmed True
The punctuation + is confirmed True
The punctuation - is confirmed True
The punctuation / is confirmed True
The punctuation : is confirmed True
The punctuation ; is confirmed True
The punctuation < is confirmed True
The punctuation = is confirmed True
The punctuation > is confirmed True
The punctuation @ is confirmed True
The punctuation [ is confirmed True
The punctuation \ is confirmed True
The punctuation ] is confirmed True
The punctuation ^ is confirmed True
The punctuation _ is confirmed True
The punctuation ` is confirm

*Every punctuation mark is in the bert-large vocabulary, and since the model reads meaning from context, a punctuation mark may carry some meaning in the context where it appears. So all I need to do is separate each punctuation mark from the word it is attached to, so that it becomes its own token.*

In [21]:
# Translation table built once instead of on every call. Each punctuation mark
# is listed exactly once: the earlier character set contained `]` and `'`
# twice, so those two marks were replaced twice and picked up two spaces.
_PUNCT_TO_SPACED = str.maketrans(
    {punct: f' {punct}' for punct in '&?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~“”’'}
)


def clean_text_punct(text):
    """
    Insert a single space in front of every punctuation mark in `text`.

    Only a leading space is added, so a mark that is directly followed by a
    word (an opening quote, an apostrophe inside a word) stays attached to
    what follows it.

    text: value to clean; it is converted with `str()` first
    return: the cleaned string
    """
    return str(text).translate(_PUNCT_TO_SPACED)

In [22]:
# Detach punctuation from the words, then measure coverage again.
# Note: this overwrites df['text'], so re-running the cell cleans the
# already-cleaned text a second time.
df['text'] = df['text'].progress_apply(clean_text_punct)

sentences = df['text'].progress_apply(str.split).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab, word_to_idx)

100%|██████████| 19579/19579 [00:00<00:00, 102523.09it/s]
100%|██████████| 19579/19579 [00:00<00:00, 75562.72it/s]
100%|██████████| 19579/19579 [00:00<00:00, 177853.07it/s]
100%|██████████| 28398/28398 [00:00<00:00, 943115.63it/s]

Found embeddings for 38.69% of words in the vocab
Found embeddings for 87.01%% of all text





*38.69% of the words are now covered, an improvement of 16.80 percentage points (up from 21.89%), which accounts for 87.01% of the total text. Not bad!*

In [23]:
# Ten highest-count out-of-vocabulary tokens after separating punctuation.
oov[:10]

[('I', 10584),
 ('The', 2121),
 ("'s", 1261),
 ('It', 880),
 ('He', 876),
 ('But', 676),
 ('In', 592),
 ('And', 469),
 ('This', 413),
 ('We', 382)]

*The number of covered words improved, but capitalization seems to be the next big issue.*

In [24]:
# Lower-case the text (the vocabulary file is the uncased one), then measure
# coverage again.
df['text'] = df['text'].str.lower()

sentences = df['text'].progress_apply(str.split).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab, word_to_idx)

100%|██████████| 19579/19579 [00:00<00:00, 248868.94it/s]
100%|██████████| 19579/19579 [00:00<00:00, 201167.21it/s]
100%|██████████| 26008/26008 [00:00<00:00, 1298605.49it/s]

Found embeddings for 45.63% of words in the vocab
Found embeddings for 93.07%% of all text





*Now 45.63% of the words are covered, up from 38.69% (an improvement of 6.94 percentage points), which is 93.07% of all the text.*

In [25]:
# Ten highest-count out-of-vocabulary tokens after lower-casing.
oov[:10]

[("'s", 1261),
 ('"i', 213),
 ('perdita', 167),
 ('"the', 131),
 ('countenance', 128),
 ('idris', 109),
 ('"you', 104),
 ('beheld', 88),
 ('"and', 86),
 ('gilman', 76)]

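*I have noticed that double quotes are still stuck to the word that follows them (e.g. `"i`, `"the`), because the punctuation cleaning only inserted a space before each mark. I need to deal with them specifically by also inserting a space after each quote.*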

In [26]:
def clean_text_quotes(text):
    """
    Insert a space after every double quote in `text`.

    text: value to clean; it is converted with `str()` first
    return: the cleaned string
    """
    return str(text).replace('"', '" ')

In [27]:
# Separate double quotes from the word that follows them, then measure
# coverage again. This overwrites df['text'].
df['text'] = df['text'].progress_apply(clean_text_quotes)

sentences = df['text'].progress_apply(str.split).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab, word_to_idx)

100%|██████████| 19579/19579 [00:00<00:00, 549641.44it/s]
100%|██████████| 19579/19579 [00:00<00:00, 229121.29it/s]
100%|██████████| 19579/19579 [00:00<00:00, 148500.40it/s]
100%|██████████| 25344/25344 [00:00<00:00, 967070.97it/s]


Found embeddings for 46.90% of words in the vocab
Found embeddings for 93.51%% of all text


*That added another 1.27 percentage points to the share of words in the vocab and an additional 0.44 percentage points to the share of all text that has embeddings.*

In [28]:
# Ten highest-count out-of-vocabulary tokens after separating double quotes.
oov[:10]

[("'s", 1261),
 ('perdita', 169),
 ('countenance', 128),
 ('idris', 109),
 ('beheld', 88),
 ('gilman', 76),
 ('alas', 75),
 ("'t", 63),
 ('exceedingly', 62),
 ('frightful', 61)]

*Now to deal with contractions i.e. turning words like didn't to did not*

In [33]:
# Literal substring -> replacement text, used to expand contractions and to
# normalise a few misspellings and stylised spellings.
#
# Every key appears exactly once. Earlier revisions repeated many keys further
# down the literal; in a dict literal the last occurrence silently wins, which
# had turned "we'll" into " will" and re-capitalised the "i'..." expansions.
# Also corrected: "c'mon" -> "come on" and "this'll" -> "this will".
contraction_mapping = {
    "Trump's": 'trump is',
    "'cause": 'because',
    ',cause': 'because',
    ';cause': 'because',
    "ain't": 'am not',
    'ain,t': 'am not',
    'ain;t': 'am not',
    'ain´t': 'am not',
    'ain’t': 'am not',
    "aren't": 'are not',
    'aren,t': 'are not',
    'aren;t': 'are not',
    'aren´t': 'are not',
    'aren’t': 'are not',
    "can't": 'cannot',
    "can't've": 'cannot have',
    'can,t': 'cannot',
    'can,t,ve': 'cannot have',
    'can;t': 'cannot',
    'can;t;ve': 'cannot have',
    'can´t': 'cannot',
    'can´t´ve': 'cannot have',
    'can’t': 'cannot',
    'can’t’ve': 'cannot have',
    "could've": 'could have',
    'could,ve': 'could have',
    'could;ve': 'could have',
    "couldn't": 'could not',
    "couldn't've": 'could not have',
    'couldn,t': 'could not',
    'couldn,t,ve': 'could not have',
    'couldn;t': 'could not',
    'couldn;t;ve': 'could not have',
    'couldn´t': 'could not',
    'couldn´t´ve': 'could not have',
    'couldn’t': 'could not',
    'couldn’t’ve': 'could not have',
    'could´ve': 'could have',
    'could’ve': 'could have',
    "didn't": 'did not',
    'didn,t': 'did not',
    'didn;t': 'did not',
    'didn´t': 'did not',
    'didn’t': 'did not',
    "doesn't": 'does not',
    'doesn,t': 'does not',
    'doesn;t': 'does not',
    'doesn´t': 'does not',
    'doesn’t': 'does not',
    "don't": 'do not',
    'don,t': 'do not',
    'don;t': 'do not',
    'don´t': 'do not',
    'don’t': 'do not',
    "dun't": 'do not',
    "hadn't": 'had not',
    "hadn't've": 'had not have',
    'hadn,t': 'had not',
    'hadn,t,ve': 'had not have',
    'hadn;t': 'had not',
    'hadn;t;ve': 'had not have',
    'hadn´t': 'had not',
    'hadn´t´ve': 'had not have',
    'hadn’t': 'had not',
    'hadn’t’ve': 'had not have',
    "hasn't": 'has not',
    'hasn,t': 'has not',
    'hasn;t': 'has not',
    'hasn´t': 'has not',
    'hasn’t': 'has not',
    "haven't": 'have not',
    'haven,t': 'have not',
    'haven;t': 'have not',
    'haven´t': 'have not',
    'haven’t': 'have not',
    "he'd": 'he would',
    "he'd've": 'he would have',
    "he'll": 'he will',
    "he's": 'he is',
    'he,d': 'he would',
    'he,d,ve': 'he would have',
    'he,ll': 'he will',
    'he,s': 'he is',
    'he;d': 'he would',
    'he;d;ve': 'he would have',
    'he;ll': 'he will',
    'he;s': 'he is',
    'he´d': 'he would',
    'he´d´ve': 'he would have',
    'he´ll': 'he will',
    'he´s': 'he is',
    'he’d': 'he would',
    'he’d’ve': 'he would have',
    'he’ll': 'he will',
    'he’s': 'he is',
    "how'd": 'how did',
    "how'll": 'how will',
    "how's": 'how is',
    'how,d': 'how did',
    'how,ll': 'how will',
    'how,s': 'how is',
    'how;d': 'how did',
    'how;ll': 'how will',
    'how;s': 'how is',
    'how´d': 'how did',
    'how´ll': 'how will',
    'how´s': 'how is',
    'how’d': 'how did',
    'how’ll': 'how will',
    'how’s': 'how is',
    "i'd": 'i would',
    "i'll": 'i will',
    "i'm": 'i am',
    "i've": 'i have',
    'i,d': 'i would',
    'i,ll': 'i will',
    'i,m': 'i am',
    'i,ve': 'i have',
    'i;d': 'i would',
    'i;ll': 'i will',
    'i;m': 'i am',
    'i;ve': 'i have',
    "isn't": 'is not',
    'isn,t': 'is not',
    'isn;t': 'is not',
    'isn´t': 'is not',
    'isn’t': 'is not',
    "it'd": 'it would',
    "it'll": 'it will',
    "It's": 'it is',
    "it's": 'it is',
    'it,d': 'it would',
    'it,ll': 'it will',
    'it,s': 'it is',
    'it;d': 'it would',
    'it;ll': 'it will',
    'it;s': 'it is',
    'it´d': 'it would',
    'it´ll': 'it will',
    'it´s': 'it is',
    'it’d': 'it would',
    'it’ll': 'it will',
    'it’s': 'it is',
    'i´d': 'i would',
    'i´ll': 'i will',
    'i´m': 'i am',
    'i´ve': 'i have',
    'i’d': 'i would',
    'i’ll': 'i will',
    'i’m': 'i am',
    'i’ve': 'i have',
    "let's": 'let us',
    'let,s': 'let us',
    'let;s': 'let us',
    'let´s': 'let us',
    'let’s': 'let us',
    "ma'am": 'madam',
    'ma,am': 'madam',
    'ma;am': 'madam',
    "mayn't": 'may not',
    'mayn,t': 'may not',
    'mayn;t': 'may not',
    'mayn´t': 'may not',
    'mayn’t': 'may not',
    'ma´am': 'madam',
    'ma’am': 'madam',
    "might've": 'might have',
    'might,ve': 'might have',
    'might;ve': 'might have',
    "mightn't": 'might not',
    'mightn,t': 'might not',
    'mightn;t': 'might not',
    'mightn´t': 'might not',
    'mightn’t': 'might not',
    'might´ve': 'might have',
    'might’ve': 'might have',
    "must've": 'must have',
    'must,ve': 'must have',
    'must;ve': 'must have',
    "mustn't": 'must not',
    'mustn,t': 'must not',
    'mustn;t': 'must not',
    'mustn´t': 'must not',
    'mustn’t': 'must not',
    'must´ve': 'must have',
    'must’ve': 'must have',
    "needn't": 'need not',
    'needn,t': 'need not',
    'needn;t': 'need not',
    'needn´t': 'need not',
    'needn’t': 'need not',
    "n't": 'not',
    "oughtn't": 'ought not',
    'oughtn,t': 'ought not',
    'oughtn;t': 'ought not',
    'oughtn´t': 'ought not',
    'oughtn’t': 'ought not',
    "sha'n't": 'shall not',
    'sha,n,t': 'shall not',
    'sha;n;t': 'shall not',
    "shan't": 'shall not',
    'shan,t': 'shall not',
    'shan;t': 'shall not',
    'shan´t': 'shall not',
    'shan’t': 'shall not',
    'sha´n´t': 'shall not',
    'sha’n’t': 'shall not',
    "she'd": 'she would',
    "she'll": 'she will',
    "she's": 'she is',
    'she,d': 'she would',
    'she,ll': 'she will',
    'she,s': 'she is',
    'she;d': 'she would',
    'she;ll': 'she will',
    'she;s': 'she is',
    'she´d': 'she would',
    'she´ll': 'she will',
    'she´s': 'she is',
    'she’d': 'she would',
    'she’ll': 'she will',
    'she’s': 'she is',
    "should've": 'should have',
    'should,ve': 'should have',
    'should;ve': 'should have',
    "shouldn't": 'should not',
    'shouldn,t': 'should not',
    'shouldn;t': 'should not',
    'shouldn´t': 'should not',
    'shouldn’t': 'should not',
    'should´ve': 'should have',
    'should’ve': 'should have',
    "that'd": 'that would',
    "that's": 'that is',
    'that,d': 'that would',
    'that,s': 'that is',
    'that;d': 'that would',
    'that;s': 'that is',
    'that´d': 'that would',
    'that´s': 'that is',
    'that’d': 'that would',
    'that’s': 'that is',
    "there'd": 'there had',
    "there's": 'there is',
    'there,d': 'there had',
    'there,s': 'there is',
    'there;d': 'there had',
    'there;s': 'there is',
    'there´d': 'there had',
    'there´s': 'there is',
    'there’d': 'there had',
    'there’s': 'there is',
    "they'd": 'they would',
    "they'll": 'they will',
    "they're": 'they are',
    "they've": 'they have',
    'they,d': 'they would',
    'they,ll': 'they will',
    'they,re': 'they are',
    'they,ve': 'they have',
    'they;d': 'they would',
    'they;ll': 'they will',
    'they;re': 'they are',
    'they;ve': 'they have',
    'they´d': 'they would',
    'they´ll': 'they will',
    'they´re': 'they are',
    'they´ve': 'they have',
    'they’d': 'they would',
    'they’ll': 'they will',
    'they’re': 'they are',
    'they’ve': 'they have',
    "wasn't": 'was not',
    'wasn,t': 'was not',
    'wasn;t': 'was not',
    'wasn´t': 'was not',
    'wasn’t': 'was not',
    "we'd": 'we would',
    "we'll": 'we will',
    "we're": 'we are',
    "we've": 'we have',
    'we,d': 'we would',
    'we,ll': 'we will',
    'we,re': 'we are',
    'we,ve': 'we have',
    'we;d': 'we would',
    'we;ll': 'we will',
    'we;re': 'we are',
    'we;ve': 'we have',
    "weren't": 'were not',
    'weren,t': 'were not',
    'weren;t': 'were not',
    'weren´t': 'were not',
    'weren’t': 'were not',
    'we´d': 'we would',
    'we´ll': 'we will',
    'we´re': 'we are',
    'we´ve': 'we have',
    'we’d': 'we would',
    'we’ll': 'we will',
    'we’re': 'we are',
    'we’ve': 'we have',
    "what'll": 'what will',
    "what're": 'what are',
    "what's": 'what is',
    "what've": 'what have',
    'what,ll': 'what will',
    'what,re': 'what are',
    'what,s': 'what is',
    'what,ve': 'what have',
    'what;ll': 'what will',
    'what;re': 'what are',
    'what;s': 'what is',
    'what;ve': 'what have',
    'what´ll': 'what will',
    'what´re': 'what are',
    'what´s': 'what is',
    'what´ve': 'what have',
    'what’ll': 'what will',
    'what’re': 'what are',
    'what’s': 'what is',
    'what’ve': 'what have',
    "where'd": 'where did',
    "where's": 'where is',
    'where,d': 'where did',
    'where,s': 'where is',
    'where;d': 'where did',
    'where;s': 'where is',
    'where´d': 'where did',
    'where´s': 'where is',
    'where’d': 'where did',
    'where’s': 'where is',
    "who'll": 'who will',
    "who's": 'who is',
    'who,ll': 'who will',
    'who,s': 'who is',
    'who;ll': 'who will',
    'who;s': 'who is',
    'who´ll': 'who will',
    'who´s': 'who is',
    'who’ll': 'who will',
    'who’s': 'who is',
    "won't": 'will not',
    'won,t': 'will not',
    'won;t': 'will not',
    'won´t': 'will not',
    'won’t': 'will not',
    "wouldn't": 'would not',
    'wouldn,t': 'would not',
    'wouldn;t': 'would not',
    'wouldn´t': 'would not',
    'wouldn’t': 'would not',
    "you'd": 'you would',
    "you'll": 'you will',
    "you're": 'you are',
    'you,d': 'you would',
    'you,ll': 'you will',
    'you,re': 'you are',
    'you;d': 'you would',
    'you;ll': 'you will',
    'you;re': 'you are',
    'you´d': 'you would',
    'you´ll': 'you will',
    'you´re': 'you are',
    'you’d': 'you would',
    'you’ll': 'you will',
    'you’re': 'you are',
    '´cause': 'because',
    '’cause': 'because',
    "you've": "you have",
    "could'nt": 'could not',
    "havn't": 'have not',
    "here’s": "here is",
    'i""m': 'i am',
    "i'am": 'i am',
    "i'l": "i will",
    "i'v": 'i have',
    "wan't": 'want',
    "was'nt": "was not",
    "who'd": "who would",
    "who're": "who are",
    "who've": "who have",
    "why'd": "why would",
    "would've": "would have",
    "y'all": "you all",
    "y'know": "you know",
    # "you.i": "you i",
    "your'e": "you are",
    "arn't": "are not",
    "agains't": "against",
    "c'mon": "come on",
    "doens't": "does not",
    'don""t': "do not",
    "dosen't": "does not",
    "dosn't": "does not",
    "shoudn't": "should not",
    "that'll": "that will",
    "there'll": "there will",
    "there're": "there are",
    "this'll": "this will",
    "u're": "you are",
    "ya'll": "you all",
    "you'r": "you are",
    "you’ve": "you have",
    "d'int": "did not",
    "did'nt": "did not",
    "din't": "did not",
    "dont't": "do not",
    "gov't": "government",
    "i'ma": "i am",
    "is'nt": "is not",
    "‘I": 'I',
    'ᴀɴᴅ': 'and',
    'ᴛʜᴇ': 'the',
    'ʜᴏᴍᴇ': 'home',
    'ᴜᴘ': 'up',
    'ʙʏ': 'by',
    'ᴀᴛ': 'at',
    '…and': 'and',
    'civilbeat': 'civil beat',
    'TrumpCare': 'Trump care',
    'Trumpcare': 'Trump care',
    'OBAMAcare': 'Obama care',
    'ᴄʜᴇᴄᴋ': 'check',
    'ғᴏʀ': 'for',
    'ᴛʜɪs': 'this',
    'ᴄᴏᴍᴘᴜᴛᴇʀ': 'computer',
    'ᴍᴏɴᴛʜ': 'month',
    'ᴡᴏʀᴋɪɴɢ': 'working',
    'ᴊᴏʙ': 'job',
    'ғʀᴏᴍ': 'from',
    'Sᴛᴀʀᴛ': 'start',
    'gubmit': 'submit',
    'CO₂': 'carbon dioxide',
    'ғɪʀsᴛ': 'first',
    'ᴇɴᴅ': 'end',
    'ᴄᴀɴ': 'can',
    'ʜᴀᴠᴇ': 'have',
    'ᴛᴏ': 'to',
    'ʟɪɴᴋ': 'link',
    'ᴏғ': 'of',
    'ʜᴏᴜʀʟʏ': 'hourly',
    'ᴡᴇᴇᴋ': 'week',
    'ᴇxᴛʀᴀ': 'extra',
    'Gʀᴇᴀᴛ': 'great',
    'sᴛᴜᴅᴇɴᴛs': 'student',
    'sᴛᴀʏ': 'stay',
    'ᴍᴏᴍs': 'mother',
    'ᴏʀ': 'or',
    'ᴀɴʏᴏɴᴇ': 'anyone',
    'ɴᴇᴇᴅɪɴɢ': 'needing',
    'ᴀɴ': 'an',
    'ɪɴᴄᴏᴍᴇ': 'income',
    'ʀᴇʟɪᴀʙʟᴇ': 'reliable',
    'ʏᴏᴜʀ': 'your',
    'sɪɢɴɪɴɢ': 'signing',
    'ʙᴏᴛᴛᴏᴍ': 'bottom',
    'ғᴏʟʟᴏᴡɪɴɢ': 'following',
    'Mᴀᴋᴇ': 'make',
    'ᴄᴏɴɴᴇᴄᴛɪᴏɴ': 'connection',
    'ɪɴᴛᴇʀɴᴇᴛ': 'internet',
    'financialpost': 'financial post',
    'ʜaᴠᴇ': ' have ',
    'ᴄaɴ': ' can ',
    'Maᴋᴇ': ' make ',
    'ʀᴇʟɪaʙʟᴇ': ' reliable ',
    'ɴᴇᴇᴅ': ' need ',
    'ᴏɴʟʏ': ' only ',
    'ᴇxᴛʀa': ' extra ',
    'aɴ': ' an ',
    'aɴʏᴏɴᴇ': ' anyone ',
    'sᴛaʏ': ' stay ',
    'Sᴛaʀᴛ': ' start',
    'SHOPO': 'shop',
    "couldnt": "could not",
    "doesnt": "does not",
    "havent": "have not",
    "shouldnt": "should not",
    "thats": "that is",
    "theres": "there is",
    "theyre": "they are",
    "'re": " are",
    "tryin'": "trying",
}

In [34]:
def _get_contraction(contraction_map):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_map.keys()))
    return contraction_map, contraction_re

def replace_typical_misspell(text):
    """
    Replace every occurrence of a `contraction_mapping` key in `text` with
    the value mapped to it.

    text: string to rewrite
    return: the rewritten string
    """
    # The pattern is rebuilt from `contraction_mapping` on every call.
    lookup, pattern = _get_contraction(contraction_mapping)
    return pattern.sub(lambda match: lookup[match.group(0)], text)

def clean_data(df, columns: list):
    """
    Run `replace_typical_misspell` over every value of the given columns.

    df: DataFrame to clean; the listed columns are overwritten on this object
    columns: names of the columns to process
    return: the same DataFrame object that was passed in
    """
    for column in columns:
        df[column] = df[column].apply(replace_typical_misspell)
    return df

In [35]:
# Expand contractions, then measure coverage again.
# NOTE(review): by this point the punctuation step has already put a space in
# front of every apostrophe ("didn't" is stored as "didn 't"), so most keys of
# the mapping can no longer match; the coverage figures below are identical
# to the previous step.
df = clean_data(df, ['text'])

sentences = df['text'].progress_apply(str.split).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab, word_to_idx)

100%|██████████| 19579/19579 [00:00<00:00, 269599.50it/s]
100%|██████████| 19579/19579 [00:00<00:00, 195622.28it/s]
100%|██████████| 25344/25344 [00:00<00:00, 781258.98it/s]

Found embeddings for 46.90% of words in the vocab
Found embeddings for 93.51%% of all text





In [39]:
# Ten highest-count out-of-vocabulary tokens after the contraction step.
oov[:10]

[("'s", 1261),
 ('perdita', 169),
 ('countenance', 128),
 ('idris', 109),
 ('beheld', 88),
 ('gilman', 76),
 ('alas', 75),
 ("'t", 63),
 ('exceedingly', 62),
 ('frightful', 61)]

*Considering how idiosyncratic these writers are, many of the tokens that end in 's are clearly possessives rather than contractions, so there is no direct way to deal with them without distorting the context an embedding would see (for now, at least). I will move on, knowing that 93% of the text is covered even though that accounts for less than half of the distinct words being used.*

In [40]:
# Find the row whose text has the most characters.
max_length_row_index = df['text'].map(len).idxmax()
longest_text = df.loc[max_length_row_index, 'text']
max_length = len(longest_text)

print("Maximum text length:", max_length)
print("Index of the row with the maximum text length:", max_length_row_index)
print("Text in the row with the maximum text length:", longest_text)

Maximum text length: 4678
Index of the row with the maximum text length: 9215
Text in the row with the maximum text length: diotima approached the fountain seated herself on a mossy mound near it and her disciples placed themselves on the grass near her without noticing me who sat close under her she continued her discourse addressing as it happened one or other of her listeners but before i attempt to repeat her words i will describe the chief of these whom she appeared to wish principally to impress one was a woman of about years of age in the full enjoyment of the most exquisite beauty her golden hair floated in ringlets on her shoulders her hazle eyes were shaded by heavy lids and her mouth the lips apart seemed to breathe sensibility but she appeared thoughtful unhappy her cheek was pale she seemed as if accustomed to suffer and as if the lessons she now heard were the only words of wisdom to which she had ever listened the youth beside her had a far different aspect his form was 

In [None]:
def find_numbers(text):
    """
    Collect every run of consecutive digits that occurs in `text`.

    text: string to scan
    return: list of the digit runs as strings, in order of appearance
    """
    digit_run = re.compile(r'\d+')
    return digit_run.findall(text)


# Store the digit runs found in each row in a new 'numbers' column (a list
# per row), then print that column sorted in descending order.
df['numbers'] = df['text'].apply(find_numbers)
print(df['numbers'].sort_values(ascending=False))

In [41]:
%%time

tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-large-uncased')
model = AutoModel.from_pretrained('google-bert/bert-large-uncased')

Some weights of the model checkpoint at google-bert/bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU times: total: 1.62 s
Wall time: 7.98 s


In [42]:
def get_bert_embeddings(sentences, batch_size=64):
    """
    Encode sentences with the module-level `tokenizer` and `model`.

    Each sentence is represented by the final-layer hidden state at
    position 0 (presumably the [CLS] token for a BERT tokenizer).

    sentences: list of strings to encode
    batch_size: number of sentences per forward pass; lower it if memory is
        tight (defaults to the previously hard-coded 64)
    return: np.ndarray with one row per sentence; an empty input gives an
        array of shape (0, hidden_size) instead of raising
    """
    model.eval()  # Put the model in evaluation mode

    # np.concatenate raises on an empty list, so handle "no input" up front.
    if len(sentences) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model currently lives on.
    device = next(model.parameters()).device
    embeddings = []

    # Wrap the range generator with tqdm for a progress bar
    for start in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Computed under no_grad, so no detach is needed; .cpu() makes the
        # conversion work when the model is not on the CPU.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    # Concatenate all batch embeddings
    return np.concatenate(embeddings, axis=0)

In [43]:
# Plain Python list of the cleaned texts, as passed to get_bert_embeddings below.
documents_train = df['text'].tolist()

In [44]:
%%time

bert_embeddings = get_bert_embeddings(documents_train)
bert_df_train = pd.DataFrame(bert_embeddings)
bert_df_train.columns = ['bert_' + str(col) for col in bert_df_train.columns]
bert_df_train.head()

Processing batches: 100%|██████████| 306/306 [3:54:38<00:00, 46.01s/it]   


CPU times: total: 7h 15min 49s
Wall time: 3h 54min 48s


Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_1014,bert_1015,bert_1016,bert_1017,bert_1018,bert_1019,bert_1020,bert_1021,bert_1022,bert_1023
0,-0.622036,-0.783141,-0.768789,-0.661886,0.028125,0.504465,-0.207343,-0.109943,-0.072224,0.794893,...,0.029174,-0.245753,-0.150378,0.802312,0.281644,0.201897,0.260581,-1.174859,0.127157,-0.27381
1,-0.701897,0.374661,-0.704999,-0.269579,0.633197,0.398946,0.595634,0.220773,0.074318,0.455393,...,0.075826,-0.567909,0.174853,-0.206451,-0.00113,0.299368,-0.053149,-0.23563,0.489391,-0.722882
2,-0.066654,-0.447025,-0.446701,0.109352,0.140808,0.046788,-0.435363,0.35756,0.830479,0.860996,...,0.875746,-0.17748,-0.641226,0.650413,0.383797,0.605456,-0.196379,-0.515951,0.297116,0.061242
3,-0.102057,-0.651902,-0.687253,0.218423,0.070657,0.326936,0.12001,0.004033,0.599281,-0.026284,...,0.326366,-0.657595,-0.287512,0.891334,0.251273,0.203012,-0.006166,-0.761718,-0.435652,0.204584
4,-0.723254,-0.508487,-0.519215,0.261606,0.076165,0.418882,-0.252534,0.405623,0.65315,0.391357,...,0.223673,-0.098319,-0.285708,0.296431,0.532552,0.990138,-0.120088,-1.080273,0.182172,0.078177


In [45]:
# Write the embedding table to CSV without the row index.
bert_df_train.to_csv('pytorch_bert_train.csv', index=False)