In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\14694\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
with open("en_US.twitter.txt", "r") as f:
    data = f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------
Last 300 letters of the data
-------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

-------


In [9]:
def split_to_sentences(data):
    """
    Split data by linebreak "\n"

    Args:
        data (str): input data

    Returns:
        list: list of sentences
    """
    sentences = data.split('\n')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    
    return sentences


def tokenize_sentences(sentences):
    
    tokenized_sentences = []
    
    for sentence in sentences:
        sentence = sentence.lower()
        tokenized_sentences.append(nltk.word_tokenize(sentence))
        
    return tokenized_sentences



In [None]:
def get_tokenized_data(data):
    
    sentences = split_to_sentences(data)
    tokenized_sentences = tokenize_sentences(sentences)
    
    return tokenized_sentences

In [11]:
tokenized_data = get_tokenized_data(data)
random.seed(100)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [20]:
def count_words(tokenize_sentences):
    word_counts = {}
    
    for sentence in tokenize_sentences:
        
        for word in sentence:
            if word not in word_counts:
                word_counts[word] = 1
            else:
                word_counts[word] += 1
    
    return word_counts

def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    
    closed_vocab = []
    
    word_counts = count_words(tokenized_sentences)
    
    for word, count in word_counts.items():
        if count >= count_threshold:
            closed_vocab.append(word)
    
    return closed_vocab


def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token = '<unk>'):
    vocabulary = set(vocabulary)
    
    replaced_tokenized_sentences = []
    
    for sentence in tokenized_sentences:
        replaced_sentence = []
        
        for word in sentence:
            if word in vocabulary:
                replaced_sentence.append(word)
            else:
                replaced_sentence.append(unknown_token)
        
        replaced_tokenized_sentences.append(replaced_sentence)
    return replaced_tokenized_sentences
  

In [22]:
def preprocess_data(train_data, test_data, count_threshold, unknown_token= "unk"):
    
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary, unknown_token)
    
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary, unknown_token)
    
    return train_data_replaced, test_data_replaced, vocabulary

In [24]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

In [25]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['some', 'of', 'the', 'highlights', 'from', 'this', 'report', 'include', 'the', 'popularity', 'of', 'games', 'by', 'platform', 'and', 'stats', 'on', 'the', 'top', 'publishers']

First preprocessed test sample:
['i', 'know', 'it', "'s", 'not', 'their', 'fault', 'personally', ',', 'but', 'where', "'s", 'the', 'unk', 'apple', 'store', 'employee', 'so', 'i', 'can', 'punch', 'them', 'in', 'the', 'face', 'real', 'quick', '?']

First 10 vocabulary:
['some', 'of', 'the', 'highlights', 'from', 'this', 'report', 'include', 'popularity', 'games']

Size of vocabulary: 14794
