# BoW

In [1]:
import os
# Directory that receives every preprocessed CSV written by this notebook.
output_directory = '../data/preprocessed'
# exist_ok=True avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(output_directory, exist_ok=True)

In [2]:
import json
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def _load_ndjson(path):
    """Parse a newline-delimited JSON file into a list with one object per line.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising ``json.JSONDecodeError``.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


def _to_token_strings(records):
    """Join each record's ``'text'`` entries into one space-separated string."""
    return [' '.join(map(str, sample['text'])) for sample in records]


# Domain 1 training samples and their whitespace-joined token strings.
data1 = _load_ndjson("../data/domain1_train.json")
data_strings1 = _to_token_strings(data1)

# Domain 2 training samples and their whitespace-joined token strings.
data2 = _load_ndjson("../data/domain2_train.json")
data_strings2 = _to_token_strings(data2)


In [3]:
# Number of distinct token ids the notebook expects (ids '0' .. '4999').
VOCAB_SIZE = 5000

# The samples are already tokenised, so split on whitespace only.
# token_pattern=None silences scikit-learn's warning that the default regex
# is ignored whenever a custom tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), token_pattern=None)
X = vectorizer.fit_transform(data_strings1)

# Tokens that actually occur in the domain 1 training data.
unique_tokens_from_vectorizer = set(vectorizer.get_feature_names_out())

# Token ids as strings, to compare against the vectorizer's string features.
all_possible_tokens = set(map(str, range(VOCAB_SIZE)))

# Ids in the expected range that never appear in the domain 1 training data.
missing_tokens = all_possible_tokens - unique_tokens_from_vectorizer

print("Number of missing tokens:", len(missing_tokens))
print("Missing tokens:", missing_tokens)


Number of missing tokens: 74
Missing tokens: {'2444', '3416', '3163', '3401', '4129', '3953', '3213', '2578', '2316', '4444', '3145', '3144', '2571', '4814', '4155', '4838', '4999', '4421', '25', '3501', '2467', '4766', '2858', '910', '4414', '4395', '4400', '4026', '3867', '4311', '4674', '238', '4636', '4538', '4260', '3425', '2907', '4134', '4828', '4732', '4094', '4090', '4497', '4350', '3448', '3836', '2793', '3956', '3339', '4622', '4274', '3562', '4443', '4119', '4944', '4522', '2619', '4338', '2754', '4957', '3811', '2880', '4788', '3965', '4259', '4105', '4477', '2915', '3823', '4080', '4852', '3605', '4716', '3236'}


Hence the BoW matrix fitted on the domain 1 training data has only 4926 columns instead of 5000.
However, since the Kaggle test data may contain these missing tokens, we will keep the number of columns at 5000.

In [4]:
def bow(data, domain, vocab_size=5000, output_dir='../data/preprocessed'):
    """Write a bag-of-words count matrix for ``data`` to a CSV file.

    Parameters
    ----------
    data : iterable of str
        One whitespace-separated token string per sample.
    domain : str
        Prefix of the output file name (``{domain}_train_bow.csv``).
    vocab_size : int, default 5000
        Number of columns; the vocabulary is the strings '0' .. str(vocab_size - 1),
        so tokens absent from ``data`` still get an (all-zero) column.
    output_dir : str, default '../data/preprocessed'
        Existing directory the CSV is written to.
    """
    vocab = list(map(str, range(vocab_size)))

    # Fixed vocabulary keeps the column layout identical across datasets.
    # token_pattern=None silences scikit-learn's warning that the default
    # regex is ignored when a custom tokenizer is supplied.
    vectorizer = CountVectorizer(
        tokenizer=lambda x: x.split(),
        token_pattern=None,
        vocabulary=vocab,
    )
    X = vectorizer.fit_transform(data)

    # NOTE: toarray() densifies the sparse matrix (n_samples x vocab_size),
    # which can be memory-hungry for large inputs.
    df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    # The DataFrame index is written as the first CSV column (pandas default).
    df.to_csv(f'{output_dir}/{domain}_train_bow.csv')

In [5]:
bow(data_strings1, 'domain1')

In [6]:
bow(data_strings2, 'domain2')

# TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf(data, domain, vocab_size=5000, output_dir='../data/preprocessed'):
    """Print and write a TF-IDF matrix for ``data`` to a CSV file.

    The IDF weights are fitted on ``data`` itself.

    Parameters
    ----------
    data : iterable of str
        One whitespace-separated token string per sample.
    domain : str
        Prefix of the output file name (``{domain}_train_tfidf.csv``).
    vocab_size : int, default 5000
        Number of columns; the vocabulary is the strings '0' .. str(vocab_size - 1),
        so tokens absent from ``data`` still get an (all-zero) column.
    output_dir : str, default '../data/preprocessed'
        Existing directory the CSV is written to.
    """
    # Define the vocabulary
    vocab = list(map(str, range(vocab_size)))

    # Fixed vocabulary keeps the column layout identical across datasets.
    # token_pattern=None silences scikit-learn's warning that the default
    # regex is ignored when a custom tokenizer is supplied.
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=lambda x: x.split(),
        token_pattern=None,
        vocabulary=vocab,
    )
    X_tfidf_direct = tfidf_vectorizer.fit_transform(data)

    # NOTE: toarray() densifies the sparse matrix (n_samples x vocab_size),
    # which can be memory-hungry for large inputs.
    df_tfidf_direct = pd.DataFrame(
        X_tfidf_direct.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
    )

    print(df_tfidf_direct)
    # The DataFrame index is written as the first CSV column (pandas default).
    df_tfidf_direct.to_csv(f'{output_dir}/{domain}_train_tfidf.csv')


In [12]:
tfidf(data_strings1, 'domain1')

              0         1         2         3         4         5         6  \
0      0.414947  0.000000  0.065837  0.000000  0.000000  0.000000  0.000000   
1      0.053230  0.055328  0.059119  0.000000  0.080729  0.000000  0.074856   
2      0.214801  0.037212  0.000000  0.045032  0.108590  0.000000  0.000000   
3      0.141684  0.049090  0.104907  0.118815  0.000000  0.000000  0.000000   
4      0.181774  0.047235  0.050472  0.057163  0.068921  0.000000  0.063906   
...         ...       ...       ...       ...       ...       ...       ...   
19495  0.036312  0.037743  0.040329  0.091351  0.055071  0.060088  0.051064   
19496  0.075273  0.039120  0.125402  0.142027  0.057080  0.062281  0.052927   
19497  0.318742  0.239278  0.059001  0.200470  0.080568  0.117212  0.000000   
19498  0.046187  0.048008  0.051297  0.058098  0.000000  0.305719  0.194854   
19499  0.063545  0.198150  0.000000  0.159864  0.000000  0.000000  0.000000   

              7         8         9  ...  4990  499

In [13]:
tfidf(data_strings2, 'domain2')

              0         1         2         3         4         5         6  \
0      0.143333  0.216323  0.095259  0.000000  0.095743  0.098240  0.062053   
1      0.169637  0.153612  0.169111  0.104404  0.042492  0.000000  0.000000   
2      0.562023  0.151468  0.186761  0.214129  0.080446  0.082544  0.095589   
3      0.000000  0.178949  0.000000  0.000000  0.079201  0.000000  0.000000   
4      0.254114  0.210934  0.147774  0.026066  0.063653  0.043542  0.055007   
...         ...       ...       ...       ...       ...       ...       ...   
14895  0.186567  0.107509  0.016908  0.041754  0.050981  0.017437  0.022028   
14896  0.000000  0.183528  0.040409  0.000000  0.000000  0.083346  0.052646   
14897  0.099553  0.150248  0.165407  0.000000  0.099748  0.068233  0.000000   
14898  0.000000  0.176558  0.077749  0.000000  0.039072  0.080181  0.050647   
14899  0.080593  0.182451  0.000000  0.099203  0.040376  0.041429  0.052337   

              7         8         9  ...  4990  499

# First 200 
We want to represent each text as a fixed-length sequence of tokens (e.g., 200 tokens). Sequences shorter than this length are padded, but we do not want to pad with 0 because 0 already has a meaning in the dataset.

Padding token: we use 5000 as the padding token.

For each sequence:
- If its length is more than 200, keep only the first 200 tokens.
- If its length is less than 200, pad the sequence with the token 5000 until its length is 200.

In [14]:
def pad_or_truncate(sequence, target_length=200, padding_token=5000):
    """Return a new list of exactly ``target_length`` items built from ``sequence``.

    Longer inputs keep only their first ``target_length`` items; shorter inputs
    are right-padded with ``padding_token``. The result is always a fresh list,
    so mutating it never affects ``sequence`` (including when the input already
    has the target length).

    Parameters
    ----------
    sequence : iterable
        Tokens to fit to the target length (list, tuple, ...).
    target_length : int, default 200
        Required output length; must be non-negative.
    padding_token : any, default 5000
        Value appended to short sequences.

    Raises
    ------
    ValueError
        If ``target_length`` is negative.
    """
    if target_length < 0:
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    # Copy-then-slice truncates long inputs and copies short/exact ones.
    result = list(sequence)[:target_length]
    # The multiplier is 0 when the sequence was already long enough.
    result.extend([padding_token] * (target_length - len(result)))
    return result

In [19]:
def first_200(data, domain, target_length=200, padding_token=5000):
    """Write fixed-length token sequences plus labels for ``data`` to a CSV file.

    Each sample's ``"text"`` is truncated or right-padded to ``target_length``
    tokens via ``pad_or_truncate``; the sample's ``"label"`` is stored in a
    final ``label`` column.

    Parameters
    ----------
    data : iterable of dict
        Samples with a ``"text"`` token list and a ``"label"`` entry.
    domain : str
        Prefix of the output file name.
    target_length : int, default 200
        Number of token columns; also used in the file name
        (``{domain}_train_{target_length}.csv``), so the default writes
        ``{domain}_train_200.csv``.
    padding_token : int, default 5000
        Token used to pad short sequences.
    """
    # Pass the length explicitly so the padded width and the column count
    # below cannot drift apart.
    processed_texts = [
        pad_or_truncate(item["text"], target_length, padding_token)
        for item in data
    ]
    labels = [item["label"] for item in data]

    # Convert to DataFrame: one column per token position, named '0', '1', ...
    df = pd.DataFrame(processed_texts, columns=[str(i) for i in range(target_length)])
    df["label"] = labels

    # Save the DataFrame without the index column
    output_filename = f'../data/preprocessed/{domain}_train_{target_length}.csv'
    df.to_csv(output_filename, index=False)

In [20]:
first_200(data1, "domain1")

In [21]:
first_200(data2, "domain2")