In [1]:
# Import required libraries
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
# Environment check: report the TensorFlow build and the GPUs it can see.
print("TensorFlow version:", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


TensorFlow version: 2.10.1
Num GPUs Available:  0


In [3]:
# Define list of dataset
# NOTE(review): `dir` shadows the built-in dir(); the name is kept unchanged
# because other cells may reference it.
dir = '../dataset/'

# Path.glob() yields entries in an unspecified, platform-dependent order.
# Sorting makes positional access such as datasets[0] pick the same file on
# every machine and every run. "best_dataset.csv" is excluded from the list.
datasets = sorted(
    file for file in Path(dir).glob('*.csv') if file.name != "best_dataset.csv"
)

datasets


[WindowsPath('../dataset/set_01_02_03_04_0_0_0_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_0_0_1_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_0_1_0_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_0_1_1_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_1_0_0_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_1_0_1_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_1_1_0_new.csv'),
 WindowsPath('../dataset/set_01_02_03_04_1_1_1_new.csv')]

In [4]:
# Helper function to load data
def load_data(filename, n_rows=2):
    """Load a header-less two-column CSV and expose its text and labels.

    The file is read without a header row, rows containing missing values
    are dropped, and the two columns are named ``label`` and ``data`` (in
    that order). Only the first ``n_rows`` remaining rows are kept.

    For backward compatibility the result is still published through the
    module-level globals ``X`` (the ``data`` column) and ``y`` (the ``label``
    column). The same pair is also returned so callers can avoid relying on
    global state.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read (UTF-8 encoded).
    n_rows : int or None, default 2
        Number of leading rows to keep. Pass ``None`` to keep every row.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(X, y)``: the ``data`` column and the ``label`` column.

    Raises
    ------
    ValueError
        Raised by pandas when the file does not have exactly two columns.
    """
    global X, y

    print(filename)

    df = pd.read_csv(filename, header=None, encoding='utf-8').dropna()
    df.columns = ['label', 'data']
    if n_rows is not None:
        # The default of 2 keeps the printed demo matrices small.
        df = df.iloc[:n_rows]

    X = df['data']
    y = df['label']
    return X, y


In [5]:
# Module-level slots that load_data() fills in through `global X, y`.
X = None
y = None

# CountVectorizer demo on the first dataset.
load_data(datasets[0])
vectorizer = CountVectorizer()

# Learn the vocabulary and build the dense count matrix in a single pass.
X = vectorizer.fit_transform(X).toarray()

print(vectorizer.get_feature_names_out())
print(X)
print()


..\dataset\set_01_02_03_04_0_0_0_new.csv
['attempt' 'await' 'before' 'box' 'call' 'callcost' 'cash' 'collection'
 'complimentary' 'contact' 'cs' 'csbcm' 'from' 'have' 'holiday' 'hp' 'is'
 'landline' 'max' 'mobilesvary' 'nd' 'number' 'or' 'ppm' 'sae' 'tenerife'
 'the' 'this' 'to' 'urgent' 'wc' 'won' 'xx' 'yf' 'you' 'your']
[[ 0  1  0  1  1  0  1  1  1  0  1  0  1  0  1  1  0  1  0  0  0  9  1  1
   1  1  0  0  0  1  0  0  0  1  0  1]
 [ 1  0  1  0  1  1  0  0  0  1  0  1  0  1  0  0  1  0  1  1  1 11  0  1
   0  0  1  1  1  1  1  1  1  0  2  0]]



In [6]:
# TfidfVectorizer demo on the first dataset (load_data refreshes the global X).
load_data(datasets[0])
vectorizer = TfidfVectorizer()

# Fit on the loaded texts and convert them to a dense matrix in one step.
X = vectorizer.fit_transform(X).toarray()

print(vectorizer.get_feature_names_out())
print(X)
print()


..\dataset\set_01_02_03_04_0_0_0_new.csv
['attempt' 'await' 'before' 'box' 'call' 'callcost' 'cash' 'collection'
 'complimentary' 'contact' 'cs' 'csbcm' 'from' 'have' 'holiday' 'hp' 'is'
 'landline' 'max' 'mobilesvary' 'nd' 'number' 'or' 'ppm' 'sae' 'tenerife'
 'the' 'this' 'to' 'urgent' 'wc' 'won' 'xx' 'yf' 'you' 'your']
[[0.         0.13184802 0.         0.13184802 0.09381095 0.
  0.13184802 0.13184802 0.13184802 0.         0.13184802 0.
  0.13184802 0.         0.13184802 0.13184802 0.         0.13184802
  0.         0.         0.         0.84429854 0.13184802 0.09381095
  0.13184802 0.13184802 0.         0.         0.         0.09381095
  0.         0.         0.         0.13184802 0.         0.13184802]
 [0.10991384 0.         0.10991384 0.         0.0782046  0.10991384
  0.         0.         0.         0.10991384 0.         0.10991384
  0.         0.10991384 0.         0.         0.10991384 0.
  0.10991384 0.10991384 0.10991384 0.8602506  0.         0.0782046
  0.         0.     

In [7]:
# HashingVectorizer demo on the first dataset (load_data refreshes the global X).
load_data(datasets[0])
vectorizer = HashingVectorizer()

# Fit and transform the loaded texts, then densify the result for printing.
X = vectorizer.fit_transform(X).toarray()

print(X)
print()


..\dataset\set_01_02_03_04_0_0_0_new.csv
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]



In [8]:
# Keras Tokenizer demo on the first dataset (load_data refreshes the global X).
load_data(datasets[0])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Map each text to its integer sequence, then pad/truncate at the end
# so every row has length 100.
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

print(tokenizer.word_index)
print(X)
print()


..\dataset\set_01_02_03_04_0_0_0_new.csv
{'number': 1, 'urgent': 2, 'call': 3, 't': 4, 'ppm': 5, 'you': 6, 'from': 7, 'landline': 8, 'your': 9, 'complimentary': 10, 'tenerife': 11, 'holiday': 12, 'or': 13, 'cash': 14, 'await': 15, 'collection': 16, 'sae': 17, 'cs': 18, 'box': 19, 'hp': 20, 'yf': 21, 'this': 22, 'is': 23, 'the': 24, 'nd': 25, 'attempt': 26, 'to': 27, 'contact': 28, 'have': 29, 'won': 30, 'before': 31, 'csbcm': 32, 'wc': 33, 'n': 34, 'xx': 35, 'callcost': 36, 'mobilesvary': 37, 'max': 38}
[[ 2  3  1  7  8  9 10  1 11 12 13  1  1 14 15 16 17  4 18 19  1 20  1  1
  21  1  5  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [ 1  2 22 23 24  1 25 26 27 28  6  6 29 30  1  3  1 31  1  4 32  1 33  1
  34  1 35 36  1  5 37 38  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0 