In [1]:
import pandas as pd

In [4]:
import tensorflow as tf
import numpy as np

In [5]:
def splitText(dataFrame: pd.DataFrame) -> pd.DataFrame:
    dataFrame.text = dataFrame.text.apply(lambda x: x.split())
    dataFrame.labels = dataFrame.labels.apply(lambda x: x.split())
    # splits the dataframe into lists instead of strings 
    
    return dataFrame

In [6]:
df = pd.read_csv("ner.csv")
# loading dataset 

df = splitText(df)
# converting the text to lists 
df

Unnamed: 0,text,labels
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
3,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
4,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."
...,...,...
47954,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O..."
47955,"[On, Thursday, ,, Iranian, state, media, publi...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B..."
47956,"[Following, Iran, 's, disputed, June, 12, elec...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ..."
47957,"[Since, then, ,, authorities, have, held, publ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Label encoder

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
def getEncoder(dataFrame: pd.DataFrame) -> LabelEncoder:
    labelEncoder = LabelEncoder()
    labelEncoder.fit(["U"] + list(set(dataFrame.labels.explode())))

    return labelEncoder

In [9]:
labelEncoder = getEncoder(df)

totalClasses = len(labelEncoder.classes_)
totalClasses, labelEncoder.classes_
# initializing a labelEncoder with all the classes

(18,
 array(['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per',
        'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org',
        'I-per', 'I-tim', 'O', 'U'], dtype='<U5'))

In [10]:
def labelEncodeDataFrame(dataFrame: pd.DataFrame, labelEncoder:LabelEncoder) -> pd.DataFrame:
    dataFrame.labels = dataFrame.labels.apply(lambda x: labelEncoder.transform(x))
    return dataFrame

In [11]:
df = labelEncodeDataFrame(df, labelEncoder)
df
# label encoding the dataframe 

Unnamed: 0,text,labels
0,"[Thousands, of, demonstrators, have, marched, ...","[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16..."
1,"[Iranian, officials, say, they, expect, to, ge...","[3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
2,"[Helicopter, gunships, Saturday, pounded, mili...","[16, 16, 7, 16, 16, 16, 16, 16, 2, 16, 16, 16,..."
3,"[They, left, after, a, tense, hour-long, stand...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
4,"[U.N., relief, coordinator, Jan, Egeland, said...","[2, 16, 16, 6, 14, 16, 7, 16, 2, 16, 3, 16, 3,..."
...,...,...
47954,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[16, 16, 16, 6, 14, 16, 16, 16, 16, 16, 16, 16..."
47955,"[On, Thursday, ,, Iranian, state, media, publi...","[16, 7, 16, 3, 16, 16, 16, 16, 16, 16, 16, 16,..."
47956,"[Following, Iran, 's, disputed, June, 12, elec...","[16, 2, 16, 16, 7, 15, 16, 16, 16, 16, 16, 16,..."
47957,"[Since, then, ,, authorities, have, held, publ...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."


## Tokenization

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
def getTokenizer(dataFrame:pd.DataFrame) -> Tokenizer:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataFrame.text.explode("text").to_list())
    # initializing the tokenizer 

    return tokenizer

In [14]:
tokenizer = getTokenizer(df)

totalClasses = len(tokenizer.word_index) + 1
totalClasses
# initializing a tokenizer 

27943

In [15]:
def sequence(wordList:list, tokenizer:Tokenizer):
    seq = tokenizer.texts_to_sequences(wordList)
    return seq

In [16]:
def padSequence(sequence:list, maxLen:int, filler:int=0):
    if len(sequence) > maxLen:
        return sequence[:maxLen]
    totalZeros = maxLen - len(sequence) 
    return sequence + [filler for _ in range(totalZeros)]
# pad function 

In [17]:
def prepareData(sequence:list[list], tags:list, unknownFiller:int):
    assert len(sequence) == len(tags)
    newTags = []
    newSequence = []
    for i, s in enumerate(sequence):
        if len(s) == 0:
            newTags.append(unknownFiller)
            newSequence.append(0)
        else:
            for item in s:
                newTags.append(tags[i])
                newSequence.append(item)
                
    return newTags, newSequence

In [11]:
def preparePaddedData(sequence:list[list], tags:list, unknownFiller:int, maxLen:int, totalClasses:int):
    tags, sequence = prepareData(sequence, tags, unknownFiller)
    newTags = padSequence(tags, maxLen)
    newSequence = padSequence(sequence, maxLen)
    
    return newSequence, tf.one_hot(newTags, totalClasses)

In [19]:
maxLen = 35

## Data Pipeline

In [20]:
def prepareDataFrame(fileName:str, split=True, splitRatio:float=0.85) -> pd.DataFrame | tuple[pd.DataFrame]:
    df = pd.read_csv(fileName)
    df = splitText(df)
    df = labelEncodeDataFrame(df, labelEncoder)

    if split:
        df = df.sample(frac=1)
        splitIndex = int(df.shape[0] * splitRatio)
        train = df.iloc[:splitIndex].reset_index(drop=True)
        test = df.iloc[splitIndex:].reset_index(drop=True)
        return train, test
    else:
        return df

In [21]:
def dataGenerator(df:pd.DataFrame,tokenizer:Tokenizer,maxLen:int, totalClasses:int, fields=("text", "labels")):
    for x, y in zip(df[fields[0]], df[fields[1]]):
        
        s, t = preparePaddedData(sequence(x, tokenizer), y, totalClasses-1, maxLen, totalClasses)
        
        yield tf.constant(s, dtype=tf.float32), tf.constant(t, dtype=tf.float32)

In [36]:
def getTensorflowDataset(df:pd.DataFrame,tokenizer:Tokenizer,maxLen:int, totalClasses:int, fields=("text", "labels"), batchSize=64):
    dataset = tf.data.Dataset.from_generator(lambda : dataGenerator(df, tokenizer, maxLen, totalClasses, fields), output_signature=(
        tf.TensorSpec(shape=(maxLen,), dtype=tf.float32),
        tf.TensorSpec(shape=(maxLen, totalClasses), dtype=tf.float32),
    )).batch(batchSize)
    return dataset

In [37]:
dataset = prepareDataFrame("ner.csv", False)
train, test = prepareDataFrame("ner.csv")

In [38]:
trainingSet = getTensorflowDataset(train, totalClasses, maxLen, totalClasses)
testingSet = getTensorflowDataset(test, totalClasses, maxLen, totalClasses)

In [40]:
for x, y in trainingSet:
    if x.shape != y.shape:
        print(x, y)
        break
print("finished")

tf.Tensor(
[[1.2800e+02 2.2500e+03 1.0800e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.0000e+00 3.6000e+01 3.1320e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.8400e+02 5.1000e+01 3.3000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [1.7530e+03 7.0000e+00 2.3600e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.0000e+00 1.0600e+02 1.5581e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.8000e+01 1.1280e+03 6.3000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]], shape=(64, 35), dtype=float32) tf.Tensor(
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 ...

 [[0

## Testing


In [2]:
from modelPrep.utils import prepareDataFrame as p

In [3]:
train, test = p("ner.csv")

In [4]:
test

Unnamed: 0,text,labels
0,"[The, executives, say, their, profits, are, re...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
1,"[The, demonstrators, Sunday, called, on, the, ...","[16, 16, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
2,"[Officials, and, media, reports, from, CNN-Tur...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
3,"[He, told, reporters, today, that, he, will, p...","[16, 16, 16, 7, 16, 16, 16, 16, 16, 16, 5, 16,..."
4,"[Iraqi, legislators, have, been, grappling, ov...","[3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
...,...,...
7189,"[Temperatures, in, Moscow, have, hovered, near...","[16, 16, 2, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
7190,"[In, another, development, Friday, ,, thousand...","[16, 16, 16, 7, 16, 16, 16, 3, 16, 16, 16, 16,..."
7191,"[Three, hurricanes, briefly, reached, the, mos...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
7192,"[The, U.S., military, in, Afghanistan, says, a...","[16, 2, 16, 16, 2, 16, 16, 5, 13, 16, 16, 3, 1..."


In [12]:
from modelPrep.utils import getTensorflowDataset 
from modelPrep.utils import getTokenizer 

In [6]:
tokenizer = t(pd.concat([train, test]))

27943

In [7]:
len(tokenizer.word_index)+1

27943

In [13]:
trainingSet = getTensorflowDataset(train, tokenizer, 35, 18)
testingSet = getTensorflowDataset(train, tokenizer, 35, 18)

In [14]:
for x, y in testingSet:
    print(x.shape, y.shape)
    break

(64, 35) (64, 35, 18)
