In [4]:
import pandas as pd

# import spacy
import numpy as np
import sklearn as skl 

In [2]:
# Name of the label column and of the (already cleaned) title text column.
target = 'label'
input_column = 'cleanTitle'

# Load each split and keep only the two columns used below.
# NOTE: read_pickle unpickles arbitrary objects; only load files from a trusted source.
train_data = pd.read_pickle(
    '../../../Files/Submissions/train/train_split_submission.pickle'
)[[target, input_column]]
valid_data = pd.read_pickle(
    '../../../Files/Submissions/train/val_split_submission.pickle'
)[[target, input_column]]
test_data = pd.read_pickle(
    '../../../Files/Submissions/train/test_split_submission.pickle'
)[[target, input_column]]

# All three splits stacked row-wise (train, then validation, then test).
data = pd.concat(
    [
        train_data,
        valid_data,
        test_data,
    ]
)


# Whitespace-tokenise every training title (values are stringified first).
train_instances = train_data[input_column].apply(lambda title: str(title).split())
train_labels = train_data[target]

# Labels are gathered from all splits so that no split contains an unmapped label.
labelset = set(data[target])

# The vocabulary is built from the training split only.
wordset = {token for tokens in train_instances for token in tokens}

# Special tokens: padding and out-of-vocabulary.
PAD = '-PAD-'
UNK = '-UNK-'

# Vocabulary words get ids from 2 upwards in sorted order; ids 0 and 1 are
# reserved for the special tokens, which are written last so they take
# precedence if the same string also occurs in the vocabulary.
word2int = {token: index for index, token in enumerate(sorted(wordset), start=2)}
word2int.update({PAD: 0, UNK: 1})

# Labels get consecutive ids in sorted order.
label2int = dict(zip(sorted(labelset), range(len(labelset))))
# Reverse lookup: id -> label.
int2label = dict(zip(label2int.values(), label2int.keys()))


def convert2ints(instances, mapping=None, unk_index=1):
    """
    Map every token of every instance to its integer id.

    Parameters
    ----------
    instances : iterable of iterables of str
        Tokenised sentences (one inner iterable per sentence).
    mapping : dict, optional
        Token -> integer id lookup. When omitted, the module-level
        ``word2int`` dictionary is used.
    unk_index : int, optional
        Id substituted for tokens that are missing from ``mapping``.
        Defaults to 1, the id ``word2int`` assigns to the UNK token.

    Returns
    -------
    list of list of int
        One list of ids per instance, in the original order.
    """
    if mapping is None:
        # Resolved at call time so the function picks up the current vocabulary.
        mapping = word2int
    return [
        [mapping.get(word, unk_index) for word in words]
        for words in instances
    ]
                          
# Encode the training split (its titles were tokenised in an earlier statement).
train_labels_int = list(map(label2int.__getitem__, train_labels))
train_instances_int = convert2ints(train_instances)

# Tokenise and encode the test split.
test_labels = test_data[target]
test_instances = test_data[input_column].apply(lambda title: str(title).split())
test_labels_int = list(map(label2int.__getitem__, test_labels))
test_instances_int = convert2ints(test_instances)

# Tokenise and encode the validation (dev) split.
val_labels = valid_data[target]
val_instances = valid_data[input_column].apply(lambda title: str(title).split())
val_labels_int = list(map(label2int.__getitem__, val_labels))
val_instances_int = convert2ints(val_instances)


In [11]:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sorted token counts of the training titles; the entry 95% of the way
# through is used as the common sequence length.
L = sorted(len(tokens) for tokens in train_instances)
MAX_LENGTH = L[int(0.95 * len(L))]

# Bring every split to MAX_LENGTH columns, padding at the end of each sequence.
train_instances_int, test_instances_int, val_instances_int = [
    pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')
    for sequences in (train_instances_int, test_instances_int, val_instances_int)
]

In [6]:
from sklearn.linear_model import LogisticRegression 

In [7]:
# Logistic-regression classifier with library-default settings; only the
# random seed is fixed (42).
lrf = LogisticRegression(random_state=42)

In [10]:
# Sanity check: number of encoded training sequences.
len(train_instances_int)

69000

In [12]:
# Fit the classifier on the encoded training titles and their integer labels.
# NOTE(review): the feature values are vocabulary indices (plus 0 padding),
# which a linear model treats as numeric magnitudes — confirm this is the
# intended baseline rather than a bag-of-words / embedding representation.
lrf.fit(train_instances_int, train_labels_int)

LogisticRegression(random_state=42)

In [14]:
# Integer label ids the fitted model knows about; translate back with int2label.
lrf.classes_

array([0, 1, 2])