In [None]:
# Uncomment and run once if scikit-learn is missing from the kernel's environment:
# %pip install scikit-learn

In [None]:
import re
import string

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import load_model




In [None]:
# Seed NumPy's and TensorFlow's global random generators with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

Import file(s)

In [None]:
# Read the labelled HanDeSeT CSV (path is relative to the working directory).
HANDESET_CSV = "datasets/Labelled_1997_2017/HanDeSeT.csv"
train_dataset = pd.read_csv(HANDESET_CSV)



In [None]:
# Build one 'Concat' text column by joining the five utterance columns with
# single spaces; a missing utterance contributes an empty string.
concat_text = train_dataset['utt1'].fillna('')
for utterance_column in ('utt2', 'utt3', 'utt4', 'utt5'):
    concat_text = concat_text + ' ' + train_dataset[utterance_column].fillna('')
train_dataset['Concat'] = concat_text



In [None]:
# 'Concat' is assembled from fillna('') values, so it never holds NaN and
# dropna() on its own cannot remove rows without text. Keep the dropna() as a
# guard, then also discard rows whose concatenated text is only whitespace.
train_dataset = train_dataset.dropna(subset=['Concat'])
train_dataset = train_dataset[train_dataset['Concat'].str.strip() != '']

# Remove the raw utterance columns (already merged into 'Concat') and the
# metadata columns that the following cells do not use.
slimmed_train_dataset = train_dataset.drop(
    columns=['utt1', 'utt2', 'utt3', 'utt4', 'utt5', 'title', 'motion', 'id', 'party affiliation']
)
slimmed_train_dataset.head()


## Pre-processing

Split the labelled NLP data into training, validation, and test sets.

In [None]:
# Keep only the target ('manual speech') and the text ('Concat') columns.
# NOTE: despite its name, this is the full labelled frame, not a held-out test split.
test_dataset = slimmed_train_dataset.loc[:, ['manual speech', 'Concat']]


In [None]:

# Take the label + text columns straight from the slimmed frame instead of
# reading `test_dataset`: this cell also *writes* `test_dataset`, so using it as
# the input would re-split the already-held-out 20% whenever the cell is re-run.
labelled_dataset = slimmed_train_dataset[['manual speech', 'Concat']]

# Hold out 20% of all rows as the test set.
train_val_dataset, test_dataset = train_test_split(labelled_dataset, test_size=0.2, random_state=42)

# Split the remaining 80% into 75% / 25%, i.e. 60% train and 20% validation
# of the full dataset (0.25 * 0.8 = 0.2).
train_dataset, val_dataset = train_test_split(train_val_dataset, test_size=0.25, random_state=42)

# Display the first few rows of each set
print("Training Set:")
print(train_dataset.head())
print("\nValidation Set:")
print(val_dataset.head())
print("\nTest Set:")
print(test_dataset.head())

Set up a simple baseline model (TF-IDF features + logistic regression) so we can test the pipeline on the whole dataset.

In [None]:

# Targets: the 'manual speech' column of each split.
y_train, y_val, y_test = (
    split['manual speech'] for split in (train_dataset, val_dataset, test_dataset)
)

# TF-IDF features capped at 5000 terms. The vocabulary and IDF weights are
# fitted on the training text only; validation and test text are transformed
# with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_dataset['Concat'])
X_val, X_test = (
    vectorizer.transform(split['Concat']) for split in (val_dataset, test_dataset)
)

In [None]:

# Train a Logistic Regression model on the TF-IDF features
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the validation set
val_predictions = model.predict(X_val)
print("Validation Set Evaluation:")
print(classification_report(y_val, val_predictions))

# Evaluate the model on the test set
test_predictions = model.predict(X_test)
print("Test Set Evaluation:")
print(classification_report(y_test, test_predictions))

# Persist the classifier AND the fitted TF-IDF vectorizer. The classifier only
# accepts features produced by this exact vectorizer (same vocabulary and IDF
# weights), so the model file on its own cannot score new text.
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# To load and use them later:
#   model = joblib.load('logistic_regression_model.pkl')
#   vectorizer = joblib.load('tfidf_vectorizer.pkl')
#   predictions = model.predict(vectorizer.transform(new_texts))
# NOTE: joblib files are pickles; only load files from a trusted source.



Train nlp model
If we are going to use the Keras model from the … (TODO: this sentence was left unfinished; complete it or remove it.)

variables

In [None]:
# --- Text vectorization ---
max_features = 10000    # vocabulary cap, passed to TextVectorization as max_tokens
sequence_length = 250   # passed to TextVectorization as output_sequence_length

# --- Model ---
embedding_dim = 32
dropout_1 = dropout_2 = 0.2   # both dropout layers share the same rate

# --- Training ---
epochs = 20

# Early stopping
_min_delta = 0.001
_patience = 1


In [None]:
# Layer that lower-cases text, strips punctuation, and maps each string to a
# fixed-length sequence of integer token ids.
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Extract the 'Concat' column for adaptation
train_text = train_dataset['Concat'].values

# Adapt the vectorize_layer to the training data only, so the vocabulary is
# not influenced by validation or test text.
vectorize_layer.adapt(train_text)


def vectorize_text(text, label):
    """Turn a batch of raw strings and labels into model-ready tensors.

    Args:
        text: string tensor of shape (batch_size,).
        label: label tensor of shape (batch_size,).

    Returns:
        A (token_ids, label) pair, where token_ids is the output of
        `vectorize_layer` and label has a trailing axis added so that it
        lines up with the single-unit output of the model.
    """
    text = tf.expand_dims(text, -1)
    label = tf.expand_dims(label, -1)  # (batch_size,) -> (batch_size, 1)
    return vectorize_layer(text), label


def dataframe_to_dataset(df):
    """Wrap the 'Concat' text and 'manual speech' labels of `df` in an unbatched tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((df['Concat'].values, df['manual speech'].values))


# Batch BEFORE vectorizing. Without .batch() every element is a single example,
# so fit()/evaluate() would run with an effective batch size of 1 (one gradient
# step per row), which is slow and noisy.
BATCH_SIZE = 32

X_train = dataframe_to_dataset(train_dataset).batch(BATCH_SIZE).map(vectorize_text)
X_val = dataframe_to_dataset(val_dataset).batch(BATCH_SIZE).map(vectorize_text)
X_test = dataframe_to_dataset(test_dataset).batch(BATCH_SIZE).map(vectorize_text)


In [None]:

# Bag-of-embeddings classifier: embed token ids, average over the sequence
# axis, and project to one output unit. The final Dense layer has no
# activation, so the model emits a raw logit (the compile step uses
# from_logits=True accordingly).
tfmodel = tf.keras.Sequential()
tfmodel.add(layers.Embedding(max_features + 1, embedding_dim))
tfmodel.add(layers.Dropout(dropout_1))
tfmodel.add(layers.GlobalAveragePooling1D())
tfmodel.add(layers.Dropout(dropout_2))
tfmodel.add(layers.Dense(1))


In [None]:
# Stop training once validation accuracy has failed to improve by at least
# `_min_delta` for `_patience` consecutive epoch(s).
earlystop_callback = EarlyStopping(monitor='val_accuracy', patience=_patience, min_delta=_min_delta)

compile model

In [None]:
# The model outputs raw logits (the loss uses from_logits=True), so a positive
# prediction corresponds to logit > 0. The string shortcut 'accuracy' thresholds
# predictions at 0.5, which is only correct for probabilities and would
# mis-report accuracy here. Use an explicit BinaryAccuracy with threshold=0.0,
# named 'accuracy' so the history keys and the 'val_accuracy' monitor used by
# early stopping keep their names.
tfmodel.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

fit model

In [None]:
# Train on the training dataset and validate after every epoch. `callbacks` is
# documented as a list of Callback instances, so wrap the early-stopping
# callback in a list instead of passing the bare object.
tfmodel_history = tfmodel.fit(
    X_train,
    validation_data=X_val,
    epochs=epochs,
    callbacks=[earlystop_callback],
    verbose=1)

In [None]:
# Score the trained network on the held-out test dataset and report the
# loss and accuracy rounded to two decimals.
loss, accuracy = tfmodel.evaluate(X_test)
print(
    "EVALUATION ON TEST DATASET",
    f"Loss : {loss:.2f}",
    f"Accuracy : {accuracy:.2f}",
    sep="\n",
)

In [None]:
# Write the trained Keras model to disk in the `.keras` format.
tfmodel.save(filepath='rebels.keras')

In [None]:
# Reload the model from 'rebels.keras' to confirm the saved file round-trips.
# `load_model` is imported in the notebook's import cell rather than here, so
# all dependencies are declared in one place.
tfmodel = load_model('rebels.keras')

Assess model

At this point, break out to another group to use the trained model on the unlabelled data.