Preprocessing images

In [1]:
import pandas as pd
import tensorflow as tf
import os

In [2]:


def resize_and_pad_image(input_path, output_path, target_height=224, target_width=224):
    image = tf.io.read_file(input_path)
    image = tf.image.decode_image(image)

    # Calculate the new width to maintain aspect ratio
    original_height, original_width = tf.shape(image)[0], tf.shape(image)[1]
    aspect_ratio = original_width / original_height
    if aspect_ratio > 1:  # Image is wider than it is tall
        new_width = target_width
        new_height = tf.cast(target_width / aspect_ratio, tf.int32)
    else:  # Image is taller than it is wide
        new_height = target_height
        new_width = tf.cast(target_height * aspect_ratio, tf.int32)
    
    resized_image = tf.image.resize(image, [new_height, new_width], method=tf.image.ResizeMethod.AREA)
    resized_image = tf.cast(resized_image, tf.uint8)

    # Pad the image to target dimensions
    delta_height = target_height - new_height
    delta_width = target_width - new_width
    pad_top = delta_height // 2
    pad_bottom = delta_height - pad_top
    pad_left = delta_width // 2
    pad_right = delta_width - pad_left

    padded_image = tf.image.pad_to_bounding_box(resized_image, pad_top, pad_left, target_height, target_width)

    encoded_image = tf.io.encode_jpeg(padded_image)
    tf.io.write_file(output_path, encoded_image)

# Example usage
input_directory = "C:\\Kingsley\\multimodia-hatespeech\\image_latest(balanced)(testing)"
output_directory = "C:\\Kingsley\\multimodia-hatespeech\\image_latest(balanced)(testing)(resized)"
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    if filename.endswith(".jpg") or filename.endswith(".png"):  # Adjust the extensions as needed
        input_path = os.path.join(input_directory, filename)
        output_path = os.path.join(output_directory, filename)
        resize_and_pad_image(input_path, output_path, target_height=224, target_width=224)


Feature Extraction

In [16]:
# Load the CSV file
file_path = 'train.xlsx'  # replace with your file path
file_path2 = 'test.xlsx'
file_path3 = 'validation.xlsx'


df = pd.read_excel(file_path)
df2 = pd.read_excel(file_path2)
df3 = pd.read_excel(file_path3)

# Map the 'bully/nonbully_str' column to numerical values
mapping = {'bully': 1, 'non-bully': 0}
df['target'] = df['bully/nonbully_str'].map(mapping)
df2['target'] = df['bully/nonbully_str'].map(mapping)
df3['target'] = df['bully/nonbully_str'].map(mapping)

# Save the transformed data back to a CSV file (optional)
output_file_path = 'transformed_file.csv'  # replace with your desired output file path
df.to_csv(output_file_path, index=False)

print("Transformation complete. The transformed file is saved as:", output_file_path)


In [3]:
file_path4 = 'merged(Text)(CleanedLatest).xlsx'

df4 = pd.read_excel(file_path4)
mapping = {'bully': 1, 'non-bully': 0}
df4['target'] = df4['bully/nonbully_str'].map(mapping)

In [32]:
# Save the transformed data back to a CSV file (optional)
output_file_path = 'train(new).xlsx'  # replace with your desired output file path
df.to_excel(output_file_path, index=False)

output_file_path2 = 'test(new).xlsx'  # replace with your desired output file path
df2.to_excel(output_file_path2, index=False)

output_file_path3 = 'validation(new).xlsx'  # replace with your desired output file path
df3.to_excel(output_file_path3, index=False)

print("Transformation complete. The transformed file is saved as:", output_file_path)
print("Transformation complete. The transformed file is saved as:", output_file_path2)
print("Transformation complete. The transformed file is saved as:", output_file_path3)

Transformation complete. The transformed file is saved as: train(new).xlsx
Transformation complete. The transformed file is saved as: test(new).xlsx
Transformation complete. The transformed file is saved as: validation(new).xlsx


In [13]:
output_file_path4 = 'merged(Text)(CleanedLatest)(latest).xlsx'  # replace with your desired output file path
df4.to_excel(output_file_path4, index=False)

In [27]:
df


Unnamed: 0,img_id,tweet_text_processed,img_text,target
0,1051575979045457922,atm fraudster arrested swapping victim card,,0
1,1054304579368747009,another sale get one riding bare cock till cre...,,1
2,1044370988232634368,mom new retarded,,1
3,1053435457420308480,henry van dyke via,friend ne hart need ne heury dike brainyquote,0
4,1108484731346345984,nigga never interested try make jealous,,0
...,...,...,...,...
92507,1050945729395650560,sjw idiot want power use physical threat viole...,social justice warrior western civilization,1
92508,1114624709415424000,even nigga got better rhyme little cozy,cat hat beginner book drseuss,0
92509,1064259627922055168,fucking retarded,,1
92510,1115281019421319168,betta smack childish as nigga,,1


In [29]:
df['img_id'] = df['img_id'].astype('str')
df2['img_id'] = df['img_id'].astype('str')
df3['img_id'] = df['img_id'].astype('str')

In [7]:
df4['img_id'] = df4['img_id'].astype('str')

In [9]:
df4 = df4.drop(columns=['bully/nonbully_str'])
df4 = df4.drop(columns=['tweet_text'])
df4 = df4.drop(columns=['labels_str'])

In [23]:
# Drop the column
df = df.drop(columns=['bully/nonbully_str'])
df2 = df2.drop(columns=['bully/nonbully_str'])
df3 = df3.drop(columns=['bully/nonbully_str'])


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFBertModel, BertTokenizer
from transformers import TFDistilBertModel, DistilBertTokenizer
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
import os
import cv2
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
import keras_tuner as kt
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, BatchNormalization, Embedding, Bidirectional,GlobalAveragePooling2D, Concatenate


In [3]:
# Load the data
df = pd.read_excel('merged(Text)(CleanedLatest)(latest).xlsx')

# Define paths to the images
image_path = 'C:\\Kingsley\\multimodia-hatespeech\\image_latest(balanced)(testing)(resized)'

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115640 entries, 0 to 115639
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   img_id                115640 non-null  int64 
 1   tweet_text_processed  115640 non-null  object
 2   img_text              44064 non-null   object
 3   target                115640 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.5+ MB


In [5]:
df['img_id'] = df['img_id'].astype('str')

In [6]:
df['img_text'] = df['img_text'].fillna('[IMG_MISSING]')

In [7]:
df

Unnamed: 0,img_id,tweet_text_processed,img_text,target
0,1023940590382268417,better retard picture,[IMG_MISSING],1
1,1023940897346658307,throwing sjw word board creating sentence,ull verizon lte tweet nicktendo h ppl dont lik...,1
2,1023942214844657664,ye either die sound cunt live long enough tae ...,[IMG_MISSING],1
3,1023942220838264837,dante cunt would struggle gene,[IMG_MISSING],1
4,1023943177319919616,retard joke,silverwing hold antact power n c alliance play...,1
...,...,...,...,...
115635,1117698927145443328,stanning whole dyke look omg,[IMG_MISSING],0
115636,1117699493301035008,get dog dawgs work nigga,worldstarhiphopcom,0
115637,1117699973389398016,woman gon na hop onto quote nigga euron null void,[IMG_MISSING],0
115638,1117700169527701504,piss timmy cunt,ches city blocked jeremy blocked sure want vie...,0


checkGPU

In [8]:
# Check if TensorFlow is using GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("TensorFlow version: ", tf.__version__)

# List all available GPUs
print("Available GPUs: ", tf.config.experimental.list_physical_devices('GPU'))

Num GPUs Available:  1
TensorFlow version:  2.10.1
Available GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [9]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only allocate a specific amount of memory on the GPU
        for gpu in gpus:
            tf.config.experimental.set_virtual_device_configuration(
                gpu,
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])  # Adjust the memory limit as needed
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)


1 Physical GPUs, 1 Logical GPUs


Text Feature extraction using BERT Model

In [10]:
# Load pre-trained Twitter-based hate speech detection model and tokenizer
model_name = 'cardiffnlp/twitter-roberta-base-hate'  # Replace with specific model name
twitter_model = TFAutoModel.from_pretrained(model_name)
twitter_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to extract Twitter-based model features
def extract_twitter_features(texts):
    inputs = twitter_tokenizer(texts, return_tensors='tf', padding='max_length', truncation=True, max_length=128)
    outputs = twitter_model(inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()


Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-hate were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [11]:
# Function to extract Twitter-based model features in batches
def extract_twitter_features_in_batches(texts, batch_size=64):
    features = []
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        batch_texts = texts[start:end]
        # Replace '[IMG_MISSING]' with empty strings
        batch_texts = [text if text != '[IMG_MISSING]' else '' for text in batch_texts]
        batch_features = extract_twitter_features(batch_texts)
        features.extend(batch_features)
    return np.array(features)

In [12]:
# Load pre-trained ResNet50 model
resnet50 = ResNet50(weights='imagenet', include_top=False, pooling='avg', input_shape=(224, 224, 3))

In [13]:
# Function to extract ResNet50 features in batches
def extract_image_features_in_batches(image_paths, batch_size=64):
    num_images = len(image_paths)
    image_features = []

    for i in range(0, num_images, batch_size):
        batch_images = []
        for img_path in image_paths[i:i+batch_size]:
            # Load image using cv2
            image = cv2.imread(img_path)
            if image is None:
                print(f"Warning: Unable to read image from {img_path}")
                continue
            
            # Preprocess image for ResNet50
            # image = cv2.resize(image, (224, 224))  # Resize to match input_shape
            image = preprocess_input(image)
            batch_images.append(image)
        
        if not batch_images:
            continue
        
        # Predict batch features using ResNet50
        batch_images = np.array(batch_images)
        batch_features = resnet50.predict(batch_images)
        image_features.extend(batch_features)
    
    return np.array(image_features)

In [14]:
image_paths = [os.path.join(image_path, img_id + '.jpg') for img_id in df['img_id']]
    
    # Extract image features in batches
batch_size = 32

image_features = extract_image_features_in_batches(image_paths, batch_size=batch_size)



In [15]:
# Check alignment
if len(df['img_id']) == len(image_features):
    print("Number of img_id matches number of extracted features.")
else:
    print("Number of img_id does not match number of extracted features.")

Number of img_id matches number of extracted features.


In [16]:
# Optionally, print a few img_id and their corresponding features to verify
for i in range(min(5, len(df['img_id']))):  # Print the first 5 examples
    print(f"img_id: {df['img_id'][i]}, features: {image_features[i]}")

img_id: 1023940590382268417, features: [3.8504231e-01 3.9482018e-01 3.1085679e-02 ... 0.0000000e+00 2.5362190e-04
 2.2766761e-01]
img_id: 1023940897346658307, features: [0.72033995 0.06688589 0.17639667 ... 0.24821106 0.09592003 0.2079422 ]
img_id: 1023942214844657664, features: [0.27171835 0.05135023 0.02516944 ... 0.         0.14373781 0.5471936 ]
img_id: 1023942220838264837, features: [1.8894482  0.06220363 0.20111111 ... 0.         0.07777242 0.13151939]
img_id: 1023943177319919616, features: [0.1506129  0.02870112 0.2586019  ... 0.11260942 0.0109412  1.0708885 ]


In [17]:
print("Shape of extracted features:", image_features.shape)

Shape of extracted features: (115640, 2048)


In [18]:
# Extract features in batches
tweet_text_features = extract_twitter_features_in_batches(df['tweet_text_processed'].tolist(), batch_size=32)
img_text_features = extract_twitter_features_in_batches(df['img_text'].tolist(), batch_size=32)


# # Ensure the feature arrays are aligned correctly
# tweet_text_features = np.squeeze(tweet_text_features)
# img_text_features = np.squeeze(img_text_features)
# image_features = np.squeeze(image_features)




In [20]:
# Print a few examples to verify alignment
for i in range(min(5, len(df['img_id']))):  # Print the first 5 examples
    print(f"img_id: {df['img_id'][i]}, tweet_text_features: {tweet_text_features[i]}, img_text_features: {img_text_features[i]}")

img_id: 1023940590382268417, tweet_text_features: [-4.25549984e-01 -4.68524843e-02 -2.15194315e-01 -3.68572503e-01
  1.07063675e+00  5.49812675e-01  9.19857174e-02 -1.52721852e-01
  4.10989597e-02 -5.53965509e-01  1.85983539e-01  4.27299201e-01
  7.92190358e-02 -3.04556459e-01  4.54545617e-02 -1.55843362e-01
 -3.30030620e-01 -1.35909989e-01 -1.93031639e-01 -1.35186031e-01
 -4.55824584e-01  5.10433912e-01  3.40315133e-01  1.44292518e-01
 -3.03710163e-01 -2.69451499e-01 -1.53185382e-01 -2.01195553e-01
 -6.65639788e-02  1.85954735e-01 -1.07024801e+00 -4.90711443e-02
  2.01151058e-01  1.60057068e-01 -2.62377620e-01 -6.20686263e-03
  5.00549912e-01  1.91117063e-01 -5.59349179e-01  1.74540311e-01
  3.09544474e-01 -6.73342228e-01  2.18394145e-01  8.25739577e-02
  1.68969467e-01  4.38129514e-01 -2.74888098e-01  2.87052393e-01
  2.76305616e-01  2.56952584e-01 -3.06001484e-01 -2.32720882e-01
 -2.32154280e-01 -1.95621923e-01 -6.42215669e-01  1.35721266e-01
  3.13647389e-01  5.43100536e-01  4.7724

In [19]:
# Combine the features
combined_features = np.concatenate([tweet_text_features, img_text_features, image_features], axis=1)

In [22]:
txt_combined_features= np.concatenate([tweet_text_features, img_text_features], axis=1)

Text and Image Features

In [37]:
# Split the dataset
train_features, temp_features, train_labels, temp_labels = train_test_split(combined_features, df['target'], test_size=0.2, random_state=42)
val_features, test_features, val_labels, test_labels = train_test_split(temp_features, temp_labels, test_size=0.5, random_state=42)



Text Features

In [25]:
train_features_txt, temp_features_txt, train_labels_txt, temp_labels_txt = train_test_split(txt_combined_features, df['target'], test_size=0.2, random_state=42)
val_features_txt, test_features_txt, val_labels_txt, test_labels_txt = train_test_split(temp_features_txt, temp_labels_txt, test_size=0.5, random_state=42)

Image Features

In [21]:
train_features_img, temp_features_img, train_labels_img, temp_labels_img = train_test_split(image_features, df['target'], test_size=0.2, random_state=42)
val_features_img, test_features_img, val_labels_img, test_labels_img = train_test_split(temp_features_img, temp_labels_img, test_size=0.5, random_state=42)

Using LG and KNN for classfication for Image and Text

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
# Standardize features
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

In [27]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_features, train_labels)
knn_predictions = knn.predict(test_features)
knn_accuracy = accuracy_score(test_labels, knn_predictions)
print(f'KNN Test Accuracy: {knn_accuracy}')

conf_matrix = confusion_matrix(test_labels, knn_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'KNN Classification Report:\n{classification_report(test_labels, knn_predictions)}')





KNN Test Accuracy: 0.5570736769283985
Confusion Matrix:
[[3551 2271]
 [2851 2891]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.61      0.58      5822
           1       0.56      0.50      0.53      5742

    accuracy                           0.56     11564
   macro avg       0.56      0.56      0.56     11564
weighted avg       0.56      0.56      0.56     11564



In [23]:
# Support Vector Machine
svm = SVC(kernel='rbf')
svm.fit(train_features, train_labels)
svm_predictions = svm.predict(test_features)
svm_accuracy = accuracy_score(test_labels, svm_predictions)
print(f'SVM Test Accuracy: {svm_accuracy}')
print(f'SVM Classification Report:\n{classification_report(test_labels, svm_predictions)}')

SVM Test Accuracy: 0.6282428225527499
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.74      0.67      5822
           1       0.66      0.51      0.58      5742

    accuracy                           0.63     11564
   macro avg       0.63      0.63      0.62     11564
weighted avg       0.63      0.63      0.62     11564



In [28]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(train_features, train_labels)
lr_predictions = lr.predict(test_features)
lr_accuracy = accuracy_score(test_labels, lr_predictions)
print(f'Logistic Regression Test Accuracy: {lr_accuracy}')
conf_matrix = confusion_matrix(test_labels, lr_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Logistic Regression Classification Report:\n{classification_report(test_labels, lr_predictions)}')

Logistic Regression Test Accuracy: 0.6130231753718437
Confusion Matrix:
[[3886 1936]
 [2539 3203]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.67      0.63      5822
           1       0.62      0.56      0.59      5742

    accuracy                           0.61     11564
   macro avg       0.61      0.61      0.61     11564
weighted avg       0.61      0.61      0.61     11564



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline that scales the data and then applies logistic regression with the 'saga' solver
pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='saga', max_iter=10000))

# Train the logistic regression model
pipeline.fit(train_features, train_labels)

# Predict and evaluate
logistic_predictions = pipeline.predict(test_features)
logistic_accuracy = accuracy_score(test_labels, logistic_predictions)
print(f'Logistic Regression Test Accuracy: {logistic_accuracy}')
print(f'Logistic Regression Classification Report:\n{classification_report(test_labels, logistic_predictions)}')


Logistic Regression Test Accuracy: 0.6118989968868903
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.67      0.63      5822
           1       0.62      0.56      0.59      5742

    accuracy                           0.61     11564
   macro avg       0.61      0.61      0.61     11564
weighted avg       0.61      0.61      0.61     11564



Tuning for all models

In [34]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Initialize Grid Search with Cross-Validation
knn_grid_search = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
knn_grid_search.fit(train_features, train_labels)

# Get the best parameters and best score
knn_best_params = knn_grid_search.best_params_
knn_best_score = knn_grid_search.best_score_

print(f'Best KNN parameters: {knn_best_params}')
print(f'Best KNN cross-validation score: {knn_best_score}')

# Predict with the best model
knn_best_model = knn_grid_search.best_estimator_
knn_predictions = knn_best_model.predict(test_features)
knn_accuracy = accuracy_score(test_labels, knn_predictions)
print(f'KNN Test Accuracy: {knn_accuracy}')

conf_matrix = confusion_matrix(test_labels, knn_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')

print(f'KNN Classification Report:\n{classification_report(test_labels, knn_predictions)}')


Best KNN parameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best KNN cross-validation score: 0.5688559322516614
KNN Test Accuracy: 0.5665859564164649
Confusion Matrix:
[[3703 2119]
 [2893 2849]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.64      0.60      5822
           1       0.57      0.50      0.53      5742

    accuracy                           0.57     11564
   macro avg       0.57      0.57      0.56     11564
weighted avg       0.57      0.57      0.56     11564



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
logistic_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]  # Only used if penalty is 'elasticnet'
}

# Initialize the Logistic Regression classifier
logistic = LogisticRegression(max_iter=1000)

# Perform Grid Search with cross-validation
logistic_grid_search = GridSearchCV(logistic, logistic_param_grid, cv=5, n_jobs=8, scoring='accuracy')
logistic_grid_search.fit(train_features, train_labels)

# Print the best parameters and best score
print(f"Best Logistic Regression parameters: {logistic_grid_search.best_params_}")
print(f"Best Logistic Regression cross-validation score: {logistic_grid_search.best_score_}")

# Predict using the best model from the grid search
best_logistic_model = logistic_grid_search.best_estimator_
test_predictions = best_logistic_model.predict(test_features)

# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')

# Print the classification report
print(classification_report(test_labels, test_predictions))


Using LG, SVM and KNN for classfication for Text

In [29]:
# Standardize features
scaler = StandardScaler()
train_features_txt = scaler.fit_transform(train_features_txt)
val_features_txt = scaler.transform(val_features_txt)
test_features_txt = scaler.transform(test_features_txt)

In [47]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_features_txt, train_labels_txt)
knn_predictions = knn.predict(test_features_txt)
knn_accuracy = accuracy_score(test_labels_txt, knn_predictions)
print(f'KNN Test Accuracy: {knn_accuracy}')
# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_txt, knn_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'KNN Classification Report:\n{classification_report(test_labels_txt, knn_predictions)}')


KNN Test Accuracy: 0.5711691456243514
Confusion Matrix:
[[3356 2466]
 [2493 3249]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.58      0.58      5822
           1       0.57      0.57      0.57      5742

    accuracy                           0.57     11564
   macro avg       0.57      0.57      0.57     11564
weighted avg       0.57      0.57      0.57     11564



In [31]:
# Support Vector Machine
svm = SVC(kernel='rbf')
svm.fit(train_features_txt, train_labels_txt)
svm_predictions = svm.predict(test_features_txt)
svm_accuracy = accuracy_score(test_labels_txt, svm_predictions)
print(f'SVM Test Accuracy: {svm_accuracy}')
print(f'SVM Classification Report:\n{classification_report(test_labels_txt, svm_predictions)}')

SVM Test Accuracy: 0.6280698720166032
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.75      0.67      5822
           1       0.67      0.50      0.57      5742

    accuracy                           0.63     11564
   macro avg       0.64      0.63      0.62     11564
weighted avg       0.64      0.63      0.62     11564



In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline that scales the data and then applies logistic regression with the 'saga' solver
pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='saga', max_iter=10000))

# Train the logistic regression model
pipeline.fit(train_features_txt, train_labels_txt)

# Predict and evaluate
logistic_predictions = pipeline.predict(test_features_txt)
logistic_accuracy = accuracy_score(test_labels_txt, logistic_predictions)
print(f'Logistic Regression Test Accuracy: {logistic_accuracy}')

# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_txt, logistic_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Logistic Regression Classification Report:\n{classification_report(test_labels_txt, logistic_predictions)}')


Logistic Regression Test Accuracy: 0.6150985818056036
Confusion Matrix:
[[4097 1725]
 [2726 3016]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.70      0.65      5822
           1       0.64      0.53      0.58      5742

    accuracy                           0.62     11564
   macro avg       0.62      0.61      0.61     11564
weighted avg       0.62      0.62      0.61     11564



Hyperparameter Tuning

In [49]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Initialize Grid Search with Cross-Validation
knn_grid_search = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
knn_grid_search.fit(train_features_txt, train_labels_txt)

# Get the best parameters and best score
knn_best_params = knn_grid_search.best_params_
knn_best_score = knn_grid_search.best_score_

print(f'Best KNN parameters: {knn_best_params}')
print(f'Best KNN cross-validation score: {knn_best_score}')

# Predict with the best model
knn_best_model = knn_grid_search.best_estimator_
knn_predictions = knn_best_model.predict(test_features_txt)
knn_accuracy = accuracy_score(test_labels_txt, knn_predictions)
print(f'KNN Test Accuracy: {knn_accuracy}')
# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_txt, knn_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'KNN Classification Report:\n{classification_report(test_labels_txt, knn_predictions)}')


Best KNN parameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best KNN cross-validation score: 0.5759902318410907
KNN Test Accuracy: 0.5775683154617779
Confusion Matrix:
[[3492 2330]
 [2555 3187]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.60      0.59      5822
           1       0.58      0.56      0.57      5742

    accuracy                           0.58     11564
   macro avg       0.58      0.58      0.58     11564
weighted avg       0.58      0.58      0.58     11564



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Function to perform batch processing
def batch_grid_search_svm(estimator, param_grid, X_train, y_train, batch_size=5000, n_jobs=4):
    best_params = None
    best_score = -1
    n_samples = len(X_train)
    
    for i in range(0, n_samples, batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        
        # Perform Grid Search with cross-validation
        grid_search = GridSearchCV(estimator, param_grid, cv=5, n_jobs=n_jobs, scoring='accuracy')
        grid_search.fit(X_batch, y_batch)
        
        if grid_search.best_score_ > best_score:
            best_score = grid_search.best_score_
            best_params = grid_search.best_params_
    
    return best_params, best_score

# Define the parameter grid for SVM
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Initialize the SVM classifier
svm = SVC()

# Perform batch grid search for SVM
svm_best_params, svm_best_score = batch_grid_search_svm(svm, svm_param_grid, train_features_txt, train_labels_txt, batch_size=5000, n_jobs=4)

print(f"Best SVM parameters: {svm_best_params}")
print(f"Best SVM cross-validation score: {svm_best_score}")

# Fit the best model on the entire training data
svm_best_model = SVC(**svm_best_params)
svm_best_model.fit(train_features_txt, train_labels_txt)

# Predict with the best model
svm_predictions = svm_best_model.predict(test_features_txt)
svm_accuracy = accuracy_score(test_labels_txt, svm_predictions)
print(f"SVM Test Accuracy: {svm_accuracy}")
print(f"SVM Classification Report:\n{classification_report(test_labels_txt, svm_predictions)}")


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
logistic_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]  # Only used if penalty is 'elasticnet'
}

# Initialize the Logistic Regression classifier
logistic = LogisticRegression(max_iter=1000)

# Perform Grid Search with cross-validation
logistic_grid_search = GridSearchCV(logistic, logistic_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
logistic_grid_search.fit(train_features_txt, train_labels_txt)

# Print the best parameters and best score
print(f"Best Logistic Regression parameters: {logistic_grid_search.best_params_}")
print(f"Best Logistic Regression cross-validation score: {logistic_grid_search.best_score_}")

# Predict using the best model from the grid search
best_logistic_model = logistic_grid_search.best_estimator_
test_predictions = best_logistic_model.predict(test_features_txt)

# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_txt, test_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')

# Print the classification report
print(classification_report(test_labels_txt, test_predictions))




Best Logistic Regression parameters: {'C': 0.1, 'l1_ratio': 0.75, 'penalty': 'l1', 'solver': 'saga'}
Best Logistic Regression cross-validation score: 0.6167848277654426
Confusion Matrix:
[[4125 1697]
 [2743 2999]]
              precision    recall  f1-score   support

           0       0.60      0.71      0.65      5822
           1       0.64      0.52      0.57      5742

    accuracy                           0.62     11564
   macro avg       0.62      0.62      0.61     11564
weighted avg       0.62      0.62      0.61     11564



Using LG, SVM and KNN for classfication for Image

In [31]:
# Standardize features
scaler = StandardScaler()
train_features_img = scaler.fit_transform(train_features_img)
val_features_img = scaler.transform(val_features_img)
test_features_img = scaler.transform(test_features_img)

In [52]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_features_img, train_labels_img)
knn_predictions = knn.predict(test_features_img)
knn_accuracy = accuracy_score(test_labels_img, knn_predictions)
print(f'KNN Test Accuracy: {knn_accuracy}')
# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_img, knn_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'KNN Classification Report:\n{classification_report(test_labels_img, knn_predictions)}')


KNN Test Accuracy: 0.5291421653407126
Confusion Matrix:
[[3198 2624]
 [2821 2921]]
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.55      0.54      5822
           1       0.53      0.51      0.52      5742

    accuracy                           0.53     11564
   macro avg       0.53      0.53      0.53     11564
weighted avg       0.53      0.53      0.53     11564



In [53]:
# Support Vector Machine
svm = SVC(kernel='rbf')
svm.fit(train_features_img, train_labels_img)
svm_predictions = svm.predict(test_features_img)
svm_accuracy = accuracy_score(test_labels_img, svm_predictions)
print(f'SVM Test Accuracy: {svm_accuracy}')
print(f'SVM Classification Report:\n{classification_report(test_labels_img, svm_predictions)}')

In [32]:

# Create a pipeline that scales the data and then applies logistic regression with the 'saga' solver
pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='saga', max_iter=10000))

# Train the logistic regression model
pipeline.fit(train_features_img, train_labels_img)

# Predict and evaluate
logistic_predictions = pipeline.predict(test_features_img)
logistic_accuracy = accuracy_score(test_labels_img, logistic_predictions)
print(f'Logistic Regression Test Accuracy: {logistic_accuracy}')
# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_img, logistic_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Logistic Regression Classification Report:\n{classification_report(test_labels_img, logistic_predictions)}')


Logistic Regression Test Accuracy: 0.5396056727775856
Confusion Matrix:
[[3314 2508]
 [2816 2926]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.57      0.55      5822
           1       0.54      0.51      0.52      5742

    accuracy                           0.54     11564
   macro avg       0.54      0.54      0.54     11564
weighted avg       0.54      0.54      0.54     11564



Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Initialize Grid Search with Cross-Validation
knn_grid_search = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
knn_grid_search.fit(train_features_img, train_labels_img)

# Get the best parameters and best score
knn_best_params = knn_grid_search.best_params_
knn_best_score = knn_grid_search.best_score_

print(f'Best KNN parameters: {knn_best_params}')
print(f'Best KNN cross-validation score: {knn_best_score}')

# Predict with the best model
knn_best_model = knn_grid_search.best_estimator_
knn_predictions = knn_best_model.predict(test_features_img)
knn_accuracy = accuracy_score(test_labels_img, knn_predictions)
print(f'KNN Test Accuracy: {knn_accuracy}')
# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_img, knn_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')

print(f'KNN Classification Report:\n{classification_report(test_labels_img, knn_predictions)}')


Best KNN parameters: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
Best KNN cross-validation score: 0.5284070865567596
KNN Test Accuracy: 0.5283638879280526
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.56      0.54      5822
           1       0.53      0.50      0.51      5742

    accuracy                           0.53     11564
   macro avg       0.53      0.53      0.53     11564
weighted avg       0.53      0.53      0.53     11564



In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
logistic_param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]  # Only used if penalty is 'elasticnet'
}

# Initialize the Logistic Regression classifier
logistic = LogisticRegression(max_iter=1000)

# Perform Grid Search with cross-validation
logistic_grid_search = GridSearchCV(logistic, logistic_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
logistic_grid_search.fit(train_features_img, train_labels_img)

# Print the best parameters and best score
print(f"Best Logistic Regression parameters: {logistic_grid_search.best_params_}")
print(f"Best Logistic Regression cross-validation score: {logistic_grid_search.best_score_}")

# Predict using the best model from the grid search
best_logistic_model = logistic_grid_search.best_estimator_
test_predictions = best_logistic_model.predict(test_features_img)

# Compute the confusion matrix
conf_matrix = confusion_matrix(test_labels_img, test_predictions)

# Print the confusion matrix
print(f'Confusion Matrix:\n{conf_matrix}')

# Print the classification report
print(classification_report(test_labels_img, test_predictions))


Best Logistic Regression parameters: {'C': 1, 'l1_ratio': 0.75, 'penalty': 'elasticnet', 'solver': 'saga'}
Best Logistic Regression cross-validation score: 0.5376492167174823
Confusion Matrix:
[[3314 2508]
 [2814 2928]]
              precision    recall  f1-score   support

           0       0.54      0.57      0.55      5822
           1       0.54      0.51      0.52      5742

    accuracy                           0.54     11564
   macro avg       0.54      0.54      0.54     11564
weighted avg       0.54      0.54      0.54     11564

