<a href="https://colab.research.google.com/github/itsmeeeeeee/MML/blob/main/Features_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 ## Group 5: Aldi Halili, Valeriya Herrlein, Chunxue Liu



# Feature extraction with the pretrained models BERT and ResNet-50

In [5]:
! pip install torch torchvision
! pip install transformers pandas numpy



In [6]:
from torchvision import models, transforms
from PIL import Image, ImageFile

import os
import numpy as np
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
#import torch.nn.functional as F

In [4]:
# Mount Google Drive at /content/drive; the image, label and output paths used
# further down all live under this mount point.
# force_remount=True is passed so the mount is redone when the cell is re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **Data Preprocessing**

## **A. Image Preprocessing**

**ResNet-50** is used to extract the image features.

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Initializing the pre-trained ResNet-50 model
# For more information on ResNet-50, see https://blog.roboflow.com/what-is-resnet-50/

# `pretrained=True` is deprecated since torchvision 0.13. The explicit weights enum
# selects the same ImageNet-1k V1 checkpoint (resnet50-0676ba61.pth) that
# `pretrained=True` used to load.
r_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# NOTE(review): the classification head (`fc`) is left in place, so the model emits
# the 1000 ImageNet class scores per image rather than the 2048-d pooled features.
# Kept unchanged so the shape of the saved features stays the same - confirm intended.

# Move the model to the selected device and switch it to inference mode.
# `re_model` is the handle used by the feature-extraction cell.
re_model = r_model.to(device)
re_model.eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 94.9MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [9]:
# Define the preprocessing pipeline applied to every image before it is fed to the model
"""
This setup includes standard pre-processing steps such as resizing, cropping, and normalizing the images.
The transformation setup below is adapted from an example provided in a blog post: "How to fine-tune the ResNet-50 model on your target dataset using PyTorch"
Available at: https://medium.com/@engr.akhtar.awan/how-to-fine-tune-the-resnet-50-model-on-your-target-dataset-using-pytorch-187abdb9beeb

"""
image_trans = transforms.Compose([
    # Resize first, then cut out the central 224x224 region
    transforms.Resize(256),
    transforms.CenterCrop(224),
    # Convert the PIL image to a tensor
    transforms.ToTensor(),
    # Per-channel normalization; these are the mean/std values commonly used
    # with ImageNet-pretrained torchvision models
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
])

In [10]:
"""
function to extract features from an image using the given model and transformations,
and handles images from a specified directory.

"""

# Tell Pillow to load truncated image files instead of raising an error for them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def extract_img_features(image_path, model, transform, device):
    """Run a single image through `model` and return the output as a NumPy array.

    Args:
        image_path: Path of the image file to load.
        model: Network applied to the preprocessed image (called as `model(batch)`).
        transform: Callable that turns an RGB PIL image into a tensor.
        device: Torch device the input batch is moved to before the forward pass.

    Returns:
        The model output moved to the CPU, squeezed and converted to a NumPy
        array, or None when loading or processing the image failed.
    """
    try:
        # Open the file and convert it to RGB so that every image reaches the
        # transform with three channels.
        with Image.open(image_path) as raw_img:
            img = raw_img.convert("RGB")

        # Apply the preprocessing and add a leading batch dimension of size 1.
        batch = transform(img).unsqueeze(0).to(device)

        # Forward pass without gradient tracking (inference only).
        with torch.no_grad():
            features = model(batch)
        return features.cpu().squeeze().numpy()
    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        # The message uses this function's own `image_path` argument; the earlier
        # code read the global `img_path`, which raised NameError whenever the
        # function was called outside the extraction loop.
        print(f"Error {image_path}: {e}")
        return None

# path to the image directory

image_path = '/content/drive/MyDrive/MultimodalNLP/projekt/images/'
image_files = os.listdir(image_path)
features_list = []

# File name of every successfully processed image, parallel to `features_list`.
# os.listdir() returns entries in arbitrary order and failed images are skipped,
# so image_names[i] is the only reliable way to tell which image produced
# features_list[i] (e.g. to align the features with the rows of labels.csv).
image_names = []

# Extract features for each image file

for img_file in image_files:
    img_path = os.path.join(image_path, img_file)
    if not os.path.isfile(img_path):
        # skip sub-directories and other non-file entries
        continue
    features = extract_img_features(img_path, re_model, image_trans, device)
    if features is not None:
        features_list.append(features)
        image_names.append(img_file)

print(f"Number of successfully extracted features: {len(features_list)}")




Number of successfully extracted features: 6992


In [11]:
# Turn the collected per-image feature vectors into a single NumPy array
img_features = np.asarray(features_list)

# Number of images for which features were extracted
print(img_features.shape[0])

# Show the dimensions of the resulting array
img_features.shape

6992


(6992, 1000)

In [None]:
# Path where the numpy array will be saved.
# A dedicated variable name is used on purpose: the earlier code reused
# `image_path`, which overwrote the path of the image *directory* and made the
# extraction cell fail (os.listdir on a .npy file) when re-run after this cell.
image_features_path = "/content/drive/MyDrive/MultimodalNLP/projekt/features_data/image_features_restnet.npy"

# Create the target folder on first use
directory = os.path.dirname(image_features_path)
if not os.path.exists(directory):
    # exist_ok covers the case where the folder appears between check and creation
    os.makedirs(directory, exist_ok=True)
    print(f"Directory created: {directory}")

# Save the numpy file to google drive
np.save(image_features_path, img_features)
print("file saved to Google Drive.")


file saved to Google Drive.


## **B. Text Preprocessing**

- For the text preprocessing step, the following steps are intended for the column 'text_corrected' from the `labels.csv` file: text cleaning, tokenization, stop words removal, lemmatization, etc.

- In the end, we extract the embeddings.



In [None]:
# Import necessary libraries
from transformers import BertTokenizer, BertModel
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [13]:
# Location of the labels CSV file on Google Drive
csv_path = "/content/drive/MyDrive/MultimodalNLP/projekt/data/labels.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Clean the text column in one pass: replace NaN with empty strings,
# then convert every remaining value to a string
df['text_corrected'] = df['text_corrected'].fillna('').apply(str)

# Column holding the texts used in the following cells
texts = df['text_corrected']


In [16]:
# Preview the first rows of the labels DataFrame
df.head()


Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [17]:
# Preview the first few entries of the cleaned text column
texts.head()


0    LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...
1    The best of #10 YearChallenge! Completed in le...
2    Sam Thorne @Strippin ( Follow Follow Saw every...
3                10 Year Challenge - Sweet Dee Edition
4    10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...
Name: text_corrected, dtype: object

In [None]:
# We use BERT Tokenizer and BERT Model for the preprocessing step of text and then extract embeddings

# Initialize the BERT tokenizer from the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Initialize the BERT model from the same checkpoint as the tokenizer.
# NOTE(review): the model is not moved to `device` here, so it presumably stays
# on the CPU - confirm this is intended.
model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
# Function to obtain BERT embeddings for our text
# Code partly based on a Stack Overflow discussion: https://stackoverflow.com/questions/78022923/reducing-runtime-of-bert-embedding-extraction-in-pytorch

def get_bert_embeddings(texts):
    """Extract BERT embeddings for a single text or a list of texts.

    Uses the module-level `tokenizer` and `model`.

    Args:
        texts: A string or a list of strings to embed.

    Returns:
        NumPy array holding the model's `pooler_output` for the given input.
    """
    # Convert the texts into BERT-compatible tokens (padded, truncated to 512 tokens)
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512, add_special_tokens=True)

    # Move the inputs to whatever device the model lives on, so the function keeps
    # working when the model is moved to the GPU (the earlier code left the
    # inputs on the CPU).
    model_device = next(model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Forward pass without gradient tracking to extract the embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.pooler_output  # pooled, per-text summary embedding

    # .cpu() is a no-op for CPU tensors and required before .numpy() for GPU tensors
    return embeddings.cpu().numpy()



In [None]:

# Embed the texts one at a time and collect the resulting arrays in a list.
# NOTE(review): every call embeds a single text, so each list entry presumably
# keeps a leading batch axis of size 1 - confirm the saved shape is what the
# consuming code expects.
text_embeddings = list(map(get_bert_embeddings, texts))
print(f"Number of text embeddings: {len(text_embeddings)}")

# Destination file for the text features
text_path = "/content/drive/MyDrive/MultimodalNLP/projekt/features_data/text_features_bert_final.npy"

# Create the target folder if it is missing
directory = os.path.dirname(text_path)
if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"Directory created: {directory}")

# Write the embeddings to disk as a .npy file
np.save(text_path, text_embeddings)
print("Text features saved to Google Drive.")


Number of text embeddings: 6992
Text features saved to Google Drive.
