In [None]:
import os
from PIL import Image
import torch
import pandas as pd
import gensim
from gensim.models import Word2Vec
from collections import Counter
from IPython.display import display, HTML
from nltk.corpus import stopwords
from transformers import (
    Blip2Processor,
    Blip2ForConditionalGeneration,
    BitsAndBytesConfig,
)


import nltk

# Make sure the NLTK stop-word corpus is available before it is loaded below.
nltk.download("stopwords")

# Device the per-image input tensors are moved to further down in the script.
# NOTE(review): the model itself is pinned to GPU index 0 with 8-bit
# quantization, so the "cpu" fallback presumably cannot work end to end —
# TODO confirm and either drop the fallback or load an unquantized model.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint shared by processor and model.
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"

bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Keyword arguments for loading the captioning model: 8-bit weights,
# everything placed on GPU 0, float16 for the non-quantized parts.
_model_load_options = {
    "quantization_config": bnb_config,
    "device_map": {"": 0},
    "torch_dtype": torch.float16,
}

processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_CHECKPOINT, **_model_load_options)


# Folder that holds the images to caption.
image_folder = "beauty"

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the results table (and of everything derived from it) reproducible.
image_files = sorted(
    f for f in os.listdir(image_folder) if f.lower().endswith((".jpg", ".jpeg", ".png"))
)

captions = []  # per image: lower-cased caption tokens with stop words removed
results = []  # per image: row dict for the results table

stop_words = set(stopwords.words("english"))

for image_file in image_files:
    image_path = os.path.join(image_folder, image_file)

    # Image.open() is lazy and keeps the file open; the context manager
    # releases the handle as soon as the pixels have been copied by convert().
    # Converting to RGB gives the processor a uniform 3-channel input even for
    # PNGs that carry an alpha channel or a palette.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Generate a caption for the image.
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(**inputs)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    # Tokenize on whitespace and drop English stop words.
    words = [word for word in caption.lower().split() if word not in stop_words]
    captions.append(words)

    # Thumbnail markup rendered in the HTML table (which is built unescaped).
    img_tag = f'<img src="{image_path}" width="100"/>'
    results.append({"Image": img_tag, "Image Name": image_file, "Caption": caption})


# Train word embeddings on the tokenized captions.
word2vec_model = Word2Vec(sentences=captions, vector_size=100, window=5, min_count=1, workers=4)

# Total number of occurrences of every token across all captions.
word_counts = Counter(word for caption in captions for word in caption)
total_words = sum(word_counts.values())

# Number of captions (images) each token appears in. Counting each caption's
# distinct tokens once is a single pass, instead of rescanning every caption
# list for every vocabulary word.
_captions_per_word = Counter(word for caption in captions for word in set(caption))

# Rebuild in word_counts order: the frequency table later pairs the values of
# both mappings positionally, so the key order must stay aligned.
word_image_count = {word: _captions_per_word[word] for word in word_counts}


# Vocabulary used to spot age cues in the captions.
young_words = {"young", "little", "girl", "boy", "child", "teen"}
old_words = {"old", "elderly", "senior", "aged", "grandparent", "veteran"}

# Every age cue, regardless of which end of the range it points to.
age_related_words = young_words.union(old_words)

# Occurrence count for each age cue that actually shows up in the captions.
age_word_counts = {}
for age_term in age_related_words:
    if age_term in word_counts:
        age_word_counts[age_term] = word_counts[age_term]

total_age_words = sum(age_word_counts.values())


# Vocabularies used to tag a caption with a gender indicator.
female_words = {"woman", "female", "girl", "lady", "women", "mother", "sister"}
male_words = {"man", "male", "boy", "gentleman", "men", "father", "brother"}


def classify_gender(caption_words):
    """Return the gender indicator for a tokenized caption.

    The vocabularies are checked in a fixed order, female first, so a caption
    that contains words from both sets is labelled "Female".

    Args:
        caption_words: Iterable of caption tokens.

    Returns:
        "Female" if any token is in ``female_words``, otherwise "Male" if any
        token is in ``male_words``, otherwise "Neutral".
    """
    labelled_vocabularies = ((female_words, "Female"), (male_words, "Male"))
    for vocabulary, label in labelled_vocabularies:
        # "not disjoint" means at least one caption token is in the vocabulary.
        if not vocabulary.isdisjoint(caption_words):
            return label
    return "Neutral"


# One gender indicator per caption, in the same order as the result rows.
gender_labels = list(map(classify_gender, captions))

# Per caption: the tokens that are age cues, joined into one display string
# (an empty string when the caption has none).
_age_terms_per_caption = [
    ", ".join(filter(age_related_words.__contains__, tokens)) for tokens in captions
]

# Results table: one row per image plus the two derived columns.
df = pd.DataFrame(results)
df["Gender Indicator"] = gender_labels
df["Age-Related Words"] = _age_terms_per_caption

# escape=False so the <img> markup in the "Image" column renders as a thumbnail.
html = df.to_html(escape=False)
display(HTML(html))


# Per-token frequency table, built row by row and ordered by the number of
# images whose caption contains the token (most widespread first).
_word_freq_columns = ["Word", "Total Occurrences", "Images Containing Word", "Percentage of Images"]
_word_freq_rows = [
    {
        "Word": word,
        "Total Occurrences": occurrences,
        "Images Containing Word": word_image_count[word],
        "Percentage of Images": (word_image_count[word] / len(image_files)) * 100,
    }
    for word, occurrences in word_counts.items()
]
word_freq_df = pd.DataFrame(_word_freq_rows, columns=_word_freq_columns).sort_values(
    by="Images Containing Word", ascending=False
)

print("\nWord Frequency Analysis:")
print(word_freq_df)


def _count_table(counts):
    """Return a Word/Count DataFrame for *counts*, highest count first."""
    table = pd.DataFrame(list(counts.items()), columns=["Word", "Count"])
    return table.sort_values(by="Count", ascending=False)


# Age cues found in the captions.
age_df = _count_table(age_word_counts)

print("\nAge-Related Word Analysis:")
print(age_df)

# Gender cues found in the captions (female and male vocabularies combined).
gender_word_counts = {}
for gender_term in female_words.union(male_words):
    if gender_term in word_counts:
        gender_word_counts[gender_term] = word_counts[gender_term]
gender_df = _count_table(gender_word_counts)

print("\nGender-Related Word Analysis:")
print(gender_df)

import json

# Convert each summary table into a list of per-row dicts for serialization.
word_freq_json, age_json, gender_json = (
    table.to_dict(orient="records") for table in (word_freq_df, age_df, gender_df)
)

# Everything written to disk: the per-image rows plus the three summaries.
output_data = {
    "image_captions": df.to_dict(orient="records"),
    "word_frequencies": word_freq_json,
    "age_related_words": age_json,
    "gender_related_words": gender_json,
}

output_file = "beauty.json"
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=4)

print(f"\nAnalysis saved to {output_file}")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ctech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Loading checkpoint shards: 100%|███████████████████████████| 2/2 [00:06<00:00,  3.35s/it]


Unnamed: 0,Image,Image Name,Caption,Gender Indicator,Age-Related Words
0,,Beauty_00031_.png,a woman with short hair and a black jacket is standing in front of a wall,Female,
1,,Beauty_00032_.png,a woman with a flower crown in a field of yellow flowers,Female,
2,,Beauty_00033_.png,a young woman with braids leaning against a wall,Female,young
3,,Beauty_00034_.png,a white mannequin with flowers on its head,Neutral,
4,,Beauty_00035_.png,a beautiful bride with her hair in a field,Neutral,
5,,Beauty_00036_.png,a woman with pink hair and flowers in her hair,Female,
6,,Beauty_00037_.png,a woman with bright makeup and a pink shirt,Female,
7,,Beauty_00038_.png,a woman smiling in front of a pink wall,Female,
8,,Beauty_00039_.png,a woman with curly hair and a hat smiling,Female,
9,,Beauty_00040_.png,a woman with pink hair and a pink hat,Female,



Word Frequency Analysis:
        Word  Total Occurrences  Images Containing Word  Percentage of Images
0      woman                 94                      94                  94.0
2       hair                 45                      43                  43.0
12   flowers                 29                      29                  29.0
27       hat                 21                      21                  21.0
3      black                 22                      20                  20.0
..       ...                ...                     ...                   ...
58      girl                  1                       1                   1.0
59     paint                  1                       1                   1.0
61    orange                  1                       1                   1.0
62   shadows                  1                       1                   1.0
90  american                  1                       1                   1.0

[91 rows x 4 columns]

Age-Related Wo