In [2]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer 
# --- Step 1: the corpus ---------------------------------------------------
# Three short sentences; each one is treated as a separate document.
documents = [
    "The sun is shining today.",
    "The weather is good, the sun is great.",
    "A sunny day is a wonderful day.",
]

# Row labels ("Doc 1", "Doc 2", ...) built once and reused for the table below.
doc_labels = [f"Doc {number}" for number in range(1, len(documents) + 1)]

print("--- Raw Text Documents ---")
for number, sentence in enumerate(documents, start=1):
    print(f"Doc {number}: {sentence}")

# --- Step 2: Bag-of-Words with CountVectorizer ----------------------------
# fit_transform() learns the vocabulary from the corpus and, in the same call,
# turns every document into a row of per-word counts.
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(documents)

# --- Step 3: inspect the result -------------------------------------------
# One vocabulary entry per column; toarray() gives a dense matrix for printing.
feature_names = vectorizer.get_feature_names_out()
text_matrix = X_text.toarray()

print("\n--- Bag-of-Words (BoW) Transformation ---")
print(f"Vocabulary (Features): {feature_names}")
print(f"Shape of Feature Matrix: {text_matrix.shape}")

# Wrap the counts in a labelled DataFrame: rows are documents, columns are words.
df_text = pd.DataFrame(text_matrix, index=doc_labels, columns=feature_names)

print("\nBoW Numerical Feature Matrix (Machine Understandable Format):")
print(df_text)

--- Raw Text Documents ---
Doc 1: The sun is shining today.
Doc 2: The weather is good, the sun is great.
Doc 3: A sunny day is a wonderful day.

--- Bag-of-Words (BoW) Transformation ---
Vocabulary (Features): ['day' 'good' 'great' 'is' 'shining' 'sun' 'sunny' 'the' 'today' 'weather'
 'wonderful']
Shape of Feature Matrix: (3, 11)

BoW Numerical Feature Matrix (Machine Understandable Format):
       day  good  great  is  shining  sun  sunny  the  today  weather  \
Doc 1    0     0      0   1        1    1      0    1      1        0   
Doc 2    0     1      1   2        0    1      0    2      0        1   
Doc 3    2     0      0   1        0    0      1    0      0        0   

       wonderful  
Doc 1          0  
Doc 2          0  
Doc 3          1  


In [6]:
import numpy as np 
from PIL import Image 
import os # To check if a file exists 
# --- Configuration for Image Loading ---
# IMPORTANT:
# Point `image_file_path` at an image file on your computer, for example:
#   'my_image.png'                              (same directory as the notebook)
#   'C:/Users/YourUser/Pictures/my_photo.jpg'   (Windows)
#   '/home/youruser/images/my_photo.png'        (Linux/macOS)
# Set it to None to skip file loading and use a generated dummy image instead.
image_file_path = "C:/Users/Dell/Desktop/Lab 03/SIMBA.jpeg"

# Fixed seed so the fallback image is the same on every Restart & Run All.
DUMMY_IMAGE_SEED = 42


def _make_dummy_image():
    """Build the 100x100 random-noise RGB image used when no file can be loaded.

    Returns:
        PIL.Image.Image: an RGB image generated from a seeded random uint8 array.
    """
    rng = np.random.default_rng(DUMMY_IMAGE_SEED)
    dummy_image_data = rng.integers(0, 256, size=(100, 100, 3), dtype=np.uint8)
    # The mode is inferred from the (H, W, 3) uint8 array, so no explicit
    # mode argument is passed.
    dummy_image = Image.fromarray(dummy_image_data)
    print("--- Created Dummy Image (100x100 RGB) ---")
    return dummy_image


# 1. Load Image or Create a Dummy Image
original_image = None
if image_file_path and os.path.exists(image_file_path):
    try:
        # Pillow opens files lazily, so a damaged file may only fail once the
        # pixels are decoded. Converting inside the `try` lets the fallback
        # below handle that case too, and the `with` block closes the file.
        with Image.open(image_file_path) as opened_image:
            # Normalise to RGB so later steps always see three channels.
            original_image = opened_image.convert('RGB')
        print(f"--- Loaded Image from File: {image_file_path} ---")
    except Exception as e:
        original_image = None  # discard any partially loaded image
        print(f"Error loading image from {image_file_path}: {e}")
        print("Creating a dummy image instead.")
else:
    print("No valid image file path provided or file not found.")
    print("Creating a dummy image instead.")

# Single fallback point for both "no usable path" and "file failed to load".
if original_image is None:
    original_image = _make_dummy_image()

original_image_array = np.array(original_image)

print(f"Original Image Size (H, W, Channels): {original_image_array.shape}")
# ndarray.size is the product of all dimensions (H * W * Channels).
print(f"Total Features (Pixels) in RGB: {original_image_array.size}")

# Display the original image (optional, requires matplotlib but good for visualization)
# from matplotlib import pyplot as plt
# plt.imshow(original_image)
# plt.title("Original Image")
# plt.axis('off')
# plt.show()


# 2. Apply Simple Feature Engineering: Grayscale Conversion
# Grayscale is a simple preprocessing/feature reduction technique.
grayscale_image = original_image.convert('L')  # 'L' mode is for Grayscale
grayscale_array = np.array(grayscale_image)

print("\n--- Grayscale Conversion (Feature Reduction) ---")
# NOTE: the grayscale array is 2-D, so the shape printed here is (H, W) only,
# even though the label still mentions channels.
print(f"Grayscale Image Size (H, W, Channels): {grayscale_array.shape}")
print(f"Total Features (Pixels) after Grayscale: {grayscale_array.size}")

# Display the grayscale image (optional)
# plt.imshow(grayscale_image, cmap='gray')
# plt.title("Grayscale Image")
# plt.axis('off')
# plt.show()

# 3. Apply Simple Feature Engineering: Pixel Flattening
# Flattening converts the 2D (or 3D) matrix into a 1D feature vector
# for input into a traditional ML algorithm.
flattened_features = grayscale_array.flatten()

print("\n--- Pixel Flattening (Vectorization) ---")
print(f"Flattened Feature Vector Shape: {flattened_features.shape}")
print("\nFirst 10 Features (Pixel Values) in Machine Understandable Format:")
print(flattened_features[:10])

# Verify the type - it's a numerical array, ready for an ML model
print(f"\nData Type of Final Features: {flattened_features.dtype}")

--- Loaded Image from File: C:/Users/Dell/Desktop/Lab 03/SIMBA.jpeg ---
Original Image Size (H, W, Channels): (1080, 1080, 3)
Total Features (Pixels) in RGB: 3499200

--- Grayscale Conversion (Feature Reduction) ---
Grayscale Image Size (H, W, Channels): (1080, 1080)
Total Features (Pixels) after Grayscale: 1166400

--- Pixel Flattening (Vectorization) ---
Flattened Feature Vector Shape: (1166400,)

First 10 Features (Pixel Values) in Machine Understandable Format:
[0 0 0 0 0 0 0 0 0 0]

Data Type of Final Features: uint8
