In [1]:
# EDA for the actual (full-size) UCF dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define constants and labels
import os  # local to this cell: only needed for the dataset-root override below

# The dataset root may be overridden with the UCF_DATA_DIR environment
# variable; the default keeps the original hard-coded location.
DATA_ROOT = os.environ.get("UCF_DATA_DIR", "/Users/rukmini/Downloads/archive")
train_dir = os.path.join(DATA_ROOT, "train")
test_dir = os.path.join(DATA_ROOT, "test")

SEED = 12
IMG_HEIGHT = 64
IMG_WIDTH = 64
BATCH_SIZE = 64
EPOCHS = 1
LR = 0.00003
NUM_CLASSES = 14
CLASS_LABELS = ['Abuse', 'Arrest', 'Arson', 'Assault', 'Burglary', 'Explosion', 'Fighting', "Normal", 'RoadAccidents', 'Robbery', 'Shooting', 'Shoplifting', 'Stealing', 'Vandalism']

# Data generators
preprocess_fun = None  # Add your preprocessing function if needed

train_datagen = ImageDataGenerator(horizontal_flip=True,
                                   width_shift_range=0.1,
                                   height_shift_range=0.05,
                                   rescale=1./255,
                                   preprocessing_function=preprocess_fun)

test_datagen = ImageDataGenerator(rescale=1./255,
                                  preprocessing_function=preprocess_fun)

train_generator = train_datagen.flow_from_directory(directory=train_dir,
                                                    target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=True,
                                                    color_mode="rgb",
                                                    class_mode="categorical",
                                                    seed=SEED)

test_generator = test_datagen.flow_from_directory(directory=test_dir,
                                                   target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=False,
                                                   color_mode="rgb",
                                                   class_mode="categorical",
                                                   seed=SEED)


def _counts_by_class(generator):
    """Return a ``{class directory name: image count}`` dict for a directory iterator.

    A single ``np.bincount`` pass replaces the per-class ``list(...).count(i)``
    scan, and ``minlength`` keeps classes that contain no images (count 0)
    instead of silently dropping them.
    """
    per_index = np.bincount(generator.classes, minlength=len(generator.class_indices))
    return {name: int(per_index[idx]) for name, idx in generator.class_indices.items()}


def _plot_category_bars(frame, column, title, ylabel, palette):
    """Draw one bar per category for ``column`` of ``frame`` and show the figure."""
    plt.figure(figsize=(14, 7))
    sns.barplot(data=frame, x='Category', y=column, palette=palette)
    plt.title(title, fontsize=16)
    plt.xlabel('Category', fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.xticks(rotation=45, fontsize=12)
    plt.tight_layout()
    plt.show()


# Extract distribution data for train and test sets.
# Counts are aligned by directory name so that a class missing from one split
# gets a 0 rather than shifting every following row.
train_counts_by_name = _counts_by_class(train_generator)
test_counts_by_name = _counts_by_class(test_generator)
class_dirs = sorted(set(train_counts_by_name) | set(test_counts_by_name))

train_class_counts = [train_counts_by_name.get(name, 0) for name in class_dirs]
test_class_counts = [test_counts_by_name.get(name, 0) for name in class_dirs]

# CLASS_LABELS are display names applied by position; fall back to the
# directory names when the number of discovered classes does not match.
if len(class_dirs) == len(CLASS_LABELS):
    category_names = CLASS_LABELS
else:
    print(f"Warning: found {len(class_dirs)} class directories but "
          f"{len(CLASS_LABELS)} labels; using directory names instead.")
    category_names = class_dirs

# Convert to DataFrame for easier manipulation and visualization
data_distribution = pd.DataFrame({
    'Category': category_names,
    'Train_Count': train_class_counts,
    'Test_Count': test_class_counts
})

# Add additional statistics
data_distribution['Total_Count'] = data_distribution['Train_Count'] + data_distribution['Test_Count']
data_distribution['Train_Percentage'] = (data_distribution['Train_Count'] / data_distribution['Total_Count']) * 100
data_distribution['Test_Percentage'] = (data_distribution['Test_Count'] / data_distribution['Total_Count']) * 100

# Display basic statistics
print("Dataset Distribution Statistics:\n")
print(data_distribution.describe())

# Plot the distribution for train and test sets
plt.figure(figsize=(14, 7))
sns.barplot(data=data_distribution.melt(id_vars='Category', value_vars=['Train_Count', 'Test_Count'],
                                        var_name='Dataset', value_name='Image_Count'),
            x='Category', y='Image_Count', hue='Dataset', palette='viridis')
plt.title('Distribution Across Train and Test Datasets', fontsize=16)
plt.xlabel('Category', fontsize=14)
plt.ylabel('Image Count', fontsize=14)
plt.xticks(rotation=45, fontsize=12)
plt.legend(title='Dataset', fontsize=12)
plt.tight_layout()
plt.show()

# Visualize percentage split of train and test sets per category
_plot_category_bars(data_distribution, 'Train_Percentage',
                    'Percentage of Training Images per Category', 'Percentage (%)', 'crest')

# Highlight the least represented categories (smallest total image count)
imbalanced_categories = data_distribution.sort_values(by='Total_Count', ascending=True)
print("Least Represented Categories:\n")
print(imbalanced_categories[['Category', 'Total_Count', 'Train_Count', 'Test_Count']].head())

# 1. Correlation between train and test distributions
correlation = data_distribution['Train_Count'].corr(data_distribution['Test_Count'])
print(f"Correlation between Train and Test Distributions: {correlation:.2f}")

# 2. Train-to-test ratio per category
# A test count of 0 is mapped to NaN so the ratio is NaN instead of inf.
data_distribution['Train_Test_Ratio'] = (
    data_distribution['Train_Count'] / data_distribution['Test_Count'].replace(0, np.nan)
)
_plot_category_bars(data_distribution, 'Train_Test_Ratio',
                    'Train-to-Test Ratio per Category', 'Train/Test Ratio', 'coolwarm')

# 3. Visualize total image count per category
_plot_category_bars(data_distribution, 'Total_Count',
                    'Total Image Count per Category', 'Total Image Count', 'mako')

# 4. Highlight categories with the highest imbalance
# 'Imbalance_Ratio' is the train share minus the test share, in percentage points.
imbalance_ratio = data_distribution['Train_Percentage'] - data_distribution['Test_Percentage']
data_distribution['Imbalance_Ratio'] = imbalance_ratio
most_imbalanced = data_distribution.sort_values(by='Imbalance_Ratio', ascending=False)
print("Categories with Highest Imbalance:\n")
print(most_imbalanced[['Category', 'Imbalance_Ratio']].head())




In [2]:
# EDA for the reduced dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define constants and labels
import os  # local to this cell: only needed for the dataset-root override below

# NOTE(review): this cell is labelled "reduced dataset" but its default root is
# the same directory the full-dataset cell reads. Set UCF_REDUCED_DATA_DIR to
# the reduced dataset's location (or confirm the archive on disk has been
# swapped); the default below preserves the original hard-coded path.
REDUCED_DATA_ROOT = os.environ.get("UCF_REDUCED_DATA_DIR", "/Users/rukmini/Downloads/archive")
train_dir = os.path.join(REDUCED_DATA_ROOT, "train")
test_dir = os.path.join(REDUCED_DATA_ROOT, "test")

SEED = 12
IMG_HEIGHT = 64
IMG_WIDTH = 64
BATCH_SIZE = 64
EPOCHS = 1
LR = 0.00003
NUM_CLASSES = 14
CLASS_LABELS = ['Abuse', 'Arrest', 'Arson', 'Assault', 'Burglary', 'Explosion', 'Fighting', "Normal", 'RoadAccidents', 'Robbery', 'Shooting', 'Shoplifting', 'Stealing', 'Vandalism']

# Data generators
preprocess_fun = None  # Add your preprocessing function if needed

train_datagen = ImageDataGenerator(horizontal_flip=True,
                                   width_shift_range=0.1,
                                   height_shift_range=0.05,
                                   rescale=1./255,
                                   preprocessing_function=preprocess_fun)

test_datagen = ImageDataGenerator(rescale=1./255,
                                  preprocessing_function=preprocess_fun)

train_generator = train_datagen.flow_from_directory(directory=train_dir,
                                                    target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=True,
                                                    color_mode="rgb",
                                                    class_mode="categorical",
                                                    seed=SEED)

test_generator = test_datagen.flow_from_directory(directory=test_dir,
                                                   target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=False,
                                                   color_mode="rgb",
                                                   class_mode="categorical",
                                                   seed=SEED)


# Extract distribution data for train and test sets.
# One np.bincount pass per split replaces the per-class list(...).count(i)
# scan; minlength keeps classes with no images as 0. Building the frame from
# name-indexed Series aligns the two splits by directory name, so a class that
# is absent from one split becomes 0 instead of shifting the rows after it.
counts_by_split = {}
for count_column, generator in (('Train_Count', train_generator),
                                ('Test_Count', test_generator)):
    per_index = np.bincount(generator.classes, minlength=len(generator.class_indices))
    counts_by_split[count_column] = pd.Series(
        {name: int(per_index[idx]) for name, idx in generator.class_indices.items()},
        dtype='int64',
    )
counts_frame = pd.DataFrame(counts_by_split).fillna(0).astype(int).sort_index()

train_class_counts = counts_frame['Train_Count'].tolist()
test_class_counts = counts_frame['Test_Count'].tolist()

# CLASS_LABELS are display names applied by position; fall back to the
# directory names when the number of discovered classes does not match.
category_names = CLASS_LABELS
if len(counts_frame) != len(CLASS_LABELS):
    print(f"Warning: found {len(counts_frame)} class directories but "
          f"{len(CLASS_LABELS)} labels; using directory names instead.")
    category_names = list(counts_frame.index)

# Convert to DataFrame for easier manipulation and visualization
data_distribution = pd.DataFrame({
    'Category': category_names,
    'Train_Count': train_class_counts,
    'Test_Count': test_class_counts
})

# Add additional statistics
data_distribution['Total_Count'] = data_distribution['Train_Count'] + data_distribution['Test_Count']
data_distribution['Train_Percentage'] = (data_distribution['Train_Count'] / data_distribution['Total_Count']) * 100
data_distribution['Test_Percentage'] = (data_distribution['Test_Count'] / data_distribution['Total_Count']) * 100

# Display basic statistics
print("Dataset Distribution Statistics:\n")
print(data_distribution.describe())

# Plot the distribution for train and test sets
counts_long = data_distribution.melt(id_vars='Category',
                                     value_vars=['Train_Count', 'Test_Count'],
                                     var_name='Dataset', value_name='Image_Count')
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=counts_long, x='Category', y='Image_Count', hue='Dataset',
            palette='viridis', ax=ax)
ax.set_title('Distribution Across Train and Test Datasets', fontsize=16)
ax.set_xlabel('Category', fontsize=14)
ax.set_ylabel('Image Count', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=12)
ax.legend(title='Dataset', fontsize=12)
fig.tight_layout()
plt.show()

# Visualize percentage split of train and test sets per category
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=data_distribution, x='Category', y='Train_Percentage', palette='crest', ax=ax)
ax.set_title('Percentage of Training Images per Category', fontsize=16)
ax.set_xlabel('Category', fontsize=14)
ax.set_ylabel('Percentage (%)', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=12)
fig.tight_layout()
plt.show()

# Highlight the least represented categories (smallest total image count)
imbalanced_categories = data_distribution.sort_values(by='Total_Count', ascending=True)
print("Least Represented Categories:\n")
print(imbalanced_categories[['Category', 'Total_Count', 'Train_Count', 'Test_Count']].head())

# 1. Correlation between train and test distributions
correlation = data_distribution['Train_Count'].corr(data_distribution['Test_Count'])
print(f"Correlation between Train and Test Distributions: {correlation:.2f}")

# 2. Train-to-test ratio per category
# A test count of 0 is mapped to NaN so the ratio is NaN instead of inf.
data_distribution['Train_Test_Ratio'] = (
    data_distribution['Train_Count'] / data_distribution['Test_Count'].replace(0, np.nan)
)

# 2-3. One bar chart each for the train/test ratio and the total image count.
for column, title, ylabel, palette in (
    ('Train_Test_Ratio', 'Train-to-Test Ratio per Category', 'Train/Test Ratio', 'coolwarm'),
    ('Total_Count', 'Total Image Count per Category', 'Total Image Count', 'mako'),
):
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.barplot(data=data_distribution, x='Category', y=column, palette=palette, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Category', fontsize=14)
    ax.set_ylabel(ylabel, fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=45, fontsize=12)
    fig.tight_layout()
    plt.show()

# 4. Highlight categories with the highest imbalance
# 'Imbalance_Ratio' is the train share minus the test share, in percentage points.
imbalance_ratio = data_distribution['Train_Percentage'] - data_distribution['Test_Percentage']
data_distribution['Imbalance_Ratio'] = imbalance_ratio
most_imbalanced = data_distribution.sort_values(by='Imbalance_Ratio', ascending=False)
print("Categories with Highest Imbalance:\n")
print(most_imbalanced[['Category', 'Imbalance_Ratio']].head())


