## Assignment1

### Objective: Find all file paths in the Validation Set that satisfy each requirement in the file name, and arrange all outputs according to the format of the example file.

### Output: Convert all outputs into a JSON file.

### Requirement:
##### 1. “CLASS_NAME” contains “Env3”.
##### 2. “THE_GENDER_AND_COUNT” contains 2 females, with no limit on the number of males.
##### 3. “THE_GENDER_AND_COUNT” contains 1 female and no males.
##### 4. “TIME” falls between 5/6 18:13:07 and 5/7 23:24:34 (i.e. 240506_181307 to 240507_232434).
##### 5. “CLASS_NAME” contains “Env3”, “THE_GENDER_AND_COUNT” contains exactly 1 male, “POSITION” contains “5_posi”, and “TIME” falls between 5/8 09:00 and 5/8 11:00.

Load packages

In [4]:
import torch
import json
from torch.utils.data import Dataset, DataLoader

  cpu = _conversion_method_template(device=torch.device("cpu"))


Load the dataset, apply each requirement's filter condition, and write the results to JSON files

In [10]:
import json
import re

class CSI_Dataset(Dataset):
    """Dataset of file-path strings from one split, filtered by a requirement.

    Every entry of the JSON index is a '/'-separated path string. The filter
    reads component 0 as CLASS_NAME, component 2 as THE_GENDER_AND_COUNT,
    component 3 as POSITION and component 4 as TIME. Entries with fewer than
    five components are skipped.

    Args:
        split: Key of the JSON index whose entries are loaded
            (e.g. 'train', 'val' or 'test'). A missing key raises KeyError.
        requirement: Requirement number, 1 to 5, selecting the filter rule.
            Any other value selects no entries.
        json_path: Path of the JSON index file. When None (the default) the
            original hard-coded location, DEFAULT_JSON_PATH, is used.

    Attributes set by the constructor: split, requirement, json_data (the
    whole parsed index), data_split (the unfiltered entries of the split)
    and data (the filtered entries).
    """

    # Location of the index on the original author's machine; kept as the
    # default so existing calls without json_path behave as before.
    DEFAULT_JSON_PATH = r'C:\Users\Kevin\Desktop\研究所\DL\HW1\CSI_data.json'

    # Inclusive TIME windows. They are compared as plain strings, which
    # matches chronological order as long as TIME components share the
    # fixed-width, zero-padded form of these bounds (e.g. 240506_181307).
    _REQ4_WINDOW = ('240506_181307', '240507_232434')
    _REQ5_WINDOW = ('240508_090000', '240508_110000')

    def __init__(self, split='train', requirement=1, json_path=None):
        self.split = split
        self.requirement = requirement

        path = self.DEFAULT_JSON_PATH if json_path is None else json_path
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as file:
            self.json_data = json.load(file)

        self.data_split = self.json_data[split]
        self.data = self.filter_data(self.data_split, requirement)

    def filter_data(self, data_split, requirement):
        """Return the entries of data_split that satisfy requirement.

        Args:
            data_split: Iterable of '/'-separated path strings.
            requirement: Requirement number (1 to 5).

        Returns:
            A list of the matching entries in their original order. The
            number of matches is also printed.
        """
        filtered_data = [
            entry for entry in data_split
            if self._matches(entry.split('/'), requirement)
        ]
        print(f"Filtered data count for requirement {requirement}: {len(filtered_data)}")
        return filtered_data

    @classmethod
    def _matches(cls, components, requirement):
        """Tell whether the split path components satisfy one requirement."""
        if len(components) < 5:
            return False

        class_name = components[0]
        gender_and_count = components[2]
        position = components[3]
        timestamp = components[4]

        if requirement == 1:
            # Requirement 1: CLASS_NAME contains "Env3".
            return "Env3" in class_name

        if requirement == 2:
            # Requirement 2: exactly two 'F' characters in the label, e.g.
            # 'F1M1F2'; a label such as 'Female1' contains only one.
            return gender_and_count.count('F') == 2

        if requirement == 3:
            # Requirement 3: label starts with 'Female' (e.g. 'Female1').
            # The previous pattern r'Female?' made the final 'e' optional
            # and therefore also accepted labels starting with 'Femal'.
            return gender_and_count.startswith('Female')

        if requirement == 4:
            # Requirement 4: TIME inside the first window, bounds included.
            start_time, end_time = cls._REQ4_WINDOW
            return start_time <= timestamp <= end_time

        if requirement == 5:
            # Requirement 5: Env3, label starting with 'Male' (e.g. 'Male1'),
            # position containing "5_posi", TIME inside the second window.
            start_time, end_time = cls._REQ5_WINDOW
            return ("Env3" in class_name
                    and gender_and_count.startswith('Male')
                    and "5_posi" in position
                    and start_time <= timestamp <= end_time)

        # Unknown requirement numbers select nothing.
        return False

    def __len__(self):
        """Return the number of entries that passed the filter."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the filtered path string at position idx."""
        return self.data[idx]

# Write one JSON report per requirement. Each report maps every split name to
# an {index: file path} dictionary holding the entries that passed the filter.
splits = ['train', 'val', 'test']
requirements = [1, 2, 3, 4, 5]

for req in requirements:
    combined_data = {}
    for split in splits:
        dataset = CSI_Dataset(split=split, requirement=req)
        if len(dataset) == 0:
            # Nothing matched: record an empty mapping and skip the loader,
            # which would otherwise be asked for a batch size of zero.
            print(f"{split.capitalize()} set, Requirement {req}: No matching data found.")
            combined_data[split] = {}
            continue

        # A single batch as large as the dataset yields every entry at once.
        dataloader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
        output = next(iter(dataloader))
        combined_data[split] = dict(enumerate(output))

    json_filename = f'./A1_313834006_李崇楷_{req}.json'
    with open(json_filename, 'w') as json_file:
        json.dump(combined_data, json_file, indent=4)
    print(f"Saved combined output for Requirement {req} to {json_filename}")

Filtered data count for requirement 1: 158110
Filtered data count for requirement 1: 0
Val set, Requirement 1: No matching data found.
Filtered data count for requirement 1: 0
Test set, Requirement 1: No matching data found.
Saved combined output for Requirement 1 to ./A1_313834006_李崇楷_1.json
Filtered data count for requirement 2: 128820
Filtered data count for requirement 2: 0
Val set, Requirement 2: No matching data found.
Filtered data count for requirement 2: 0
Test set, Requirement 2: No matching data found.
Saved combined output for Requirement 2 to ./A1_313834006_李崇楷_2.json
Filtered data count for requirement 3: 229679
Filtered data count for requirement 3: 3622
Filtered data count for requirement 3: 4807
Saved combined output for Requirement 3 to ./A1_313834006_李崇楷_3.json
Filtered data count for requirement 4: 391067
Filtered data count for requirement 4: 0
Val set, Requirement 4: No matching data found.
Filtered data count for requirement 4: 0
Test set, Requirement 4: No match

Sanity check: count the entries per THE_GENDER_AND_COUNT, POSITION and TIME category in each split, to verify the filtered counts reported above.

In [1]:
import json
from collections import defaultdict

# Read the dataset index; each split name is looked up in it below.
with open(r'C:\Users\Kevin\Desktop\研究所\DL\HW1\CSI_data.json', 'r') as file:
    json_data = json.load(file)

# One counter per split for each of the three inspected path components
# (THE_GENDER_AND_COUNT, TIME and POSITION), keyed by the component's value.
gender_count_dict = {name: defaultdict(int) for name in ('train', 'val', 'test')}
time_count_dict = {name: defaultdict(int) for name in ('train', 'val', 'test')}
position_count_dict = {name: defaultdict(int) for name in ('train', 'val', 'test')}

# Inclusive bounds of the TIME window, compared as plain strings.
start_time = "240506_181307"
end_time = "240507_232434"

# Entries, from any split, whose TIME lies inside the window.
entries_in_time_range = []

# Tally every usable entry of every split.
for split in ['train', 'val', 'test']:
    data_split = json_data.get(split, [])

    for entry in data_split:
        components = entry.split('/')
        # Entries with fewer than five components are ignored.
        if len(components) >= 5:
            # Components 2, 3 and 4 of the path, in that order.
            THE_GENDER_AND_COUNT, POSITION, TIME = components[2:5]

            gender_count_dict[split][THE_GENDER_AND_COUNT] += 1
            position_count_dict[split][POSITION] += 1
            time_count_dict[split][TIME] += 1

            if start_time <= TIME <= end_time:
                entries_in_time_range.append(entry)

# Report the tallies split by split, one section per component.
for split in ['train', 'val', 'test']:
    print(f"\n{split.capitalize()} Set:")

    print("  THE_GENDER_AND_COUNT:")
    for gender_category in gender_count_dict[split]:
        count = gender_count_dict[split][gender_category]
        print(f"    Category '{gender_category}': {count} entries")

    print("  POSITION:")
    for position_category in position_count_dict[split]:
        count = position_count_dict[split][position_category]
        print(f"    Position '{position_category}': {count} entries")

    print("  TIME:")
    for time_category in time_count_dict[split]:
        count = time_count_dict[split][time_category]
        print(f"    Time '{time_category}': {count} entries")





Train Set:
  THE_GENDER_AND_COUNT:
    Category 'F1M1F2': 12909 entries
    Category 'F1M1M2': 11499 entries
    Category 'F1M3': 26485 entries
    Category 'F2M1': 23590 entries
    Category 'F2M2F3': 11725 entries
    Category 'F2M2M3': 11711 entries
    Category 'F2M3': 24639 entries
    Category 'F3M1': 23520 entries
    Category 'F3M2': 29418 entries
    Category 'F3M3F1': 18154 entries
    Category 'F3M3M1': 12853 entries
    Category 'Female1': 74332 entries
    Category 'Female2': 65283 entries
    Category 'Female3': 90064 entries
    Category 'Male1': 76624 entries
    Category 'Male2': 64852 entries
    Category 'Male3': 72234 entries
    Category 'None': 14773 entries
    Category 'F1M1': 19273 entries
    Category 'F2M2': 19971 entries
    Category 'F3M3': 24164 entries
    Category 'F1M1F3': 15029 entries
    Category 'F1M1M3': 11976 entries
    Category 'F2M2F1': 11645 entries
    Category 'F2M2M1': 11640 entries
    Category 'F3M3F2': 11672 entries
    Category 'F3M3M2