In [2]:
import pandas as pd
import difflib
from itertools import combinations
import networkx as nx
import os

# Define the similarity calculation function using difflib
def calculate_combined_similarity(ocr1, ocr2):
    """Return a weighted similarity score for two OCR strings.

    Four partial scores are blended:

    * the ``difflib`` ratio of the full strings (weight 0.5),
    * the ratio of the last four characters (weight 0.3),
    * an exact-match flag for the fifth character from the end
      (weight 0.05), where a string shorter than five characters
      contributes an empty character,
    * the ratio of everything before the last five characters
      (weight 0.15).

    Args:
        ocr1: First OCR string.
        ocr2: Second OCR string.

    Returns:
        The weighted sum of the four partial scores.
    """
    def _ratio(left, right):
        return difflib.SequenceMatcher(None, left, right).ratio()

    whole_score = _ratio(ocr1, ocr2)
    tail_score = _ratio(ocr1[-4:], ocr2[-4:])

    # Character sitting just before the four-character tail, if any.
    pivot1 = '' if len(ocr1) <= 4 else ocr1[-5]
    pivot2 = '' if len(ocr2) <= 4 else ocr2[-5]
    pivot_score = int(pivot1 == pivot2)

    head_score = _ratio(ocr1[:-5], ocr2[:-5])

    # Accumulate in a fixed order so the floating-point result is stable.
    score = whole_score * 0.5
    score = score + tail_score * 0.3
    score = score + pivot_score * 0.05
    score = score + head_score * 0.15
    return score

# Set model and accuracy based on the most frequent value in each group
def get_most_frequent(series):
    """Return the most frequent value in *series*.

    When several values tie, the first entry of ``series.mode()`` is
    returned.

    Args:
        series: A pandas Series.

    Returns:
        The most frequent value, or ``None`` when ``series.mode()`` is
        empty (for example an empty or all-missing series), so that
        aggregating over such a group does not raise ``IndexError``.
    """
    modes = series.mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Process each merged CSV file and apply the grouping
def process_file(file_path, output_folder, threshold=0.7):
    """Group similar OCR readings from one merged CSV and save the results.

    The CSV is expected to contain the columns 'ocr', 'accuracy', 'frame'
    and 'TIME' (used for grouping/aggregation) and 'video', 'ID' and
    'color' (dropped from the unmatched output).

    Every pair of OCR values is scored with
    ``calculate_combined_similarity``; pairs scoring above *threshold*
    are linked in a graph and each connected component becomes one group.
    Two files are written to *output_folder*:

    * ``<name>_grouped.csv``: one row per group with the list of OCR
      values, mean accuracy, min/max frame and min/max TIME.
    * ``<name>_unmatched.csv``: the rows whose OCR value was not placed
      in any group.

    Args:
        file_path: Path of the merged CSV file to process.
        output_folder: Folder that receives the two result files; it is
            created if it does not exist.
        threshold: Minimum (exclusive) combined similarity for two OCR
            values to be linked. Defaults to 0.7.
    """
    # Read 'ocr' as text so purely numeric readings are not parsed as
    # numbers, which would break the string slicing in the similarity step.
    data = pd.read_csv(file_path, dtype={'ocr': str})

    # Make the function usable on its own, without the caller having to
    # create the output folder first.
    os.makedirs(output_folder, exist_ok=True)

    # Missing OCR values cannot be compared as strings; leave them out of
    # the pairwise comparison so they are never assigned to a group.
    valid_ocr = data['ocr'].dropna()

    # Build a graph whose edges connect sufficiently similar OCR values
    G = nx.Graph()
    for ocr1, ocr2 in combinations(valid_ocr, 2):
        similarity = calculate_combined_similarity(ocr1, ocr2)
        if similarity > threshold:
            G.add_edge(ocr1, ocr2, weight=similarity)

    # Each connected component is one group of similar OCR values
    ocr_to_group = {}
    for group_id, component in enumerate(nx.connected_components(G)):
        for ocr in component:
            ocr_to_group[ocr] = group_id

    # Add group information to the original dataframe
    data['group'] = data['ocr'].map(ocr_to_group)

    # Aggregate per group; 'video', 'ID', 'direction' and 'color' are not carried over
    grouped_data = data.groupby('group').agg({
        'ocr': list,                  # keep every OCR value of the group as a list
        'accuracy': 'mean',           # mean accuracy
        'frame': ['min', 'max'],      # first and last frame
        'TIME': ['min', 'max']        # earliest and latest TIME
    }).reset_index()

    # Flatten the multi-level columns
    grouped_data.columns = ['group', 'ocr_list', 'accuracy', 'min_frame', 'max_frame', 'entry_time', 'exit_time']

    # Save grouped data
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    grouped_file_path = os.path.join(output_folder, f'{base_name}_grouped.csv')
    grouped_data.to_csv(grouped_file_path, index=False)

    # Find OCR values that did not end up in any group
    matched_ocr = set(ocr_to_group)
    all_ocr = set(data['ocr'])
    unmatched_ocr = all_ocr - matched_ocr

    # Get the unmatched rows, excluding 'video', 'ID', and 'color'
    unmatched_data = data[data['ocr'].isin(unmatched_ocr)].drop(columns=['video', 'ID', 'color'])
    unmatched_file_path = os.path.join(output_folder, f'{base_name}_unmatched.csv')
    unmatched_data.to_csv(unmatched_file_path, index=False)

    print(f"Grouped data saved to {grouped_file_path}")
    print(f"Unmatched data saved to {unmatched_file_path}")

# Paths: folder holding the merged CSV files and folder for the results
merged_folder = 'C:/Users/PC/Desktop/caffein/montana/integration_0901_0912'
output_folder = 'C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912'

# Create the output folder if it does not exist yet
os.makedirs(output_folder, exist_ok=True)

# Process the merged CSV files; os.listdir order is arbitrary, so sort
# the names to get a deterministic processing order
merged_files = sorted(f for f in os.listdir(merged_folder) if f.endswith('.csv'))

for merged_file in merged_files:
    file_path = os.path.join(merged_folder, merged_file)
    process_file(file_path, output_folder)


Grouped data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0901_merged_grouped.csv
Unmatched data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0901_merged_unmatched.csv
Grouped data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0902_merged_grouped.csv
Unmatched data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0902_merged_unmatched.csv
Grouped data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0903_merged_grouped.csv
Unmatched data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0903_merged_unmatched.csv
Grouped data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0904_merged_grouped.csv
Unmatched data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0904_merged_unmatched.csv
Grouped data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0905_merged_grouped.csv
Unmatched data saved to C:/Users/PC/Desktop/caffein/montana/in_out_0901_0912\0905_merged_unma