In [6]:
import warnings
import pandas as pd
import os
import re
from datetime import datetime
import numpy as np
warnings.filterwarnings('ignore')
%matplotlib inline
#https://discord.com/channels/1134059900666916935/1283610000484208670

In [7]:
# Define input and output folders
input_folder = "data/txt"
output_folder = "data/output"
file_paths = [os.path.join(input_folder, file) for file in os.listdir(input_folder) if file.endswith(".txt")]

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Job dictionary and spec ranges
job_dict = {
    "허": "허밋", "시프": "시프", "썬": "썬콜", "불독": "불독",
    "프": "프리스트", "레": "레인저", "저": "저격수",
    "용": "용기사", "크": "크루세이더", "나": "나이트"
}
job_spec_ranges = {
    "용": (3000, 9000), "크": (2000, 9000), "나": (2000, 9000),
    "허": (1500, 4000), "시프": (1500, 5000), "썬": (500, 1200),
    "불독": (500, 1200), "프": (500, 1200), "레": (2000, 9000),
    "저": (2000, 9000)
}
level_min, level_max = 80, 200

# Regex patterns
time_pattern = r"(오전|오후) \d{1,2}:\d{2}"
job_pattern = r"(\d{2,3})\s?(" + "|".join(job_dict.keys()) + r")"
spec_pattern = r"(\d{3,4})"
map_pattern = r"(망용둥|위둥|남둥|큰둥|와협|블와둥|협동|레와둥|붉켄|검켄|푸켄|불어전|물어전|오징어|깊바협|망둥쩔|듀파|듀미굴|갈림길|산양|하둥)"

In [8]:
def extract_level_and_spec(text, job_start, job_end, level_range, spec_range):
    """
    Extracts level and spec values by searching limited characters around the job.
    Level is searched from the left, and spec is searched from the right.
    """
    # Limit the search space
    left_text = text[job_start-5 : job_start+5]  # 6 characters left of the job
    right_text = text[job_end+1 : job_end + 10]  # 6 characters right of the job

    # print ("left:", left_text, "right:", right_text)
    
    level, spec = None, None

    # Search for level in the left_text
    for match in re.finditer(r"\d{2,3}", left_text[::-1]):  # Reverse for easier parsing
        num = int(match.group()[::-1])  # Reverse back the number
        if level_range[0] <= num <= level_range[1]:
            level = num
            break

    # Search for spec in the right_text
    for match in re.finditer(r"\d{3,4}", right_text):
        num = int(match.group())
        if spec_range[0] <= num <= spec_range[1]:
            spec = num
            break

    return level, spec

In [39]:
def collect_initial_data(file_path):
    columns = ["party_id", "time", "level", "job", "spec", "map", "date", "valid", "valid_spec"]
    df = pd.DataFrame(columns=columns)

    # Extract date from filename
    month = file_path[-8:-6]
    day = file_path[-6:-4]
    date_from_file = f"2024-{month}-{day}"
    
    party_id = 1
    
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    blocks = data.split("오늘")
    for block in blocks[1:]:
        time_match = re.search(time_pattern, block)
        time = time_match.group(0) if time_match else None
        map_match = re.search(map_pattern, block)
        map_name = map_match.group(0) if map_match else None

        job_matches = list(re.finditer(job_pattern, block))
        for job_match in job_matches:
            job_short = job_match.group(2)
            job_full = job_dict.get(job_short, "Unknown")

            level, spec = extract_level_and_spec(
                block, job_match.start(), job_match.end(),
                (level_min, level_max), job_spec_ranges.get(job_short, (None, None))
            )
            
            valid = map_name is not None and job_full != "Unknown"

            # print(level, job_full, spec)
            
            # Append new row to DataFrame
            df = pd.concat([
                df,
                pd.DataFrame([{
                    "party_id": party_id,
                    "time": time,
                    "level": level,
                    "job": job_full,
                    "spec": spec,
                    "map": map_name,
                    "date": date_from_file,
                    "valid": valid,
                    "valid_spec": None
                }])
            ], ignore_index=True)

        party_id += 1

    # Remove duplicates
    df = df.drop_duplicates(subset=["map", "level", "job", "spec"], keep="first")

    # Filter Level Outlier
    level_percentile=0.02
    df_level_not_na = df[df['level'].notna()]
    level_threshold = np.percentile(df_level_not_na['level'], level_percentile)
    df = df_level_not_na[df_level_not_na['level'] >= level_threshold]
    
    #스펙 결측치 처리
    # Step 1: Calculate spec_by_level, handling NaN values for spec and level
    df["spec_by_level"] = df.apply(
        lambda row: row["spec"] / row["level"] if pd.notna(row["spec"]) and pd.notna(row["level"]) else None,
        axis=1
    )
    
    # Step 2: Calculate average spec_by_level by job
    avg_spec_by_level = df.groupby("job")["spec_by_level"].mean().rename("avg_spec_by_level")
    
    # Step 3: Merge the avg_spec_by_level back into the dataframe
    df = df.merge(avg_spec_by_level, on="job", how="left")
    
    # Step 4: Fill NaN in spec_by_level with the average spec_by_level for the job
    df["spec_by_level"].fillna(df["avg_spec_by_level"], inplace=True)
    
    # Step 5: Determine spec_valid based on 20% margin from avg_spec_by_level
    def is_spec_valid(row):
        if pd.notna(row["spec"]):  # Only proceed if spec is not NaN
            if pd.notna(row["spec_by_level"]) and pd.notna(row["avg_spec_by_level"]):
                lower_bound = 0.8 * row["avg_spec_by_level"]
                upper_bound = 1.2 * row["avg_spec_by_level"]
                return lower_bound <= row["spec_by_level"] <= upper_bound
        return False  # Return False if spec is NaN or not within the range
    
    # Step 6: Apply the is_spec_valid function to the dataframe
    df["valid_spec"] = df.apply(is_spec_valid, axis=1)
    
    # Step 7: Create spec_filled: if spec is NaN, fill with level * avg_spec_by_level, else keep spec as is
    df["spec_filled"] = df.apply(
        lambda row: row["spec"] if pd.notna(row["spec"]) else (
            row["level"] * row["avg_spec_by_level"] if pd.notna(row["level"]) and pd.notna(row["avg_spec_by_level"]) else None
        ),
        axis=1
    )
    
    # Step 8: Calculate the average distance between spec and avg_spec_by_level for each job
    df["spec_distance"] = df.apply(
        lambda row: abs(row["spec"] - row["avg_spec_by_level"]) if pd.notna(row["spec"]) and pd.notna(row["avg_spec_by_level"]) else None,
        axis=1
    )
    
    # Step 9: Calculate the average distance for each job
    job_avg_distances = df.groupby("job")["spec_distance"].mean().rename("avg_distance_per_job")
    
    # Step 10: Merge the average distance per job back into the dataframe
    df = df.merge(job_avg_distances, on="job", how="left")

    JITTER_PERCENTAGE = 0.03
    # Step 11: Apply jitter based on the average distance per job
    # Apply jitter ONLY if spec_filled is created (i.e., spec is NaN)
    df["spec_filled"] = df.apply(
    lambda row: row["spec_filled"] + np.random.uniform(-row["spec_filled"] * JITTER_PERCENTAGE, row["spec_filled"] * JITTER_PERCENTAGE)
    if pd.isna(row["spec"]) and pd.notna(row["spec_filled"]) else row["spec_filled"],  # Apply jitter only when spec is missing
    axis=1
    )
    
    output_file = os.path.join(output_folder, os.path.basename(file_path).replace('.txt', '.csv'))
    df.to_csv(output_file, index=False)
    print(f"Data saved as {output_file}")
    return df

In [18]:
def process_data(df, file_path):
    """
    Filters and processes data to save only rows where valid and valid_spec are True.
    """
    # Filter rows where both 'valid' and 'valid_spec' are True
    processed_df = df[(df["valid"]) & (df["valid_spec"])]

    # Save the processed DataFrame to a CSV file
    processed_output_file = os.path.join(output_folder, os.path.basename(file_path).replace('.txt', '_processed.csv'))
    processed_df.to_csv(processed_output_file, index=False)

    print(f"Processed valid data saved as {processed_output_file}")
    return processed_df

In [19]:
def process_files(file_paths, output_folder):
    """
    Processes multiple text files, combines them into a single DataFrame for valid processed data,
    and saves the final output as CSV files.

    Parameters:
        file_paths (list): List of paths to the text files.
        output_folder (str): Path to the folder where CSV files will be saved.
    """
    all_raw_data = []
    all_valid_processed_data = []

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for file_path in file_paths:
        # Collect initial raw data
        raw_df = collect_initial_data(file_path)

        # Process and filter valid data
        valid_processed_df = process_data(raw_df, file_path)

        all_raw_data.append(raw_df)
        all_valid_processed_data.append(valid_processed_df)

    # Combine all raw DataFrames
    combined_raw_df = pd.concat(all_raw_data, ignore_index=True)
    combined_valid_processed_df = pd.concat(all_valid_processed_data, ignore_index=True)

    # Save the combined DataFrames
    combined_raw_output = os.path.join(output_folder, "df.csv")
    combined_processed_output = os.path.join(output_folder, "processed_df.csv")

    combined_raw_df.to_csv(combined_raw_output, index=False)
    combined_valid_processed_df.to_csv(combined_processed_output, index=False)

    print(f"Combined raw data saved as '{combined_raw_output}'")
    print(f"Combined valid processed data saved as '{combined_processed_output}'")

    return combined_raw_df, combined_valid_processed_df

In [29]:
# Process files and save results
df, processed_df = process_files(file_paths, output_folder)

Data saved as data/output/1219.csv
Processed valid data saved as data/output/1219_processed.csv
Data saved as data/output/1218.csv
Processed valid data saved as data/output/1218_processed.csv
Data saved as data/output/1220.csv
Processed valid data saved as data/output/1220_processed.csv
Data saved as data/output/1221.csv
Processed valid data saved as data/output/1221_processed.csv
Data saved as data/output/1213.csv
Processed valid data saved as data/output/1213_processed.csv
Data saved as data/output/1212.csv
Processed valid data saved as data/output/1212_processed.csv
Data saved as data/output/1211.csv
Processed valid data saved as data/output/1211_processed.csv
Data saved as data/output/1215.csv
Processed valid data saved as data/output/1215_processed.csv
Data saved as data/output/1214.csv
Processed valid data saved as data/output/1214_processed.csv
Data saved as data/output/1216.csv
Processed valid data saved as data/output/1216_processed.csv
Data saved as data/output/1217.csv
Proce