# 제주어 데이터프레임화 시키기

In [2]:
import json
import pandas as pd

# JSON 파일 경로
file_path = 'Training/DZES20000002.json'

# 필요한 speaker_id, form, standard_form, dialect_form, isDialect 부분만 추출
def load_and_extract_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    utterances = data['utterance']
    speakers = {speaker['id']: speaker for speaker in data['speaker'] if speaker['id'] is not None}
    default_speaker_info = {
        'age': 'Unknown', 
        'sex': 'Unknown', 
        'occupation': 'Unknown', 
        'birthplace': 'Unknown', 
        'principal_residence' : 'Unknown',
        'current_residence' : 'Unknown',
        'education': 'Unknown'
    }
    extracted_data = []

    for utterance in utterances:
        speaker_id = utterance['speaker_id']
        speaker_info = speakers.get(speaker_id, default_speaker_info)
        form = utterance['form']
        standard_form = utterance['standard_form']
        dialect_form = utterance['dialect_form']
        isDialect = any(eojeol['isDialect'] for eojeol in utterance['eojeolList'])  # Check if any eojeol is a dialect

        utterance_data = {
            'speaker_id': speaker_id,
            'age': speaker_info['age'],
            'sex': speaker_info['sex'],
            'occupation': speaker_info['occupation'],
            'birthplace': speaker_info['birthplace'],
            'principal_residence': speaker_info['principal_residence'],
            'current_residence': speaker_info['current_residence'],
            'education': speaker_info['education'],
            'form': form,
            'standard_form': standard_form,
            'dialect_form': dialect_form,
            'isDialect': isDialect,
        }

        extracted_data.append(utterance_data)

    return extracted_data

# 데이터프레임화
extracted_data = load_and_extract_data(file_path)
df = pd.DataFrame(extracted_data)

Training 폴더 안에 많은 json 파일들이 있어.

폴더 경로만 입력해주면 자동으로 json 파일을 탐색해서
아래 코드처럼 데이터프레임화 시켜주는 코드를 작성해줄래??

## json 파일 읽어서 데이터프레임화

In [4]:
import json
import pandas as pd
import os
import glob

# Define the function to process JSON files in a directory
def process_json_files(directory_path):
    # Use glob to find all JSON files in the directory
    json_files = glob.glob(os.path.join(directory_path, '*.json'))

    # Loop over each file path
    for file_path in json_files:
        # Use your existing function to load and extract data
        extracted_data = load_and_extract_data(file_path)

        # Create a DataFrame from the extracted data
        df = pd.DataFrame(extracted_data)

        # Here you can either return the DataFrame,
        # print it out, or perhaps write it to a CSV or another file
        print(f"Processed {file_path}")
        # For example, you could save each as a CSV
        df.to_csv(file_path.replace('.json', '.csv'), index=False)

        # If you want to combine all DataFrames into one, you could append them to a list
        # and concatenate them after the loop

    # If combining, return the concatenated DataFrame
    # return pd.concat(dfs_list, ignore_index=True)

# Path to the Training directory
training_dir_path = '../2024_1학기_제주어번역모델생성/Training'  # Replace with your actual directory path

# Execute the function
process_json_files(training_dir_path)

Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000002.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000006.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000008.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000009.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000011.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000012.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000013.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000022.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000023.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000026.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000027.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000028.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000029.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000030.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000031.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000032.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000033.js

## csv 파일 읽어서 위 아래로 병합

In [5]:
# Execute the function
process_json_files(training_dir_path)

import pandas as pd
import glob

# Replace with your actual directory path
training_dir_path = '../2024_1학기_제주어번역모델생성/Training'

# Use glob to list all CSV files in the directory
csv_files = glob.glob(os.path.join(training_dir_path, '*.csv'))

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through all the files
for csv_file in csv_files:
    # Read the current CSV into a DataFrame
    df = pd.read_csv(csv_file)
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all the DataFrames into a single one
merged_df = pd.concat(dataframes, ignore_index=True)

Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000002.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000006.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000008.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000009.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000011.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000012.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000013.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000022.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000023.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000026.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000027.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000028.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000029.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000030.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000031.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000032.json
Processed ../2024_1학기_제주어번역모델생성/Training\DZES20000033.js

In [8]:
merged_df = merged_df.loc[merged_df.isDialect == True]

In [9]:
# 불용어 제거 - 감정표현
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('(', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace(')', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('#', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('{laughing}', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('{laughing5}', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('{laughing6}', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('{applauding}', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('{clearing}', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('{singing}', ''))

merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('(', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace(')', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('#', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('{laughing}', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('{laughing5}', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('{laughing6}', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('{applauding}', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('{clearing}', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('{singing}', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('(', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace(')', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['

In [10]:
# 불용어 제거 - 이름

for i in range(1, 101):
    merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace(f'&name{i}&', ''))
    merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace(f'&name{i}&', ''))

    merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace(f'&company-name{i}&', ''))
    merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace(f'&company-name{i}&', ''))


merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&name&', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&name&', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&name5 &', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&name5 &', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&namE5&', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&namE5&', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&name3$', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&name3$', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('@name@', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('@name@', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('@name3', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('@name3', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&name@', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&name@', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&company-name&', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&company-name&', ''))
merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace('&compny-name2&', ''))
merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace('&compny-name2&', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['standard_form'] = merged_df['standard_form'].apply(lambda x: str(x).replace(f'&name{i}&', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['dialect_form'] = merged_df['dialect_form'].apply(lambda x: str(x).replace(f'&name{i}&', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

파이썬 데이터프레임의 standard_form column과 dialect_form column에 대해서 전처리 해줄거야.

띄어쓰기를 기준으로 토큰화해서 각 요소들을 살피는데
만약 토큰 중에 '/' 이 포함되어 있다면

제주어 column에서는 / 앞에 위치한 단어를 고르고
표준어 column에서는 / 뒤에 위치한 단어를 고르도록 전처리해주고 싶어.

함수에 대한 자세한 설명과 실제 데이터프레임에 적용하는 부분까지 코드 작성 부탁해.

<예시_1>\
예시문장 : 이렇게 하다그네/하다가 큰일 나\
dialect_form : 이렇게 하다그네 큰일 나\
standard_form : 이렇게 하다가 큰일 나

<예시_2>\
예시문장 : 겅허고/그리고 예초하다보면 돌멩이가 많이 튀어 위험해\
dialect_form : 겅허고 예초하다보면 돌멩이가 많이 튀어 위험해\
standard_form : 그리고 예초하다보면 돌멩이가 많이 튀어 위험해

In [11]:
# Let's define a function to process the standard_form and dialect_form columns as described
def preprocess_forms(row):
    # Split the sentence into tokens based on whitespace
    dialect_tokens = row['dialect_form'].split()
    standard_tokens = row['standard_form'].split()

    # For tokens that contain a '/', choose the part before '/' for dialect_form
    # and the part after '/' for standard_form
    row['dialect_form'] = ' '.join(token.split('/')[0] if '/' in token else token for token in dialect_tokens)
    row['standard_form'] = ' '.join(token.split('/')[1] if '/' in token else token for token in standard_tokens)

    return row
# Apply the preprocess function to each row in the DataFrame
processed_df = merged_df.apply(preprocess_forms, axis=1)

In [13]:
processed_df.to_csv('processed_df.csv', index = False)