In [1]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import re

def read_markov_document(file_path="./data/markov.md"):
    """
    마크다운 문서 markov.md를 읽어오는 함수
    
    Args:
        file_path (str): 마크다운 파일의 경로
    
    Returns:
        str: 마크다운 문서의 전체 내용
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        print(f"마크다운 문서를 성공적으로 읽었습니다: {file_path}")
        print(f"문서 길이: {len(content)} 문자")
        return content
    except FileNotFoundError:
        print(f"파일을 찾을 수 없습니다: {file_path}")
        return None
    except Exception as e:
        print(f"파일을 읽는 중 오류가 발생했습니다: {e}")
        return None

def extract_text_content(markdown_content):
    """
    마크다운 문서에서 순수 텍스트만 추출하는 함수
    
    Args:
        markdown_content (str): 마크다운 문서 내용
    
    Returns:
        str: 추출된 순수 텍스트
    """
    if not markdown_content:
        return ""
    
    text = re.sub(r'^#{1,6}\s+', '', markdown_content, flags=re.MULTILINE)
    
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    
    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
    
    text = re.sub(r'`([^`]+)`', r'\1', text)
    
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    
    text = re.sub(r'\^[0-9]+', '', text)
    
    text = re.sub(r'\n\s*\n', '\n\n', text)
    
    return text.strip()

def analyze_markov_text():
    """
    마크다운 문서를 읽어서 분석하는 메인 함수
    """
    markdown_content = read_markov_document()
    
    if markdown_content:
        print("\n=== 마크다운 문서 내용 (처음 500자) ===")
        print(markdown_content[:500] + "...")
        
        clean_text = extract_text_content(markdown_content)
        
        print("\n=== 추출된 순수 텍스트 (처음 500자) ===")
        print(clean_text[:500] + "...")
        
        print(f"\n=== 텍스트 통계 ===")
        print(f"전체 문자 수: {len(clean_text)}")
        print(f"단어 수: {len(clean_text.split())}")
        print(f"줄 수: {len(clean_text.split(chr(10)))}")
        
        return clean_text
    
    return None

markov_text = analyze_markov_text()

마크다운 문서를 성공적으로 읽었습니다: ./data/markov.md
문서 길이: 15753 문자

=== 마크다운 문서 내용 (처음 500자) ===
_Science in Context_ **19** (4), 591–600 (2006). Copyright©CCambridge University Press
doi:10.1017/S0269889706001074 Printed in the United Kingdom

# Classical Text in Translation

# An Example of Statistical Investigation of the Text Eugene

# Onegin Concerning the Connection of Samples in Chains

## A. A. Markov

_(Lecture at the physical-mathematical faculty, Royal Academy of Sciences, St. Petersburg, 23 January
1913)_^1

This study investigates a text excerpt containing 20,000 Russian letter...

=== 추출된 순수 텍스트 (처음 500자) ===
_Science in Context_ 19 (4), 591–600 (2006). Copyright©CCambridge University Press
doi:10.1017/S0269889706001074 Printed in the United Kingdom

Classical Text in Translation

An Example of Statistical Investigation of the Text Eugene

Onegin Concerning the Connection of Samples in Chains

A. A. Markov

_(Lecture at the physical-mathematical faculty, Royal Academy of Sciences, St

In [2]:
import numpy as np
import random

# 1단계: 실제 데이터의 분산 계산하기


- 데이터 100개씩 자르기

In [12]:
def split_text_into_chunks(text, chunk_size=100):
    """
    텍스트를 100개씩 잘라서 리스트로 반환하는 함수
    """
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# 데이터 100개씩 자르기
splitted_text = split_text_into_chunks(markov_text)
print(len(splitted_text))
print(splitted_text[0])

125
_Science in Context_ 19 (4), 591–600 (2006). Copyright©CCambridge University Press
doi:10.1017/S0269


- 모음 개수 구하기

In [17]:
def count_vowels(text):
    """
    텍스트에서 모음 개수 세는 함수
    """
    vowels = 'aeiou'
    return sum(1 for char in text if char.lower() in vowels)

# count_vowels(splitted_text[0])
vowels_counts = []
for i in range(len(splitted_text)):
    vowels_counts.append(count_vowels(splitted_text[i]))

print(vowels_counts)

[18, 27, 33, 28, 26, 29, 29, 29, 33, 25, 17, 26, 24, 27, 26, 26, 30, 28, 28, 33, 29, 22, 32, 27, 30, 29, 29, 31, 29, 16, 18, 24, 25, 20, 27, 25, 24, 26, 30, 26, 27, 28, 29, 26, 35, 31, 22, 32, 29, 33, 31, 34, 29, 27, 24, 28, 25, 31, 26, 18, 34, 27, 27, 30, 28, 29, 23, 31, 22, 24, 31, 22, 31, 24, 20, 31, 25, 30, 32, 27, 34, 31, 29, 21, 33, 14, 30, 31, 31, 35, 30, 35, 29, 29, 32, 32, 31, 30, 31, 31, 34, 28, 33, 31, 27, 26, 31, 29, 28, 30, 28, 26, 35, 24, 37, 30, 27, 29, 30, 24, 32, 24, 27, 32, 13]


- 평균($\mu$) 구하기

In [18]:
def calculate_mean(count_list):
    return sum(count_list) / len(count_list)

mean_vowels = calculate_mean(vowels_counts)
print(mean_vowels)

27.904


- 편차 제곱 합 구하기

In [20]:
def calculate_squared_deviation(count_list, mean):
    return sum((count - mean) ** 2 for count in count_list)

squared_deviation = calculate_squared_deviation(vowels_counts, mean_vowels)
print(squared_deviation)

2458.848


- 분산($\sigma^2$) 구하기

In [21]:
def calculate_variance(squared_deviation, count_list):
    return squared_deviation / len(count_list)

variance = calculate_variance(squared_deviation, vowels_counts)
print(variance)

19.670784


# 2단계: 이론적 분산과 비교하기


- 이론적 분산 계산

In [22]:
n = 100
p = 0.28

n * p * (1 - p)

20.16

- 두 분산 비교

> 실제 분산: $19.670784$ <br>

> 이론적 분산: $20.16$

# 연결성

- 분산계수 구하기

In [23]:
19.670784 / 20.16

0.9757333333333333

# 4단계: ‘단순 연쇄(Simple Chain)’ 모델로 종속성 설명하기

- 1. 조건부 확률 계산

In [30]:
def calculate_conditional_probability(text):
    count_target = 0
    count_target_next = 0
    vowels = 'aeiou'

    for i in range(len(text)):
        if text[i] in vowels:
            count_target += 1
            if i + 1 < len(text) and text[i+1] in vowels:  # 범위 체크 추가
                count_target_next += 1

    return count_target_next / count_target

def calculate_conditional_probability_consonant(text):
    count_target = 0
    count_target_next = 0
    consonants = 'bcdfghjklmnpqrstvwxyz'
    vowels = 'aeiou'

    for i in range(len(text)):
        if text[i] in consonants:
            count_target += 1
            if i + 1 < len(text) and text[i+1] in vowels:  # 범위 체크 추가
                count_target_next += 1

    return count_target_next / count_target

print(calculate_conditional_probability(splitted_text[0]))
print(calculate_conditional_probability_consonant(splitted_text[0]))

0.11764705882352941
0.3793103448275862
