<a href="https://colab.research.google.com/github/joony0512/HTML_CSS/blob/main/Word_Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re

try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile

# WordprocessingML namespace and the fully-qualified tag names used below.
NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = NAMESPACE + 'p'
TEXT = NAMESPACE + 't'

def get_docx_text(filename):
    """Return the plain text of a .docx file, one paragraph per chunk.

    Args:
        filename: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The text of every non-empty paragraph in ``word/document.xml``,
        with paragraphs separated by a blank line (``'\\n\\n'``).

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(filename) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    for paragraph in tree.iter(PARA):
        # A paragraph's text is spread over several <w:t> runs; glue them.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
    
def extract_text_to_dataframe(filename):
    """Load a .docx file into a DataFrame with one row per paragraph.

    The document text comes from ``get_docx_text``; it is cut into
    paragraphs on blank lines, and each paragraph is further split on
    single newlines so that every line lands in its own column.
    """
    full_text = get_docx_text(filename)
    rows = [chunk.split('\n') for chunk in full_text.split('\n\n')]
    return pd.DataFrame(rows)

def extract_location(df):
    """Return the location text found in rows mentioning '소재지'.

    Rows are selected when any cell contains the keyword '소재지'. From the
    first column of each selected row, the first run of Hangul characters
    and whitespace is taken; pieces that are exactly '소재지' are dropped
    and the rest are joined with single spaces.

    Args:
        df: DataFrame whose cells hold the document text.

    Returns:
        The joined location string ('' when nothing qualifies).
    """
    keyword_rows = df.astype(str).apply(
        lambda col: col.str.contains('소재지', na=False)
    ).any(axis=1)
    first_column = df[keyword_rows].iloc[:, 0]
    # Coerce to str so None/non-string cells do not break the .str accessor,
    # and take the single capture group as a Series.
    matches = first_column.astype(str).str.extract(r'([가-힣\s]+)', expand=False)
    # Cells without any Hangul/whitespace run come back as NaN; skip them
    # instead of calling .strip() on a float.
    pieces = [m.strip() for m in matches.dropna() if m.strip() != '소재지']
    return ' '.join(pieces)

# Word document to parse; used for both the DataFrame and the raw text.
DOCX_PATH = '/content/drive/MyDrive/노동연구원/IR_KOR.docx'


def _first_capture(pattern, source):
    """Return group 1 of the first match of *pattern* in *source*, else None."""
    found = re.search(pattern, source)
    return found.group(1) if found else None


# Build the paragraph DataFrame from the document.
extracted_text = extract_text_to_dataframe(DOCX_PATH)

# Pull the location text out of the DataFrame.
location = extract_location(extracted_text)

# Strip the field labels and every space from the location string.
location = location.replace('소재지', '')
location = location.replace('설립일', '')
location = location.replace(' ', '')

text = get_docx_text(DOCX_PATH)

# Representative: name and four-digit birth year.
# NOTE: `founder` holds the birth year; `founder_name` holds the name.
founder_pattern = r"대표자 \(생년\)\s*(\S+)\s*\((\d{4})년\)"
founder_match = re.search(founder_pattern, text)
if founder_match:
    founder_name = founder_match.group(1)
    founder = founder_match.group(2)
else:
    founder_name = None
    founder = None

# Every field below is None when its pattern is absent, so the summary
# print at the end never hits an unbound name or prints an empty list.

# Commercialization stage (the letters following "시리즈").
business_stage = _first_capture(r"시리즈\s*([A-Za-z]+)", text)

# Follow-up investment amount.
follow_up_investment = _first_capture(r"후속투자 ([\d.]+)억원", text)

# Job creation: headcount at selection time.
job_creation_initial = _first_capture(r"고용창출 \(선정당시\) (\d+)명", text)

# Job creation: year and headcount of the latest figure.
job_creation_latest_year = _first_capture(
    r"고용창출 \(선정당시\) \d+명 -> \('(\d+)\) \d+명", text
)
job_creation_latest = _first_capture(
    r"고용창출 \(선정당시\) \d+명 -> \('\d+\) (\d+)명", text
)

# Papers and patents: each match is (label, count); keep the first count.
paper_patent_count = re.findall(r"논문 및 특허 \((\w+)\) (\d+)건", text)
paper_count = paper_patent_count[0][1] if paper_patent_count else None

# Revenue for '20.
revenue_2020 = _first_capture(r"매출 \(‘20\) ([\d.]+)억원", text)

# Print the extracted fields.
print("소재지:", location)
print("대표자 (생년):", founder)
print("사업화 단계:", business_stage)
print("후속투자:", follow_up_investment)
print("고용창출 (선정당시):", job_creation_initial, "명")
print("고용창출 (최신):", job_creation_latest_year,'년',job_creation_latest, "명")
print("논문 및 특허:", paper_count, "건")
print("매출 (‘20):", revenue_2020, "억원")


소재지: 경기화성시
대표자 (생년): 1973
사업화 단계: B
후속투자: 120.2
고용창출 (선정당시): 5 명
고용창출 (최신): 20 년 42 명
논문 및 특허: 29 건
매출 (‘20): 48 억원
