<a href="https://colab.research.google.com/github/hoonZeee/textminer-pro/blob/main/textminer_pro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# textminer-pro: 텍스트 전처리 & 분석 패키지
## 이지훈

이 프로젝트는 텍스트 데이터를 보다 효과적으로 처리하고 분석하기 위해 만든 Python 기반 경량 패키지입니다.  
NLTK, scikit-learn, Sumy, langdetect 등을 활용하여 다음 기능을 제공합니다:

- 불용어 제거 (`remove_stopwords`)
- 키워드 추출 (`extract_keywords`)
- 텍스트 요약 (`summarize_text`)
- 언어 감지 (`detect_language`)



## 디렉터리 생성

In [1]:
# Create the package, test and CI workflow directories in one call.
!mkdir -p textminer_pro/textminer textminer_pro/tests textminer_pro/.github/workflows


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells that follow.
%pip install langdetect sumy twine

## cleaner.py

In [3]:
%%writefile textminer_pro/textminer/cleaner.py
import nltk
import os

# Extra directory in which NLTK should look for corpora and tokenizer data.
# The location is Colab-specific ("/content"). This module only registers the
# path; it does not download any NLTK data itself.
NLTK_PATH = "/content/nltk_data"
nltk.data.path.append(NLTK_PATH)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

def remove_stopwords(text: str) -> str:
    """Return ``text`` with English stop words dropped.

    The input is split with NLTK's ``word_tokenize``. Every token whose
    lower-cased form is in NLTK's English stop-word list is discarded and the
    remaining tokens (punctuation tokens included) are joined with single
    spaces.

    Args:
        text: The text to filter.

    Returns:
        The surviving tokens joined by one space each.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        # Compare case-insensitively but keep the token's original casing.
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def extract_keywords(text: str, top_n: int = 5) -> list:
    """Return up to ``top_n`` terms of ``text`` ranked by TF-IDF score.

    The text is vectorized as a single-document corpus with scikit-learn's
    ``TfidfVectorizer`` using its built-in English stop-word list, and the
    terms are ordered by descending score.

    Args:
        text: The text to extract keywords from.
        top_n: Maximum number of keywords to return. Values <= 0 yield ``[]``.

    Returns:
        A list of at most ``top_n`` terms, highest score first. An empty list
        is returned when ``text`` is empty/blank or contains no usable terms
        (for example only stop words).
    """
    if top_n <= 0 or not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no term is
        # left after tokenization and stop-word removal.
        return []

    scores = zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
    # sorted() is stable, so equally scored terms keep the order in which the
    # vectorizer lists them.
    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]


Writing textminer_pro/textminer/cleaner.py


## cleaner.py 테스트코드

In [4]:
%%writefile textminer_pro/tests/test_cleaner.py
import os
import sys
# Make the package importable when this file is run directly. The project root
# is resolved from this file's location instead of assuming a Colab path.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import nltk
from textminer import remove_stopwords, extract_keywords


# Must match the directory that textminer/cleaner.py adds to nltk.data.path.
NLTK_PATH = "/content/nltk_data"
os.environ["NLTK_DATA"] = NLTK_PATH
nltk.data.path.append(NLTK_PATH)


# Tokenizer models and the stop-word corpus used by remove_stopwords.
nltk.download('punkt', download_dir=NLTK_PATH)
nltk.download('punkt_tab', download_dir=NLTK_PATH)
nltk.download('stopwords', download_dir=NLTK_PATH)

text = "This is a test sentence with simple words."
print("Stopword 제거 결과:")
cleaned = remove_stopwords(text)
print(cleaned)
# Fail loudly instead of relying on someone reading the printed output.
assert cleaned == "test sentence simple words .", cleaned

text2 = "Machine learning is fun and powerful."
print("키워드 추출 결과:")
keywords = extract_keywords(text2)
print(keywords)
# All four terms occur once, so only membership (not ranking) is checked.
assert sorted(keywords) == ['fun', 'learning', 'machine', 'powerful'], keywords


Writing textminer_pro/tests/test_cleaner.py


## summarizer.py

In [5]:
%%writefile textminer_pro/textminer/summarizer.py
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

def summarize_text(text: str, num_sentences=2, language: str = "english") -> str:
    """Summarize ``text`` with sumy's LSA summarizer.

    Args:
        text: The text to summarize.
        num_sentences: Sentence count handed to the summarizer.
        language: Language name passed to sumy's ``Tokenizer``. Defaults to
            ``"english"``, the value that was previously hard-coded. Which
            languages work depends on the tokenizer resources installed --
            verify before relying on a non-English value.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when
        ``text`` is empty or blank.
    """
    # Nothing to summarize; skip tokenizer construction entirely.
    if not text or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return " ".join(str(sentence) for sentence in summary)


Writing textminer_pro/textminer/summarizer.py


## detector.py

In [6]:
%%writefile textminer_pro/textminer/detector.py
from langdetect import detect

def detect_language(text: str) -> str:
    """Detect the language of ``text`` with ``langdetect``.

    Args:
        text: The text whose language should be detected.

    Returns:
        Whatever ``langdetect.detect`` returns for ``text`` (for example
        ``"en"``), or the fixed message ``"언어를 감지할 수 없습니다"`` when the
        input is empty/blank or detection fails.
    """
    undetected = "언어를 감지할 수 없습니다"
    try:
        # Blank input cannot be classified; answer without calling detect().
        if not text or not text.strip():
            return undetected
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        return undetected


Writing textminer_pro/textminer/detector.py


## `__init__.py`

In [9]:
%%writefile textminer_pro/textminer/__init__.py
from .cleaner import remove_stopwords, extract_keywords
from .summarizer import summarize_text
from .detector import detect_language

Writing textminer_pro/textminer/__init__.py


## detector.py 테스트코드

In [7]:
%%writefile textminer_pro/tests/test_detector.py
import os
import sys
# Make the package importable when this file is run directly. The project root
# is resolved from this file's location instead of assuming a Colab path.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from langdetect import DetectorFactory
from textminer import detect_language

# langdetect is non-deterministic unless seeded; fix the seed so the
# assertions below are reproducible.
DetectorFactory.seed = 0

text1 = "This is an English sentence."
text2 = "이 문장은 한국어입니다."

lang1 = detect_language(text1)
print("English 감지 결과:", lang1)
lang2 = detect_language(text2)
print("Korean 감지 결과:", lang2)

# Fail loudly instead of relying on someone reading the printed output.
assert lang1 == "en", lang1
assert lang2 == "ko", lang2


Writing textminer_pro/tests/test_detector.py


## 테스트코드 실행

In [10]:
# Run both test scripts. test_cleaner.py also downloads the NLTK data
# (punkt, punkt_tab, stopwords) into /content/nltk_data before using it.
!python3 textminer_pro/tests/test_cleaner.py
!python3 textminer_pro/tests/test_detector.py

[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Stopword 제거 결과:
test sentence simple words .
키워드 추출 결과:
['fun', 'learning', 'machine', 'powerful']
English 감지 결과: en
Korean 감지 결과: ko


In [12]:
%%writefile textminer_pro/setup.py
from pathlib import Path

from setuptools import setup, find_packages

# Short summary; also used as the long description when README.md is absent.
DESCRIPTION = 'A simple text mining package with stopword removal, keyword extraction, summarization, and language detection.'

# Resolve the README next to this file instead of relying on the current
# working directory, read it as UTF-8 with the handle closed afterwards, and
# fall back to the short description so a missing README does not abort the build.
README_PATH = Path(__file__).resolve().parent / 'README.md'
if README_PATH.is_file():
    LONG_DESCRIPTION = README_PATH.read_text(encoding='utf-8')
else:
    LONG_DESCRIPTION = DESCRIPTION

setup(
    name='textminer-pro-jihoonLee',
    version='0.1.0',
    packages=find_packages(),
    install_requires=[
        'nltk',
        'scikit-learn',
        'sumy',
        'langdetect'
    ],
    author='Jihoon Lee',
    author_email='dlwlgns7540@naver.com',
    description=DESCRIPTION,
    long_description=LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    url='https://github.com/hoonZeee/Oss_2025/tree/main/pypi',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
    ],
    python_requires='>=3.7',
)


Writing textminer_pro/setup.py


In [None]:
# Absolute path keeps this cell re-runnable: a relative "%cd textminer_pro"
# fails on a second run because the kernel is already inside that directory.
%cd /content/textminer_pro
# Build the source distribution and the wheel into dist/.
!python setup.py sdist bdist_wheel

In [18]:
%%writefile /root/.pypirc
# Upload settings for the "pypi" index.
# NOTE(review): the password value below is a redacted placeholder, not a real
# token. Prefer supplying the token through the TWINE_PASSWORD environment
# variable instead of writing it to a file from a shared notebook.
[distutils]
index-servers =
    pypi

[pypi]
repository: https://upload.pypi.org/legacy/
username: __token__
password: 보안상 삭제합니다.


Overwriting /root/.pypirc


In [19]:
# Upload every artifact found in dist/ (the sdist and the wheel).
# Presumably authenticates with the settings written to /root/.pypirc -- confirm.
!twine upload dist/*


Uploading distributions to https://upload.pypi.org/legacy/
Uploading textminer_pro_jihoonLee-0.1.0-py3-none-any.whl
[2K[35m100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 kB[0m • [33m00:00[0m • [31m?[0m
[?25hUploading textminer_pro_jihoonlee-0.1.0.tar.gz
[2K[35m100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 kB[0m • [33m00:00[0m • [31m?[0m
[?25h
[32mView at:[0m
https://pypi.org/project/textminer-pro-jihoonLee/0.1.0/


### 최종 확인

In [None]:
# Install the release that was actually uploaded: it went to PyPI (not
# TestPyPI) under the name "textminer-pro-jihoonLee", version 0.1.0.
%pip install textminer-pro-jihoonLee==0.1.0

In [24]:
from textminer import remove_stopwords, extract_keywords, summarize_text, detect_language

text = "The OSS assignment is fun and educational, making it easier to extract key insights, summarize complex ideas, and even detect the language used!"
text2 = "OSS 과제는 재미있고 유익해서, 텍스트 요약이나 키워드 추출, 언어 감지까지 쉽게 해볼 수 있다."

# Run every public function on the English sample first, then on the Korean
# one, printing one result per line.
for sample in (text, text2):
    for run in (remove_stopwords, extract_keywords, summarize_text, detect_language):
        print(run(sample))


OSS assignment fun educational , making easier extract key insights , summarize complex ideas , even detect language used !
['assignment', 'complex', 'detect', 'easier', 'educational']
The OSS assignment is fun and educational, making it easier to extract key insights, summarize complex ideas, and even detect the language used!
en
OSS 과제는 재미있고 유익해서 , 텍스트 요약이나 키워드 추출 , 언어 감지까지 쉽게 해볼 수 있다 .
['oss', '감지까지', '과제는', '쉽게', '언어']

ko


## Git 연동

In [None]:
%cd /content

!git init
# Untrack Colab's own directories if they were ever added to the index.
# --ignore-unmatch keeps the command from failing when they are not tracked,
# which is the case in a freshly initialised repository.
!git rm -r --cached --ignore-unmatch nltk_data sample_data .config

In [None]:
%%writefile .gitignore
nltk_data/
sample_data/
.config/
__pycache__/
*.pyc
*.ipynb_checkpoints
*.egg-info/
build/
dist/


In [None]:
# 커밋과 푸시는 보안상 생략하겠습니다