# 텍스트 전처리(영어)

## 소문자화

In [None]:
# Lower-case every spelling variant so they all collapse to one form.
texts = ["South Korea", "south korea", "South Korea", "South korea"]
lower_words = list(map(str.lower, texts))
lower_words

## 어간 추출

In [None]:
import nltk
import pandas as pd
from nltk.stem import PorterStemmer

# Create the Porter stemmer instance reused by the stemming examples that follow.
porter_stemmer=PorterStemmer()

In [None]:
# Example: inflected forms of "connect".
words = ["connect", "connected", "connection", "connections", "connects"]
stemmed_words = list(map(porter_stemmer.stem, words))

# Show each original word next to its stem.
stemdf = pd.DataFrame(
    {
        'original_word': words,
        'stemmed_word': stemmed_words,
    }
)
stemdf

In [None]:
# Example: inflected and derived forms of "trouble".
# The last entry is spelled "troublesome" so the stemmer is shown a real
# English word, matching the spelling used in the lemmatization example.
words = ["trouble", "troubled", "troubles", "troublesome"]
stemmed_words = [porter_stemmer.stem(word=word) for word in words]

# Show each original word next to its stem.
stemdf = pd.DataFrame({'original_word': words, 'stemmed_word': stemmed_words})
stemdf

## 표제어 복원

In [None]:
from nltk.stem import WordNetLemmatizer
# Download the WordNet corpus before the lemmatizer is used.
nltk.download('wordnet')

# Create the lemmatizer instance used by the lemmatization examples.
lemmatizer = WordNetLemmatizer()


In [None]:
# Lemmatize variants of "trouble" and "run", treating every word as a verb
# (pos='v'). "running" must be spelled correctly: a misspelled form cannot be
# looked up and would come back unchanged.
words = [
    "trouble", "troubling", "troubled", "troubles", "troublesome",
    "run", "ran", "running",
]
lemmatized_words = [lemmatizer.lemmatize(word=word, pos='v') for word in words]
lemmatizeddf = pd.DataFrame({'original_word': words, 'lemmatized_word': lemmatized_words})
# Select the columns explicitly so the display order is fixed.
lemmatizeddf = lemmatizeddf[['original_word', 'lemmatized_word']]
lemmatizeddf

In [None]:
# Example: lemmatizing "goose" and its irregular plural "geese" as nouns.
words = ["goose", "geese"]
lemmatized_words = [lemmatizer.lemmatize(w, pos='n') for w in words]

# Build the comparison table with the columns in a fixed order.
lemmatizeddf = pd.DataFrame(
    {'original_word': words, 'lemmatized_word': lemmatized_words},
    columns=['original_word', 'lemmatized_word'],
)
lemmatizeddf

## 불용어 제거

In [None]:
# Hand-picked stop-word list and the sample sentence it is applied to.
stopwords=['this','that','and','a','we','it','to','is','of','up','need'] # Caution: stop words may carry important information, so choose them carefully.
text="this is a text full of content and we need to clean it up"

In [None]:
# Tokenize the sentence on single spaces.
words = text.split(" ")

# Mask every stop word with the placeholder "W"; keep all other tokens as-is.
shortlisted_words = []
for w in words:
    shortlisted_words.append("W" if w in stopwords else w)

# Print the sentence before and after masking.
print("original sentence = ", text)
print("sentence with stop words removed= ", ' '.join(shortlisted_words))

## 노이즈 제거


In [None]:
# Imports for the noise-removal section; `re` is used by the regex-based cleaning function.
import nltk
import pandas as pd
import re
from nltk.stem import PorterStemmer

# Porter stemmer instance used by the cells in this section.
porter_stemmer=PorterStemmer()


In [None]:
# 노이즈 섞인 단어 처리
raw_words=["..trouble..","trouble<","trouble!","<a>trouble</a>",'1.trouble']
stemmed_words=[porter_stemmer.stem(word=word) for word in raw_words]
stemdf= pd.DataFrame({'raw_word': raw_words,'stemmed_word': stemmed_words})
stemdf

In [None]:
def scrub_words(text):
    """Strip markup and non-word noise from a piece of text.

    Tags of the form ``<...>`` are deleted, every remaining non-word
    character or digit is replaced by a space, and leading and trailing
    whitespace is trimmed from the result.
    """
    # Drop HTML-style tags (non-greedy, so each tag is matched on its own).
    without_tags = re.sub("(<.*?>)", "", text)

    # Replace every non-word character and every digit with a space.
    spaced = re.sub("(\\W|\\d)", " ", without_tags)

    # Trim the whitespace left at both ends.
    return spaced.strip()

In [None]:
# 노이즈 제거된 단어를 추가적으로 stemming
cleaned_words=[scrub_words(w) for w in raw_words]
cleaned_stemmed_words=[porter_stemmer.stem(word=word) for word in cleaned_words]
stemdf= pd.DataFrame({'raw_word': raw_words,'cleaned_word':cleaned_words,'stemmed_word': cleaned_stemmed_words})
stemdf=stemdf[['raw_word','cleaned_word','stemmed_word']]
stemdf