<a href="https://colab.research.google.com/github/ich-20211101/domo/blob/main/Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Analysis

In [11]:
!pip install nltk



In [12]:
# Tokenization
import nltk
# nltk.download('all')  # 모든 패키지 다운로드

text = "This is a Big Data course in CCTB."
tokens = nltk.word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [14]:
paragraph_text = "Tis is a Big Data course in CCTB. This is our second semester. We are loving it."
sent_tokens = nltk.sent_tokenize(paragraph_text)
print(sent_tokens)

['Tis is a Big Data course in CCTB.', 'This is our second semester.', 'We are loving it.']


In [15]:
# Counter Tokens
from collections import Counter

word_counts = Counter(tokens)
print(word_counts)

Counter({'This': 1, 'is': 1, 'a': 1, 'Big': 1, 'Data': 1, 'course': 1, 'in': 1, 'CCTB': 1, '.': 1})


In [16]:
# StopWords (The, an , or ,etc.)

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

['Big', 'Data', 'course', 'CCTB', '.']


In [20]:
print(stopwords.words('english'))
print(stopwords.words('spanish'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [22]:
# Stemming and Lemitizing (Back to root form)

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
print("\nstemmed_words", stemmed_words)

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print("lemmatized_words", lemmatized_words)


stemmed_words ['thi', 'is', 'a', 'big', 'data', 'cours', 'in', 'cctb', '.']
lemmatized_words ['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [23]:
print(stemmer.stem('delicious'))
print(lemmatizer.lemmatize('delicious'))

delici
delicious


In [29]:
# Sentiment Analysis

from nltk.sentiment import SentimentIntensityAnalyzer

text = "I love this course! This is very exciting and amazing!"
sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

# neg: 부정적 감정
# neu: 중립적 감정
# pos: 긍정적 감정
# compound: 전체적 감성 점수 (1: 긍정적, -1: 부정적)

# compound > 0.5: positive
# compound < -0.5: negative
# -0.5 <= compound <= 0.5: neutral

{'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'compound': 0.9237}


In [28]:
from nltk.sentiment import SentimentIntensityAnalyzer

text = "I Hate this course! Its very hard and confusing"
sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.611, 'neu': 0.389, 'pos': 0.0, 'compound': -0.7813}


In [30]:
# Text Classification

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Training data
documents = [
    ("I love this course", "positive"),
    ("I hate this program", "negative"),
    ("This was an awesome movie", "positive"),
    ("The course was terrible", "negative")
]

# Prepare features and labels
vectorizer = CountVectorizer()
features = vectorizer.fit_transform([doc[0] for doc in documents])
labels = [doc[1] for doc in documents]

# Train a classifier (Naive Bayes)
classifier = MultinomialNB()
classifier.fit(features, labels)

# Test with a new example
new_example = vectorizer.transform(["I really enjoyed watching this film"])
prediction = classifier.predict(new_example)
print(prediction)

['positive']


In [34]:
# Class12_Participation_Exercise

# 1. Take a paragraph
paragraph = "While I have a few people that I speak to on the phone regularly, most people I consulted view an unbidden phone call as hostile. They assume there’s an emergency if they get a call from someone with whom they don’t have a regular phone relationship."

# 2. Tokenize into sentences and words
paragraph_tokens = nltk.word_tokenize(paragraph)
print(paragraph_tokens)

# 3. Remove stop words
stop_words = set(stopwords.words('english'))
filtered_paragraph_tokens = [word for word in paragraph_tokens if word.lower() not in stop_words]
print(filtered_paragraph_tokens)

# 4. Perform stemming and Lemmatization, and print the outputs of both.
stemmed_paragraph_words = [stemmer.stem(word) for word in paragraph_tokens]
print(stemmed_paragraph_words)

lemmatized_paragraph_words = [lemmatizer.lemmatize(word) for word in paragraph_tokens]
print(lemmatized_paragraph_words)

['While', 'I', 'have', 'a', 'few', 'people', 'that', 'I', 'speak', 'to', 'on', 'the', 'phone', 'regularly', ',', 'most', 'people', 'I', 'consulted', 'view', 'an', 'unbidden', 'phone', 'call', 'as', 'hostile', '.', 'They', 'assume', 'there', '’', 's', 'an', 'emergency', 'if', 'they', 'get', 'a', 'call', 'from', 'someone', 'with', 'whom', 'they', 'don', '’', 't', 'have', 'a', 'regular', 'phone', 'relationship', '.']
['people', 'speak', 'phone', 'regularly', ',', 'people', 'consulted', 'view', 'unbidden', 'phone', 'call', 'hostile', '.', 'assume', '’', 'emergency', 'get', 'call', 'someone', '’', 'regular', 'phone', 'relationship', '.']
['while', 'i', 'have', 'a', 'few', 'peopl', 'that', 'i', 'speak', 'to', 'on', 'the', 'phone', 'regularli', ',', 'most', 'peopl', 'i', 'consult', 'view', 'an', 'unbidden', 'phone', 'call', 'as', 'hostil', '.', 'they', 'assum', 'there', '’', 's', 'an', 'emerg', 'if', 'they', 'get', 'a', 'call', 'from', 'someon', 'with', 'whom', 'they', 'don', '’', 't', 'have'