### Text Mining Practice

In [1]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
print(movie_reviews.fileids()[:5])

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\yhj59\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt']


In [3]:
fileids = movie_reviews.fileids()
first_id = fileids[0]

raw_text = movie_reviews.raw(first_id)
print(raw_text[:500], "...")


plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt ...


## Tokenization

In [9]:
words = movie_reviews.words(first_id)
print(words[:20])

tokens = [w.lower() for w in words if w.isalpha()]
print(tokens[:20])


['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']
['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink', 'and', 'then', 'drive', 'they', 'get', 'into', 'an', 'accident', 'one', 'of']


### Bag-of-Words + Naïve Bayes

In [11]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

word_features = list(all_words)[:2000]


In [16]:
def document_features(document_words):
    document_words = set(document_words)
    features = {}
    for wf in word_features:
        features[f"contains({wf})"] = (wf in document_words)
    return features

In [19]:
documents = [
    (movie_reviews.words(fid), category)
    for category in movie_reviews.categories()
    for fid in movie_reviews.fileids(category)
]

In [20]:
import random
random.shuffle(documents)
featuresets = [(document_features(d), c) for (d,c) in documents]

train_set, test_set = featuresets[100:], featuresets[:100]

from nltk import NaiveBayesClassifier, classify
classifier = NaiveBayesClassifier.train(train_set)

accuracy = classify.accuracy(classifier, test_set)
print(f"Test accuracy: {accuracy:.2%}")

classifier.show_most_informative_features(10)


Test accuracy: 79.00%
Most Informative Features
        contains(seagal) = True              neg : pos    =     12.2 : 1.0
   contains(outstanding) = True              pos : neg    =     11.0 : 1.0
         contains(mulan) = True              pos : neg    =      9.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
         contains(damon) = True              pos : neg    =      6.0 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
         contains(waste) = True              neg : pos    =      5.4 : 1.0
          contains(lame) = True              neg : pos    =      5.3 : 1.0
        contains(wasted) = True              neg : pos    =      5.2 : 1.0
         contains(awful) = True              neg : pos    =      5.1 : 1.0


### TF–IDF + Pipeline

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

texts = [" ".join(movie_reviews.words(fid)) for fid in movie_reviews.fileids()]
labels = [movie_reviews.categories(fid)[0] for fid in movie_reviews.fileids()]

x_train, x_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        token_pattern=r"\b[a-zA-Z]+\b",
        stop_words="english",
        max_df=0.8,
        min_df=5)),
    ("clf", MultinomialNB())
])

pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

print("Accuracy:", f"{metrics.accuracy_score(y_test, y_pred):.2%}")
print("\nConfision Matrix")
print(metrics.confusion_matrix(y_test, y_pred, labels=["neg", "pos"]))
print("\nClassification Report")
print(metrics.classification_report(y_test, y_pred, digits=2))


Accuracy: 81.25%

Confision Matrix
[[169  30]
 [ 45 156]]

Classification Report
              precision    recall  f1-score   support

         neg       0.79      0.85      0.82       199
         pos       0.84      0.78      0.81       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

