<a href="https://colab.research.google.com/github/jumbokh/nknu-class/blob/main/NLP/notebooks/Ch16_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups
# Restrict the corpus to four newsgroups so the examples stay small.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the train and test subsets with identical shuffle settings.
train = fetch_20newsgroups(subset='train', categories=categories,
                           shuffle=True, random_state=42)
test = fetch_20newsgroups(subset='test', categories=categories,
                          shuffle=True, random_state=42)

# Show only the first ten lines of the dataset description.
print(*train['DESCR'].split('\n')[:10], sep='\n')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.


In [2]:
# Category names stored on the loaded training subset.
train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
from IPython.display import display
# Pair every post with its integer label in a two-column frame.
df_train = pd.DataFrame({
    'content': train['data'],
    'target': train['target'],
})
# Preview the first rows, then the number of posts per class.
display(df_train.head())
display(df_train['target'].value_counts())

Unnamed: 0,content,target
0,From: sd345@city.ac.uk (Michael Collier)\nSubj...,1
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\...,1
2,From: djohnson@cs.ucsd.edu (Darin Johnson)\nSu...,3
3,From: s0612596@let.rug.nl (M.M. Zwart)\nSubjec...,3
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3


3    599
2    594
1    584
0    480
Name: target, dtype: int64

In [4]:
# Label of the first post, followed by its full raw text.
print(f'檢視第一筆資料的類別為：{df_train.at[0, "target"]}')
print(df_train.at[0, 'content'])

檢視第一筆資料的類別為：1
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [5]:
# Raw documents are the features; integer category ids are the labels.
X_train, y_train = train['data'], train['target']
X_test, y_test = test['data'], test['target']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words demo: count the tokens of one sentence with default settings.
string = ['He do love her and He does not loves he']
cv = CountVectorizer().fit(string)   # learn the vocabulary
bow = cv.transform(string)           # sparse document-term counts
bow.toarray()

array([[1, 1, 1, 3, 1, 1, 1, 1]])

In [7]:
# Label each count column with its vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df_bow = pd.DataFrame(bow.toarray(), columns=cv.get_feature_names_out())
df_bow

Unnamed: 0,and,do,does,he,her,love,loves,not
0,1,1,1,3,1,1,1,1


In [8]:
# Same sentence, but count only two-word sequences (bigrams).
string = ['He do love her and He does not loves he']
cv = CountVectorizer(ngram_range=(2,2))
bow = cv.fit_transform(string)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
df_bow = pd.DataFrame(bow.toarray(), columns=cv.get_feature_names_out())
df_bow

Unnamed: 0,and he,do love,does not,he do,he does,her and,love her,loves he,not loves
0,1,1,1,1,1,1,1,1,1


In [9]:
# Same sentence with the built-in English stop-word list applied.
string = ['He do love her and He does not loves he']
cv = CountVectorizer(stop_words='english')
bow = cv.fit_transform(string)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
df_bow = pd.DataFrame(bow.toarray(), columns=cv.get_feature_names_out())
df_bow

Unnamed: 0,does,love,loves
0,1,1,1


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Two toy documents vectorised with IDF weighting switched off.
string = ['dog love loves',
         'pig love loves pig']
cv = TfidfVectorizer(use_idf=False, stop_words='english')
bow = cv.fit_transform(string)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
df_bow = pd.DataFrame(bow.toarray(), columns=cv.get_feature_names_out())
df_bow

Unnamed: 0,dog,love,loves,pig
0,0.57735,0.57735,0.57735,0.0
1,0.0,0.408248,0.408248,0.816497


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same two toy documents, now with the default TF-IDF weighting.
string = ['dog love loves',
         'pig love loves pig']
cv = TfidfVectorizer(stop_words='english')
bow = cv.fit_transform(string)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
df_bow = pd.DataFrame(bow.toarray(), columns=cv.get_feature_names_out())
df_bow

Unnamed: 0,dog,love,loves,pig
0,0.704909,0.501549,0.501549,0.0
1,0.0,0.3178,0.3178,0.893312


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [13]:
# Compare four classifiers on top of a shared bag-of-words step.
# The 'model' grid entry swaps out the whole final pipeline step.
model_pl = Pipeline([
    ('preprocess', CountVectorizer(stop_words='english')),
    ('model', LogisticRegression())
])
param_grid = {
    'model':[LogisticRegression(), SVC(), 
              KNeighborsClassifier(), MultinomialNB()]
}
gs = GridSearchCV(model_pl, param_grid=param_grid,
                  cv=5, return_train_score=True)
gs.fit(X_train, y_train)

# Evaluate the best pipeline found by cross-validation on the held-out test set.
score = gs.best_estimator_.score(X_test, y_test)
print('最佳預測參數', gs.best_params_)
# Built-in round() works for both NumPy scalars and plain Python floats,
# whereas the .round() method exists only on NumPy scalars.
print('訓練集交叉驗證的最佳結果', round(gs.best_score_, 3))
print('測試集的結果', round(score, 3))
y_pred = gs.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

最佳預測參數 {'model': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)}
訓練集交叉驗證的最佳結果 0.978
測試集的結果 0.942
[[289   3   5  22]
 [  5 376   6   2]
 [ 11  13 366   6]
 [  5   4   5 384]]
綜合報告
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       319
           1       0.95      0.97      0.96       389
           2       0.96      0.92      0.94       396
           3       0.93      0.96      0.95       398

    accuracy                           0.94      1502
   macro avg       0.94      0.94      0.94      1502
weighted avg       0.94      0.94      0.94      1502



In [14]:
# Search jointly over the vectoriser (raw counts, TF-IDF, TF without IDF)
# and the classifier; each grid entry replaces a whole pipeline step.
model_pl = Pipeline([
    ('preprocess', CountVectorizer(stop_words='english')),
    ('model', LogisticRegression())
])
param_grid = {
    'preprocess':[CountVectorizer(stop_words='english'),
                 TfidfVectorizer(stop_words='english'),
                 TfidfVectorizer(use_idf=False, stop_words='english')],
    'model':[LogisticRegression(), SVC(), 
              KNeighborsClassifier(), MultinomialNB()]
}
gs = GridSearchCV(model_pl, param_grid=param_grid,
                  cv=5, return_train_score=True)
gs.fit(X_train, y_train)

# Evaluate the best pipeline found by cross-validation on the held-out test set.
score = gs.best_estimator_.score(X_test, y_test)
print('最佳預測參數', gs.best_params_)
# Built-in round() works for both NumPy scalars and plain Python floats,
# whereas the .round() method exists only on NumPy scalars.
print('訓練集交叉驗證的最佳結果', round(gs.best_score_, 3))
print('測試集的結果', round(score, 3))
y_pred = gs.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

最佳預測參數 {'model': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'preprocess': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)}
訓練集交叉驗證的最佳結果 0.978
測試集的結果 0.942
[[289   3   5  22]
 [  5 376   6   2]
 [ 11  13 366   6]
 [  5   4   5 384]]
綜合報告
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       319
           1       0.95      0.97      0.96       389
           2       0.96      0.92      0.94       396
           3       0.93      0.96      0.95       398

    accuracy                           0.94      1502
   macro avg       0.94      0.94      0.94     

In [15]:
from sklearn.decomposition import LatentDirichletAllocation

# Document-term count matrix over the training posts.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(train['data'])

# Seed NumPy's global RNG right before fitting the four-topic model.
np.random.seed(42)
lda = LatentDirichletAllocation(n_components=4)
X_topics = lda.fit_transform(X)

# One row of term weights per topic.
lda.components_.shape

(4, 35482)

In [16]:
# Eight highest-weighted terms of the first topic.
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
pd.DataFrame(lda.components_[0], index=cv.get_feature_names_out(),
            columns=['topic']).sort_values(by='topic', ascending=False)[:8]

Unnamed: 0,topic
god,1335.641348
edu,854.573218
subject,562.693718
lines,512.312208
organization,479.429353
people,436.046642
does,387.063643
church,381.461509


In [17]:
n_topics = 4   # number of LDA topics to list
n_words = 10   # top terms shown per topic

# The vocabulary is identical for every topic, so fetch it once.
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
feature_names = cv.get_feature_names_out()

# Map each topic label to its n_words highest-weighted terms.
words = {}
for topic in range(n_topics):
    word = pd.DataFrame(lda.components_[topic], index=feature_names).\
            sort_values(by=0, ascending=False)[:n_words].index.tolist()
    words[f'主題{topic+1}'] = word
pd.DataFrame(words)

Unnamed: 0,主題1,主題2,主題3,主題4
0,god,edu,edu,edu
1,edu,god,lines,com
2,subject,people,subject,subject
3,lines,subject,organization,lines
4,organization,lines,image,organization
5,people,com,graphics,writes
6,does,organization,com,article
7,church,don,use,posting
8,think,writes,university,keith
9,writes,think,pitt,nntp


In [18]:
# These three lines extend the built-in English stop-word list with extra terms
from sklearn.feature_extraction import text
extra_words = ['edu','subject','lines','com']
# Recent scikit-learn validates stop_words as 'english', a list, or None,
# so the frozenset union is converted to a list.
stop_words = list(text.ENGLISH_STOP_WORDS.union(extra_words))

# Seed before fitting, as the other LDA cells do, so a re-run is repeatable.
np.random.seed(42)
lda = LatentDirichletAllocation(n_components=4)
cv = CountVectorizer(stop_words=stop_words)
bow = cv.fit_transform(train['data'])
X_topics = lda.fit_transform(bow)

# n_topics and n_words come from the earlier top-words cell.
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
feature_names = cv.get_feature_names_out()
words = {}
for topic in range(n_topics):
    word = pd.DataFrame(lda.components_[topic], index=feature_names).\
            sort_values(by=0, ascending=False)[:n_words].index.tolist()
    words[f'主題{topic+1}'] = word
pd.DataFrame(words)

Unnamed: 0,主題1,主題2,主題3,主題4
0,organization,god,organization,image
1,article,people,writes,graphics
2,cs,organization,keith,organization
3,writes,think,article,file
4,pitt,jesus,sgi,university
5,msg,don,caltech,posting
6,gordon,writes,posting,host
7,banks,does,host,software
8,science,believe,nntp,files
9,geb,just,university,nntp


In [19]:
np.random.seed(42)
# Vectoriser and topic model chained in one pipeline. max_df=0.4 ignores
# terms whose document frequency exceeds 40% (see the CountVectorizer docs).
model_pl = make_pipeline(
    CountVectorizer(stop_words='english', max_df=0.4),
    LatentDirichletAllocation(n_components=4)
)
X_topics = model_pl.fit_transform(train['data'])
# make_pipeline names each step after its lower-cased class name.
lda = model_pl.named_steps['latentdirichletallocation']
cv = model_pl.named_steps['countvectorizer']

# n_topics and n_words come from the earlier top-words cell.
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
feature_names = cv.get_feature_names_out()
words = {}
for topic in range(n_topics):
    word = pd.DataFrame(lda.components_[topic], index=feature_names).\
            sort_values(by=0, ascending=False)[:n_words].index.tolist()
    words[f'主題{topic+1}'] = word
pd.DataFrame(words)

Unnamed: 0,主題1,主題2,主題3,主題4
0,god,graphics,god,people
1,jesus,image,people,don
2,people,university,don,msg
3,christ,cs,think,like
4,know,posting,does,think
5,christian,host,believe,know
6,think,computer,just,just
7,bible,nntp,atheists,time
8,just,file,say,health
9,like,software,know,use


In [20]:
from IPython.display import display
print('每篇文章在不同主題的機率分布：')
# One column per topic, sized from the fitted matrix rather than a fixed 1..4.
topic_cols = [f'主題{i}' for i in range(1, X_topics.shape[1] + 1)]
df_class = pd.DataFrame(X_topics, columns=topic_cols)
# Most likely topic per document, computed over the numeric columns only.
df_class['最有可能的主題'] = df_class[topic_cols].idxmax(axis=1)
# Highlight only the numeric topic columns: the label column holds strings,
# and a row-wise max over mixed str/float values raises on recent pandas.
display(df_class.head().style.highlight_max(axis=1, subset=topic_cols))

print('第一篇文章')
print(X_train[0])

每篇文章在不同主題的機率分布：


Unnamed: 0,主題1,主題2,主題3,主題4,最有可能的主題
0,0.397524,0.595319,0.00362,0.003536,主題2
1,0.242089,0.751777,0.003071,0.003063,主題2
2,0.001151,0.00114,0.05778,0.939929,主題4
3,0.005034,0.218172,0.005176,0.771618,主題4
4,0.991181,0.002937,0.002915,0.002967,主題1


第一篇文章
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.

