In [1]:
import spacy
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from nltk.stem import PorterStemmer,LancasterStemmer
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report

# 加载数据
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
data = fetch_20newsgroups(subset='all', categories=categories, remove=remove, shuffle=True,random_state=2021)
label2name = {i: x for i, x in enumerate(data.target_names)}
name2label = {x: i for i, x in enumerate(data.target_names)}
df = pd.DataFrame({'label_name':data.target, 'label':data.target, 'text':data.data})
# print(df[:5])
df['label_name'] = df['label_name'].map(label2name)
# print(df[:5])
print(df.shape)

(3387, 3)


In [2]:
# 文本清洗
option = 'none'  # Speed: 'none'>'stem'>'lemma'
class TextCleaner(object):
    d1 = "We're having a.b.c.d or i.e. u.s.a and 16.2 and U.K. and others."
    d2 = "We buy 12 apples, good apples, from Ms. ABC at 16.2 dollars/pound 24/7 from Monday-Friday. How's that?"
    d3 = "1. I won't eat 1/12 of the #1 top cakes. I got 1.2 dollars or 1.2usd and a long-term/short-term goal"
    d4 = "I lost 22 pounds at 9:30, 11:05 or 1:30pm or 2pm or 3pm or 1-2pm on 2016-01-02 or 01-02-2016."
    d5 = "  --He's a dedicated person. \t He dedicated his time to work! \n --are you sure?"
    d6 = "I am interested in these interesting things that interested me."
    sample_docs = [d1, d2, d3, d4, d5, d6]

    def __init__(self, option='none'):
        self.option = option
        self.nlp = spacy.load('en_core_web_sm')
        self.stemmer = PorterStemmer()

    def lemmatize(self, text):
        tuples = [(str(token), token.lemma_) for token in self.nlp(text)]
        text = ' '.join([lemma if lemma != '-PRON-' else token for token, lemma in tuples])
        text = re.sub(' ([_-]) ', '\\1', text)  # 'short - term' -> 'short-term'
        return text

    def text_cleanner(self, text):
        stopwords = set(['i', 'I', 'you', 'he', 'she', 'it', 'we', 'they', 'am', 'is', 'are'])
        text = text.lower()  # 1.lower strings
        text = text.replace("'s", 's')  # 2.handle contractions
        text = re.sub('([a-zA-Z])\\.([a-zA-Z])\\.*', '\\1\\2', text)  # 3. handle abbreviations
        text = re.sub('\\d+', '', text)  # 4. handle numbers
        text = re.sub('[^a-zA-Z0-9\\s_-]', ' ', text)  # 5. remove punctuations
        if 'lemma' in self.option:
            text = self.lemmatize(text)  # 6. perform lemmatization
        elif 'stem' in self.option:
            text = ' '.join(self.stemmer.stem(x) for x in text.split(' '))  # 6. perform stemming
        text = ' '.join(x for x in text.split(' ') if
                        x not in stopwords)  # 7. remove stopwords(stopwords类型是set而不是list，因为set的查找速度更快)
        text = ' '.join(x.strip('_-') for x in text.split())  # 8. remove spaces,'_' and '-'
        return text

    def transform(self, docs):  # transformer必须要有fit()和transform(）方法
        clean_docs = []
        self.fails = []
        for doc in tqdm(docs):
            try:
                clean_docs.append(self.text_cleanner(doc))
            except:
                self.fails.append(doc)
        if len(self.fails) > 0:
            print("Some documents failed to be converted. Check self.fails for failed documents")
        return clean_docs

    def fit(self, docs, y=None):
        return self

    def fit_transform(self, docs, y=None):
        return self.fit(docs,y).transform(docs)

cleaner = TextCleaner(option=option)
# print(df['text'])
x = cleaner.transform(df['text'])
y = df['label'].astype('int16').to_list()
print(len(x), len(y))
# print(x[:5])

100%|██████████| 3387/3387 [00:00<00:00, 6288.89it/s]

3387 3387





In [3]:
# 划分训练集和数据集
X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size=0.9, random_state=2021)
print('X_train shape:', len(X_train), '\t', 'Y_train shape:', len(Y_train))
print('X_test shape:', len(X_test), '\t', 'Y_test shape:', len(Y_test))

X_train shape: 3048 	 Y_train shape: 3048
X_test shape: 339 	 Y_test shape: 339


In [4]:
# tf-idf 创建 Document-Term Matrix
vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=0.95,
    max_features=4000,
    sublinear_tf=True
)
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
print('X_train shape:', X_train.shape, '\t', 'Y_train shape', len(Y_train))
print('X_test shape:', X_test.shape, '\t', 'Y_test shape', len(Y_test))

X_train shape: (3048, 4000) 	 Y_train shape 3048
X_test shape: (339, 4000) 	 Y_test shape 339


In [14]:
# 多分类 multi-class classification； 注意和 multi-label 的区别
logreg = LogisticRegression(
    penalty='l2',                # default; {'l1','l2','elasticnet','none'}
    l1_ratio=None,               # range (0,1) used if penalty='elasticnet'
    C=1.0,                       # default; inverse of alphs - smaller valves give stronger regularization
    multi_class='ovr',   # default='multinomial'; {'auto','ovr','multinomial'} handles multi-class only
    solver='liblinear',          # default; {'liblinear','lbfgs','newton-cg','sag','saga'}
    max_iter=100,                # default;
    tol=1e-4,                    # default; tolerance for stopping criteria
)
# 'ovr' fits multiple binary classifiers while 'multinomial' fits one multi-class classifier

In [31]:
logreg.fit(X_train, Y_train)
y_prob = logreg.predict_proba(X_test)  # 先预测概率probability
print(y_prob.shape);y_prob[:5]   # 每行概率相加必得1
y_pred = logreg.predict(X_test)
print(y_pred.shape);y_pred[:5]
print(y_prob[:5].argmax(1))

print('accuracy:',accuracy_score(Y_test,y_pred).round(4))
print('precision-macro:',precision_score(Y_test,y_pred,average='macro').round(4))
print('precision-micro:',precision_score(Y_test,y_pred,average='micro').round(4))
print('precision-weighted:',precision_score(Y_test,y_pred,average='weighted').round(4))
print('recall-macro:',recall_score(Y_test,y_pred,average='macro').round(4))
print('recall-micro:',recall_score(Y_test,y_pred,average='micro').round(4))
print('recall-weighted:',recall_score(Y_test,y_pred,average='weighted').round(4))
print('f1-macro:',f1_score(Y_test,y_pred,average='macro').round(4))
print('f1-micro:',f1_score(Y_test,y_pred,average='micro').round(4))
print('f1-weighted:',f1_score(Y_test,y_pred,average='weighted').round(4))
# or
print(classification_report(Y_test,y_pred,digits=4))
pdat = pd.DataFrame(classification_report(Y_test,y_pred,digits=4,output_dict=True)).round(4).T
print(pdat)
print(pdat[:4].mean(0).round(4))

(339, 4)
(339,)
[1 1 1 1 2]
accuracy: 0.8201
precision-macro: 0.8145
precision-micro: 0.8201
precision-weighted: 0.8231
recall-macro: 0.7894
recall-micro: 0.8201
recall-weighted: 0.8201
f1-macro: 0.7909
f1-micro: 0.8201
f1-weighted: 0.8121
              precision    recall  f1-score   support

           0     0.6842    0.7324    0.7075        71
           1     0.9406    0.9694    0.9548        98
           2     0.8000    0.9412    0.8649       102
           3     0.8333    0.5147    0.6364        68

    accuracy                         0.8201       339
   macro avg     0.8145    0.7894    0.7909       339
weighted avg     0.8231    0.8201    0.8121       339

              precision  recall  f1-score   support
0                0.6842  0.7324    0.7075   71.0000
1                0.9406  0.9694    0.9548   98.0000
2                0.8000  0.9412    0.8649  102.0000
3                0.8333  0.5147    0.6364   68.0000
accuracy         0.8201  0.8201    0.8201    0.8201
macro avg    

(339, 4)


array([[0.03231944, 0.84011974, 0.08124227, 0.04631855],
       [0.229273  , 0.40270175, 0.23883123, 0.12919402],
       [0.26838048, 0.43073684, 0.09403736, 0.20684532],
       [0.08815394, 0.67874423, 0.18136425, 0.05173758],
       [0.29917276, 0.06455529, 0.43830047, 0.19797148]])