In [1]:
# Count Vectors + RidgeClassifier

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_csv('E:/Dataset/新闻文本分类/train_set.csv', sep='\t', nrows=15000)

In [4]:
# try bag of word
# bag of word进行编码
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(max_features=10)
bag_word = vectorizer.fit_transform(train_df['text'])
bag_word.toarray()

array([[ 2,  2, 64, ..., 24,  8, 30],
       [ 2,  2, 25, ..., 11,  4, 14],
       [ 0,  0, 27, ..., 14,  6,  7],
       ...,
       [ 0,  0, 23, ..., 14,  4, 12],
       [ 3,  1, 15, ...,  8,  5,  8],
       [ 0,  2,  7, ..., 11,  0,  9]], dtype=int64)

In [6]:
# 词袋 + 岭回归进行预测
from sklearn.linear_model import RidgeClassifier

rc = RidgeClassifier()
rc.fit(bag_word[0:10000], train_df.label[0:10000])
result = rc.predict(bag_word[10001:])

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print(accuracy_score(train_df.label[10001:], result))
print(precision_score(train_df.label[10001:], result, average='micro'))
print(recall_score(train_df.label[10001:], result, average='micro'))
f1_score(train_df.label[10001:], result, average='micro')

In [22]:
# TF-IDF + 岭回归进行预测
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

0.3056611322264453
0.3056611322264453
0.3056611322264453


0.3056611322264453

In [18]:
# 测试多分类
import numpy as np

y_true = np.array([-1]*30 + [0]*240 + [1]*30 + [2]*30)
y_pred = np.array([-1]*10 + [0]*10 + [1]*10 + 
                  [-1]*40 + [0]*160 + [1]*40 + 
                  [-1]*5 + [0]*5 + [1]*20 + [2]*30)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

In [None]:
# 测试词袋模型
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'hello hello hello hello',
    'didi',
    'sun sun',
    'moon moon moon',
    'ace'
]
vectorizer = CountVectorizer(max_features=5)
words = vectorizer.fit_transform(corpus)
print(vectorizer.vocabulary_)
words.toarray() # 这句话中有几个单词，分别在文章中出现了几次

In [49]:
# TF-IDF测试
# wordCount/wordTotal
#  文章总数/包含该词的文档数
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'didi didi moon',
    'didi hello'
]

tfidf = TfidfVectorizer()
# if not smooth is False  /  idf(t) = log [ n / (df(t) + 1) ])
# 文章1：didi: tf-2/3 idf-log(2/2)+1
# if smooth is True, then need norm 2  /  idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1
# 文章1：didi: tf-2/3 idf-log(3/3)+1

# then norm2
print(tfidf.fit_transform(corpus).toarray())
print(tfidf.vocabulary_)
tfidf.get_feature_names()

[[0.81818021 0.         0.57496187]
 [0.57973867 0.81480247 0.        ]]
{'didi': 0, 'moon': 2, 'hello': 1}


['didi', 'hello', 'moon']

In [52]:
# didi
import math
tf_didi_1 = 2/3 * (np.log(3/3)+1)
tf_moon_1 = 1/3 * (np.log(3/2)+1)

norm = np.sqrt(tf_didi_1**2 + tf_moon_1**2)

[tf_didi_1, tf_moon_1] / norm

array([0.81818021, 0.57496187])

In [13]:
# IDF 常规的的实现方法
import numpy as np # 数值计算、矩阵运算、向量运算
import pandas as pd # 数值分析、科学计算

# 定义文档
docA = 'The cat sat on my bed'
docB = 'The dog sat on my knees'

# 切割文档
bowA = docA.split(' ')
bowB = docB.split(' ')
# bowA # ['The', 'cat', 'sat', 'on', 'my', 'bed']
# bowB # ['The', 'dog', 'sat', 'on', 'my', 'knees']

# 构建词库
wordSet = set(bowA).union(set(bowB))
# wordSet # {'The', 'bed', 'cat', 'dog', 'knees', 'my', 'on', 'sat'}

# 用字典来保存词出现的次数
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)

# 遍历文档，统计词数
# 针对每篇文章，也就是有几个向量
for word in bowA:  
    wordDictA[word] += 1  # 构建bag of words
for word in bowB:
    wordDictB[word] += 1



In [16]:
def computeTF(wordDict, bow):
    # 用一个字典对象保存 TF，把所有对应于 bow 文档里的 TF都计算出来
    tfDict = {}
    nbowCount = len(bow)  # 文章中词的个数
    print(nbowCount)

    for word, count in wordDict.items():
        tfDict[word] = count / nbowCount
    return tfDict
tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)

6
6


In [24]:
def computeIDF(wordDictList):
    print(wordDictList)  # 利用之前构建的bag of words
    # 用一个字典对象保存 IDF，每个词作为 key，初始值为 0
    idfDict = dict.fromkeys(wordDictList[0], 0)
    # 总文档数量
    N = len(wordDictList)
    import math

    for wordDict in wordDictList:  # 统计所有文章中，单词的个数
        # 遍历字典中的每个词汇，统计 Ni
        for word, count in wordDict.items():
            if count > 0 :
                # 先把 Ni 增加 1，存入到 idfDict 中
                idfDict[word] += count

    # 已经得到所有词汇 i 对应的 Ni，现在根据公式把它替换成 idf 值
    for word, Ni in idfDict.items():
        idfDict[word] = math.log10((N + 1)/(Ni + 1))
    return idfDict

# 测试
idfs = computeIDF([wordDictA, wordDictB])
idfs

[{'The': 1, 'dog': 0, 'knees': 0, 'sat': 1, 'cat': 1, 'bed': 1, 'on': 1, 'my': 1}, {'The': 1, 'dog': 1, 'knees': 1, 'sat': 1, 'cat': 0, 'bed': 0, 'on': 1, 'my': 1}]


{'The': 0.0,
 'dog': 0.17609125905568124,
 'knees': 0.17609125905568124,
 'sat': 0.0,
 'cat': 0.17609125905568124,
 'bed': 0.17609125905568124,
 'on': 0.0,
 'my': 0.0}