In [1]:
import pandas as pd
import os
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [2]:
def read_file(file, encode='utf-8'):
    try:
        with open(file, encoding=encode) as f:
            return f.read()
    except Exception as e:
        print(file)
        print(e)

In [3]:
def get_data(root):
    datas = []
    cats = []
    for r, dirs, files in os.walk(root):
        for f in files:
            if f.endswith('.txt'):
                file = os.path.join(r, f)
                content = read_file(file, 'gb18030')
                catog = os.path.split(r)[-1]
                datas.append(content)
                cats.append(catog)
    
    return pd.DataFrame({'data': datas, 'class': cats})

In [4]:
train_path = r'C:\Data\Study\Data Analysis\JiKe_ChenYang\code\21朴素贝叶斯\text_classification\text classification\train'
test_path = r'C:\Data\Study\Data Analysis\JiKe_ChenYang\code\21朴素贝叶斯\text_classification\text classification\test'
stopw_path = r'C:\Data\Study\Data Analysis\JiKe_ChenYang\code\21朴素贝叶斯\text_classification\text classification\stop\stopword.txt'

In [5]:
train = get_data(train_path)
test = get_data(test_path)

In [6]:
train.head()

Unnamed: 0,data,class
0,球场禁用招数“少先队员之惩戒”冠,体育
1,可以直接到编辑部买，地址，北京体育馆路8号，中国体育报业总社院内，后楼51700：羽毛球杂志...,体育
2,直播贴阿内尔卡正式加盟申花，最现场连线申花新闻官带来第一手消息，上海男篮2分惜败惨遭4连败，...,体育
3,组图：金妍儿黑丝亮相热心公益 OL套装透成熟 http:url.cn/1bNQDP (分享...,体育
4,北京23分落后末节大反扑 惜败佛山终结13连胜 http:url.cn/2nGdrq (...,体育


In [7]:
train_test = pd.concat([train, test])

In [8]:
train_test['split_word'] = train_test['data'].apply(lambda x: ' '.join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\hotheat\AppData\Local\Temp\jieba.cache
Loading model cost 0.841 seconds.
Prefix dict has been built succesfully.


In [9]:
stop_words = read_file(stopw_path).split('\n')

In [10]:
tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
features = tf.fit_transform(train_test['split_word'])

In [11]:
# from sklearn.feature_extraction.text import CountVectorizer

# tfidf = CountVectorizer(stop_words=stop_words, max_features=300, max_df=0.5)
# features = tfidf.fit_transform(train_test["split_word"])

In [12]:
lbl = LabelEncoder()
train_test['class'] = lbl.fit_transform(train_test['class'])

In [13]:
train_features = features[:train.shape[0], :]
test_features = features[:test.shape[0], :]
y_train = train['class']
y_test = test['class']

In [14]:
train_features.shape, train.shape

((3306, 23734), (3306, 2))

In [15]:
clf = MultinomialNB(alpha=0.001).fit(train_features, y_train)

In [16]:
y_test_pred = clf.predict(test_features)

In [17]:
print(metrics.accuracy_score(y_test_pred, y_test))

0.575
