In [1]:
import jieba
import pandas as pd

In [7]:
df_military = pd.read_csv("data/military_news.csv",index_col = 0, encoding='utf-8')
df_military = df_military.dropna()

df_car = pd.read_csv("data/car_news.csv", index_col = 0, encoding='utf-8')
df_car = df_car.dropna()


In [8]:
car = df_car.content.values.tolist()[1000:21000]
military = df_military.content.values.tolist()[:20000]

In [14]:
stopwords = pd.read_csv('data/stopwords.txt', encoding = 'utf-8',index_col=False, quoting = 3, names=['stopword'])
stopwords = stopwords['stopword'].tolist()

# 处理数据，去掉停用词，然后合并到一个数据集里面

In [16]:
def process_data(content_lines, sentences, category):
    for line in content_lines:
        segs = jieba.cut(line)
        segs = filter(lambda x:len(x) > 1, segs)
        segs = filter(lambda x: x not in stopwords, segs)
        sentences.append((" ".join(segs), category))
        
sentences = []
process_data(car, sentences, 'car')
process_data(military, sentences, 'military')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LL\AppData\Local\Temp\jieba.cache
Loading model cost 1.152 seconds.
Prefix dict has been built succesfully.


In [18]:
import random
random.shuffle(sentences)

In [19]:
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)    #这里的zip用的很灵性啊
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=123)

先做词袋处理，然后用NB训练

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(
    max_features=6000,  # keep the most common 1000 ngrams
)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=6000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
clf = MultinomialNB()
clf.fit(vec.transform(x_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
clf.score(vec.transform(x_test), y_test)

0.96347296709668073

In [42]:
vec = CountVectorizer(
    max_features=10000,  # keep the most common 1000 ngrams
    ngram_range=(1,4),  # use ngrams of size 1 and 2
)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [44]:
clf = MultinomialNB()
clf.fit(vec.transform(x_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
clf.score(vec.transform(x_test), y_test)

0.96579214378895495

In [46]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf, vec.transform(x_train), y_train, cv=5)

array([ 0.97101449,  0.96859903,  0.97004107,  0.96400097,  0.97003383])

# 自己来做一个文本分类器

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

class TextClassifier():
    def __init__(self, classifier =  MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(ngram_range=(1,4), max_features=10000)
    
    def features(self, X):
        return self.vectorizer.transform(X)
    
    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)
        
    def predict(self, x):
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        return self.classifier.score(self.features(X), y)

In [53]:
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)

In [57]:
print(text_classifier.predict('我 爱 奔驰 和吧'))
print(text_classifier.score(x_test, y_test))

['military']
0.965792143789
