In [9]:
#Loading the data set - training data.
from sklearn.datasets import fetch_20newsgroups
import json


In [10]:
# Read the whole training file into memory as parsed JSON.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead of
# relying on the platform's locale default.
with open("train.json", 'r', encoding='utf-8') as load_f:
    load_data = json.load(load_f)

In [11]:
import random
t = range(0, 100)   # Range covering 0..99. NOTE(review): not used by any visible cell.
# Hold out a random 20% of the record indices as the test split.
test_index = random.sample(range(0, len(load_data)), int(len(load_data)*0.2))
# Set copy of the held-out indices: the loop performs one membership test per
# record, and testing against the list would be a linear scan every time.
test_index_set = set(test_index)
train_data = []
train_label = []
test_data = []
test_label = []
# Route each record's text and label to the test or training lists.
for i, record in enumerate(load_data):
    if i in test_index_set:
        test_data.append(record['data'])
        test_label.append(record['label'])
    else:
        train_data.append(record['data'])
        train_label.append(record['label'])

In [12]:
# Extracting features from text files: build a bag-of-words count matrix from
# the training texts.
from sklearn.feature_extraction.text import CountVectorizer
max_df = 0.999 # Terms found in more than this fraction of documents (too common) are dropped.
min_df = 0.001 # Terms found in less than this fraction of documents (too rare) are dropped; a float is a proportion, not a count.
count_vect = CountVectorizer(max_df = max_df,
                       min_df = min_df,
                       lowercase = False)
# Learn the vocabulary from the training texts and encode them in one step.
X_train_counts = count_vect.fit_transform(train_data)
# Last expression of the cell: displays the shape of the count matrix.
X_train_counts.shape

(20000, 10227)

In [13]:
# Re-weight the raw term counts with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Not the last expression of the cell, so this shape is evaluated but not displayed.
X_train_tfidf.shape

# Fit a multinomial naive Bayes classifier on the TF-IDF features and training labels.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, train_label)

In [14]:
# Pipeline form of the manual steps: count terms, apply TF-IDF weighting, then
# classify with multinomial naive Bayes.
# The step names 'vect', 'tfidf' and 'clf' are arbitrary labels for the stages.
# Note: the stages here are built with default settings, not with the
# max_df / min_df / lowercase options given to `count_vect`.
from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Train every stage of the pipeline on the raw training texts and labels.
text_clf = text_clf.fit(train_data, train_label)

In [15]:
# Performance of NB Classifier on the held-out test texts.
import numpy as np
predicted = text_clf.predict(test_data)
# Fraction of test documents whose predicted label equals the true label.
np.mean(predicted == test_label)

0.8658

In [16]:
# Support Vector Machine variant: the same vectorising stages, with an
# SGD-trained classifier using hinge loss and an L2 penalty, then its accuracy
# on the held-out test texts.

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        random_state=42,
    )),
])

text_clf_svm = text_clf_svm.fit(train_data, train_label)
predicted_svm = text_clf_svm.predict(test_data)
# Fraction of test documents whose predicted label equals the true label.
np.mean(predicted_svm == test_label)



0.8534

In [None]:
# -*- coding: utf-8 -*-
from sklearn import datasets
from sklearn import svm
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import numpy
import json

# Input format was adjusted so that each entry is one data record.
def inputdata(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; pinning the encoding keeps the read
    # independent of the platform's locale default.
    with open(filename, 'r', encoding='utf-8') as load_f:
        return json.load(load_f)

def splitset(trainset,testset):
    """Separate two record collections into parallel text and label lists.

    Each record is indexed with the keys ``'data'`` (the text) and ``'label'``
    (converted with ``int``).

    Args:
        trainset: Iterable of training records.
        testset: Iterable of test records.

    Returns:
        A 4-tuple ``(train_words, train_tags, test_words, test_tags)``.
    """
    def _unpack(records):
        # Walk the records once, collecting text and integer label in step.
        texts = []
        labels = []
        for record in records:
            texts.append(record['data'])
            labels.append(int(record['label']))
        return texts, labels

    train_words, train_tags = _unpack(trainset)
    test_words, test_tags = _unpack(testset)
    return train_words, train_tags, test_words, test_tags

# Preparation steps once the file has been opened and loaded.


def tfvectorize(train_words,test_words):
    """Convert training and test texts into TF-IDF feature matrices.

    The vectoriser is fitted on the training texts only and then applied,
    unchanged, to the test texts.

    NOTE(review): ``comma_tokenizer`` is not defined in the visible part of
    this file; unless it is defined elsewhere, calling this function raises
    NameError.

    Args:
        train_words: Training texts.
        test_words: Test texts.

    Returns:
        A pair ``(train_features, test_features)``.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=comma_tokenizer,
        binary=False,
        decode_error='ignore',
        stop_words='english',
    )
    train_features = vectorizer.fit_transform(train_words)
    test_features = vectorizer.transform(test_words)
    return train_features, test_features

# Split the data into a training set and a test set by the given ratio.
def splitDataset(dataset,splitRatio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Items are drawn without replacement using the global ``random`` module
    until the training part holds ``int(len(dataset) * splitRatio)`` items.
    The input is left untouched.

    Args:
        dataset: Sequence of items to split.
        splitRatio: Fraction of the items to place in the training part.

    Returns:
        A pair ``(trainSet, remainder)`` of new lists; ``remainder`` keeps the
        items that were not drawn, in their original relative order.

    Raises:
        ValueError: If ``splitRatio`` asks for more items than ``dataset`` holds.
    """
    trainSize = int(len(dataset)*splitRatio)
    if trainSize > len(dataset):
        # Fail before drawing anything instead of running out of items mid-loop.
        raise ValueError('splitRatio requests more items than the dataset contains')
    # Work on a shallow copy: popping from `dataset` itself would silently
    # shrink the caller's list.
    remaining = list(dataset)
    trainSet = []
    while len(trainSet) < trainSize:
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return trainSet, remaining

# Report precision and recall (both macro-averaged).
def evaluate(actual, pred):
    """Print macro-averaged precision and recall to three decimal places.

    Args:
        actual: True labels.
        pred: Predicted labels, aligned with ``actual``.
    """
    macro_precision = metrics.precision_score(actual, pred, average='macro')
    macro_recall = metrics.recall_score(actual, pred, average='macro')
    # Both scores are computed before anything is printed.
    report = (
        ('precision:{0:.3f}', macro_precision),
        ('recall:{0:0.3f}', macro_recall),
    )
    for template, score in report:
        print(template.format(score))

# Create and train the SVM classifier.
def train_clf(train_data, train_tags):
    """Fit a linear-kernel ``svm.SVC`` on the given features and labels.

    Args:
        train_data: Feature matrix for the training samples.
        train_tags: Labels for the training samples; converted with
            ``numpy.asarray`` before fitting.

    Returns:
        The fitted classifier.
    """
    svc_options = dict(
        C=0.1,
        cache_size=200,
        class_weight=None,
        decision_function_shape='ovr',
        kernel='linear',
        max_iter=-1,
        probability=True,
        random_state=42,
        shrinking=True,
        tol=0.0001,
        verbose=False,
    )
    model = svm.SVC(**svc_options)
    model.fit(train_data, numpy.asarray(train_tags))
    return model

def covectorize(train_words,test_words):
    """Convert training and test texts into term-count feature matrices.

    The vocabulary is learned from the training texts only and then applied,
    unchanged, to the test texts. Case is preserved (``lowercase=False``).

    Args:
        train_words: Training texts.
        test_words: Test texts.

    Returns:
        A pair ``(train_counts, test_counts)``.
    """
    # Document-frequency cut-offs, given as fractions of the training documents:
    # terms above the upper bound are too common, terms below the lower bound
    # are too rare.
    upper_df = 0.999
    lower_df = 0.001
    vectorizer = CountVectorizer(
        max_df=upper_df,
        min_df=lower_df,
        lowercase=False,
    )
    train_counts = vectorizer.fit_transform(train_words)
    test_counts = vectorizer.transform(test_words)
    return train_counts, test_counts

# Script entry point: load the records, split them, vectorise the texts, train
# the SVC and print its precision and recall on the test part.
# NOTE: this rebinds the notebook-level names `train_data`, `test_data` and
# `clf` that earlier cells also assign.
if __name__ == '__main__':
    # Load every record from the training JSON file.
    linelist = inputdata('./train.json')

    # Randomly split the records: 70% for training, the remainder for testing.
    trainset, testset = splitDataset(linelist, 0.7)
    print ('train number:', len(trainset))
    print ('test number:', len(testset))
    

    # Separate each part into parallel lists of texts and integer labels.
    train_words, train_tags, test_words, test_tags = splitset(trainset, testset)
    
    # Alternative features (disabled): train_data, test_data = tfvectorize(train_words, test_words)
    # Term-count features; the vocabulary is fitted on the training texts only.
    train_data, test_data = covectorize(train_words, test_words)
    clf = train_clf(train_data,train_tags)

    # NOTE(review): `re` is also the name of the standard-library regex module;
    # that module is not imported in the visible code, so nothing is shadowed here.
    re =  clf.predict(test_data)
    # Print macro-averaged precision and recall for the test predictions.
    evaluate(numpy.asarray(test_tags),re)

train number: 17500
test number: 7500
