In [104]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
import re
import jieba  

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [105]:
data = pd.read_csv('DMSC.csv', index_col=0)
data = data.assign(Star=data['Star'].map(lambda x: 0 if x <=3 else 1))
sample_df = data.groupby(['Movie_Name_CN', 'Star']).apply(
    lambda x: x.sample(n=int(2125056/(28*100)), replace=True, random_state=0))
sample_df.head(2)

  mask |= (ar1 == a)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
Movie_Name_CN,Star,ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
七月与安生,0,1012184,Soulmate,七月与安生,2017-01-05,7152,陈信宏大过天,2016-10-13,0,也许是七月 也许是安生 总有一颗不想稳定下来的心,0
七月与安生,0,1034976,Soulmate,七月与安生,2017-01-05,30357,砂糖的砂,2016-11-07,0,跟全世界路过差不多水平吧 摄影也有很大问题 全片抓不出来一帧称得上电影构图的画面 剧情理解...,0


In [106]:
comments = sample_df.values[:, 7]
star = sample_df.values[:, 6]

x_train, x_test, y_train, y_test, = train_test_split(comments, star, test_size=0.2, random_state=0)

len(y_train), len(y_test), len(x_train), len(x_test)

(33958, 8490, 33958, 8490)

### 数据预处理
1. 清理非中文字符
2. 结巴分词
3. 去除停用词语

In [107]:
# 清理非中文字符
def clean_str(line):
    line.strip('\n')
    line = re.sub(r"[^\u4e00-\u9fff]", "", line)
    line = re.sub(
        "[0-9a-zA-Z\-\s+\.\!\/_,$%^*\(\)\+(+\"\')]+|[+——！，。？、~@#￥%……&*（）<>\[\]:：★◆【】《》;；=?？]+", "", line)
    return line.strip()

In [108]:
# 加载停用词
with open('stopWord.txt') as f:
    stopwords = [line.strip('\n') for line in f.readlines()]

In [109]:
def cut(text_data, labels, stopwords):
    result = []
    new_labels = []
    for index in tqdm_notebook(range(len(text_data))):
        comment = clean_str(text_data[index])
        label = labels[index]
        # 分词
        seg_list = jieba.cut(comment, cut_all=False, HMM=True)
        seg_list = [x.strip('\n')
                    for x in seg_list if x not in stopwords and len(x) > 1]
        if len(seg_list) > 1:
            result.append(" ".join(seg_list))
            new_labels.append(label)
    # 返回分词结果和对应的标签
    return result, new_labels

In [132]:
# 分别对训练数据和测试数据进行数据预处理
train_cut_result, train_labels = cut(x_train, y_train, stopwords)
test_cut_result, test_labels = cut(x_test, y_test, stopwords)

HBox(children=(IntProgress(value=0, max=33958), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8490), HTML(value='')))

In [152]:
# 计算TF-IDF
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=None)
X_train = vectorizer.fit_transform(train_cut_result)

In [153]:
X_test = vectorizer.transform(test_cut_result)

In [154]:
feature_names = vectorizer.get_feature_names()
print("n_features: %d" % len(feature_names))
print("features: %s" % feature_names[:10])

n_features: 34898
features: ['一一', '一丁点', '一万', '一万个', '一万倍', '一万年', '一万次', '一万遍', '一上', '一上午']


In [155]:
# 卡方检验，特征选择。选100个。
select_chi2 = 100
print("Extracting %d best features by a chi-squared test" % select_chi2)
t0 = time()
ch2 = SelectKBest(chi2, k=select_chi2)
X_train = ch2.fit_transform(X_train, train_labels)
X_test = ch2.transform(X_test)

feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
print("done in %fs" % (time() - t0))

Extracting 100 best features by a chi-squared test
done in 0.063143s


In [156]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

In [157]:
print_report = True
target_names = ["积极", "消极"]

def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, train_labels)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(test_labels, pred)
    print("accuracy:   %0.3f" % score)
    
    print("top 20 keywords per class:")
    top20 = np.argsort(clf.coef_[0])[-20:]
    end20 = np.argsort(clf.coef_[0])[:20]
    print(trim("%s: %s" % ("积极", " ".join([feature_names[i] for i in top20]))))
    print(trim("%s: %s" % ("消极", " ".join([feature_names[i] for i in end20]))))

    if print_report:
        print("classification report:")
        print(metrics.classification_report(test_labels, pred,
                                            target_names=target_names))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [158]:
results = []
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.015s
test time:  0.001s
accuracy:   0.637
top 20 keywords per class:
积极: 支持 爱情 精彩 值得 很棒 五星 感动 国产 我们 什么 非常 很多 虽然 青春 一部 没有 剧情 喜欢 好看 不错
消极: 虚高 侮辱 最烂 负分 玩意 空洞 看不下去 浪费 乱七八糟 两颗 无力 太烂 圈钱 垃圾 小品 三分 混乱 无感 新意 毫无
classification report:
              precision    recall  f1-score   support

          积极       0.73      0.43      0.54      3767
          消极       0.60      0.84      0.70      3831

   micro avg       0.64      0.64      0.64      7598
   macro avg       0.66      0.64      0.62      7598
weighted avg       0.66      0.64      0.62      7598


________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
train time: 0.011s
test time:  0.001s
accuracy:   0.636
top 20 keywords per class:
积极: 很棒 不能 那些 值得 五星 爱情 国