In [2]:
import multiprocessing
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from tqdm import tqdm
from sklearn import utils

cores = multiprocessing.cpu_count()
print("cpu number is :", cores)


# 第一步 数据预处理，
def cut(document):
    # 定义删除除字母,数字
    doc = nltk.word_tokenize(document)
    filtered = [w for w in doc if (w not in stopwords.words('english') and w not in '!@#$%^&*()_+<>?":,./;{}[]')]
    return filtered


df_train = pd.read_csv('/Users/yuchk/PycharmProjects/IMDB/0_dataset/orign/train_imdb.tsv',
                 usecols=['tag', 'sen'], sep='\t')
df_test = pd.read_csv('/Users/yuchk/PycharmProjects/IMDB/0_dataset/orign/test_imdb.tsv',
                 usecols=['tag', 'sen'], sep='\t')
# 分词，并过滤停用词
df_train['cut_sen'] = df_train['sen'].apply(cut)

print(df_train)

cpu number is : 4
       tag                                                sen  \
0        0  It really boggles my mind when someone comes a...   
1        0  Mary Pickford becomes the chieftain of a Scott...   
2        0  Well, at least my theater group did, lol. So o...   
3        1  I must give How She Move a near-perfect rating...   
4        0  I must say, when I read the storyline on the b...   
...    ...                                                ...   
19995    1  Simple, meaningful and delivers an emotional p...   
19996    1  I'm fan of ART, I like anything about Art, I l...   
19997    0  Despite being a sequel to the more potent orig...   
19998    0  Also known in a different form as "House of Ex...   
19999    0  This has the absolute worst performance from R...   

                                                 cut_sen  
0      [It, really, boggles, mind, someone, comes, ac...  
1      [Mary, Pickford, becomes, chieftain, Scottish,...  
2      [Well, least, the

In [4]:

train, test = train_test_split(df_train, random_state=42, test_size = 0.1)
# 创建标签化文档
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['cut_sen'], tags=[r['tag']]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['cut_sen'], tags=[r['tag']]), axis=1)

model_dbow = Doc2Vec(dm=0, negative=5, hs=0, min_count=2, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])


100%|██████████| 18000/18000 [00:00<00:00, 2732742.32it/s]


In [5]:
model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values),
                     epochs=10)
model_dbow.alpha -= 0.002
model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 18000/18000 [00:00<00:00, 1519797.73it/s]


In [6]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [9]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [10]:
print("############### 使用逻辑回归来预测###########")
log_reg = LogisticRegression(n_jobs=cores, C=1e5, solver='liblinear', max_iter=10000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

print("#############向量机################")
modell = LinearSVC()
modell.fit(X_train, y_train)
y_pred2 = modell.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred2))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred2, average='weighted')))

############### 使用逻辑回归来预测###########


  " = {}.".format(effective_n_jobs(self.n_jobs)))


Testing accuracy 0.896
Testing F1 score: 0.8959999999999999
#############向量机################
Testing accuracy 0.892
Testing F1 score: 0.8919999999999999




In [11]:
print("###############使用随机森林预测##############")
rfc = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=0,n_jobs=4).fit(X_train, y_train)
y_pre_rfc = rfc.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pre_rfc))

###############使用随机森林预测##############
Testing accuracy 0.861


In [None]:

mlp = MLPClassifier().fit(X_train, y_train)
y_pre_mlp = mlp.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pre_mlp))


In [12]:
from sklearn.linear_model import SGDClassifier
sgdc = SGDClassifier()
sgdc.fit(X_train, y_train)
sgdc_predict_y = sgdc.predict(X_test)
sgdr = accuracy_score(y_test, sgdc_predict_y)
print(sgdr)

0.8925
