In [1]:
# -*- coding: utf8 -*-
import jieba
import pandas as pd
import random

# Integer class id for each news category; later turned into fastText labels.
cate_dic = dict(technology=1, car=2, entertainment=3, military=4, sports=5)

# Load one CSV per category and discard rows that contain missing values.
df_technology = pd.read_csv("./data/technology_news.csv", encoding='utf-8').dropna()
df_car = pd.read_csv("./data/car_news.csv", encoding='utf-8').dropna()
df_entertainment = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8').dropna()
df_military = pd.read_csv("./data/military_news.csv", encoding='utf-8').dropna()
df_sports = pd.read_csv("./data/sports_news.csv", encoding='utf-8').dropna()

# Keep at most 20,000 entries of the `content` column per category.
# Technology and car skip their first 1,000 rows; the others start at row 0.
technology = df_technology["content"].tolist()[1000:21000]
car = df_car["content"].tolist()[1000:21000]
entertainment = df_entertainment["content"].tolist()[:20000]
military = df_military["content"].tolist()[:20000]
sports = df_sports["content"].tolist()[:20000]

In [2]:
# Read the stopword list (one entry per line) into an array of strings.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    sep="\t",
    names=['stopword'],
    index_col=False,
    quoting=3,  # same value as csv.QUOTE_NONE: quote characters are kept literally
    encoding='utf-8',
)['stopword'].values

def preprocess_text(content_lines, sentences, category):
    """Tokenise documents and append them to ``sentences`` as labelled lines.

    Each document is segmented with ``jieba.lcut``. Tokens of length <= 1 and
    tokens present in the module-level ``stopwords`` collection are dropped.
    The remaining tokens are joined with single spaces and prefixed with
    ``__label__<category>`` followed by a space.

    Args:
        content_lines: Iterable of raw document strings.
        sentences: List that receives one formatted line per document
            (mutated in place).
        category: Label value; converted with ``str()`` to build the prefix.

    Returns:
        None. A document that raises during processing is printed and skipped.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # each token against the stopword array scans the whole array every time.
    stopword_set = set(stopwords)
    label_prefix = "__label__" + str(category) + " "
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopword_set
            ]
            sentences.append(label_prefix + " ".join(tokens))
        except Exception:
            # Best effort: show the offending document and carry on.
            print(line)

In [3]:
# Build the labelled training data
sentences = []

for category_name, documents in (
    ('technology', technology),
    ('car', car),
    ('entertainment', entertainment),
    ('military', military),
    ('sports', sports),
):
    preprocess_text(documents, sentences, cate_dic[category_name])

# Interleave the categories so the lines are not grouped by class.
random.shuffle(sentences)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.588 seconds.
Prefix dict has been built succesfully.


In [4]:
print("writing data to fasttext format...")
# The context manager flushes and closes the file before the next cell hands
# the path to fastText; the original handle was never closed, so buffered
# lines could still be missing from disk. The encoding is pinned to UTF-8
# (matching how the CSVs were read) instead of the platform default.
with open('train_data.txt', 'w', encoding='utf-8') as out:
    for sentence in sentences:
        out.write(sentence.strip() + "\n")
print("done!")

writing data to fasttext format...
done!


In [5]:
import fasttext

# Train a supervised classifier on the labelled file, passing no
# hyper-parameters, then save the model to disk.
# Example of a call with explicit hyper-parameters, kept for reference:
# model = fasttext.train_supervised("train.txt", lr=0.1, dim=100, epoch=5, wordNgrams=2, loss='softmax')
# model.save_model("model_file.bin")
classifier = fasttext.train_supervised(input='train_data.txt')
classifier.save_model('classifier.model')

In [6]:
def print_results(N, p, r, k=1):
    """Print an evaluation summary as three tab-separated lines.

    Args:
        N: Number of evaluated examples.
        p: Precision value, shown with three decimals.
        r: Recall value, shown with three decimals.
        k: Cut-off shown in the ``P@k`` / ``R@k`` labels. Defaults to 1,
            which reproduces the previously hard-coded ``@1`` labels.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

# Note: this evaluates on 'train_data.txt', the same file the classifier was
# trained on, so the figures are training-set metrics, not held-out ones.
metrics = classifier.test('train_data.txt')
print_results(*metrics)

N	87578
P@1	0.972
R@1	0.972


In [7]:
# Re-run the evaluation and show the metrics without rounding.
result = classifier.test('train_data.txt')
example_count, precision, recall = result
print('P@1:', precision)
print('R@1:', recall)
print('Number of examples:', example_count)

P@1: 0.9722875608029414
R@1: 0.9722875608029414
Number of examples: 87578


In [8]:
# Reverse lookup from the integer class id to the category name.
label_to_cate = {1:'technology', 2:'car', 3:'entertainment', 4:'military', 5:'sports'}

# One document, already split into space-separated tokens.
texts = ['中新网 日电 2018 预赛 亚洲区 强赛 中国队 韩国队 较量 比赛 上半场 分钟 主场 作战 中国队 率先 打破 场上 僵局 利用 角球 机会 大宝 前点 攻门 得手 中国队 领先']
labels = classifier.predict(texts)
print(labels)
# labels[0][0][0] is the top label of the first text, e.g. '__label__5'.
# Strip the whole '__label__' prefix rather than reading only the last
# character, so class ids with more than one digit still resolve correctly.
top_label = labels[0][0][0]
print(label_to_cate[int(top_label[len('__label__'):])])

([['__label__5']], array([[0.99996436]]))
sports


In [9]:
# Show the single best label first, then the three best labels, for `texts`.
for top_k in (1, 3):
    labels = classifier.predict(texts, k=top_k)
    print(labels)

([['__label__5']], array([[0.99996436]]))
([['__label__5', '__label__4', '__label__1']], array([[9.99964356e-01, 3.91520443e-05, 2.24316627e-05]]))


In [10]:
def preprocess_text_unsupervised(content_lines, sentences, category):
    """Tokenise documents and append them to ``sentences`` as unlabelled lines.

    Each document is segmented with ``jieba.lcut``. Tokens of length <= 1 and
    tokens present in the module-level ``stopwords`` collection are dropped.
    The remaining tokens are joined with single spaces.

    Args:
        content_lines: Iterable of raw document strings.
        sentences: List that receives one space-joined string per document
            (mutated in place).
        category: Unused; accepted so the call signature matches
            ``preprocess_text``.

    Returns:
        None. A document that raises during processing is printed and skipped.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # each token against the stopword array scans the whole array every time.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopword_set
            ]
            sentences.append(" ".join(tokens))
        except Exception:
            # Best effort: show the offending document and carry on.
            print(line)

# Build the unsupervised (unlabelled) training data
sentences = []

for category_name, documents in (
    ('technology', technology),
    ('car', car),
    ('entertainment', entertainment),
    ('military', military),
    ('sports', sports),
):
    preprocess_text_unsupervised(documents, sentences, cate_dic[category_name])


In [11]:
print("writing data to fasttext unsupervised learning format...")
# The context manager flushes and closes the file before fastText reads it;
# the original handle was never closed, so buffered lines could still be
# missing from disk. The encoding is pinned to UTF-8 (matching how the CSVs
# were read) instead of the platform default.
with open('unsupervised_train_data.txt', 'w', encoding='utf-8') as out:
    for sentence in sentences:
        out.write(sentence.strip() + "\n")
print("done!")

writing data to fasttext unsupervised learning format...
done!


In [12]:
import fasttext

# Skip-gram word vectors trained on the unlabelled corpus.
# Earlier call, kept for reference:
# model = fasttext.skipgram('unsupervised_train_data.txt', 'model')
model = fasttext.train_unsupervised(input='unsupervised_train_data.txt', model='skipgram')
probe_word = '赛季'
print(model.words)  # list of words in the model's dictionary
print(model[probe_word])  # vector learned for the probe word

['</s>', '中国', '发展', '汽车', '用户', '技术', '比赛', '市场', '平台', '服务', '电影', '产品', '2017', '企业', '数据', '北京', '公司', '互联网', '行业', '手机', '提供', '内容', '美国', '未来', '时间', '工作', '品牌', '日电', '网络', '智能', '国家', '合作', '观众', '能力', '系统', '世界', '全球', '球员', '领域', '直播', '创新', '训练', '提升', '中新网', '国际', '足球', '希望', '国内', '信息', '战略', '节目', '方式', '情况', '活动', '媒体', '生活', '去年', '球队', '视频', '项目', '带来', '科技', '包括', '发布', '消费者', '现场', '产业', '相关', '体验', '俱乐部', '建设', '增长', '体育', '显示', '超过', '全国', '需求', '设计', '人工智能', '百度', '城市', '模式', '赛事', '关注', '打造', '新能源', '上海', '参加', '推出', '文化', '业务', '表现', '功能', '选择', '集团', '导演', '海军', '拥有', '发现', '时代', '网友', '正式', '孩子', '游戏', '音乐', '车辆', '中心', '优势', '最终', '联赛', '计划', '运动', '第一', '航母', '影响', '传统', '全新', '团队', '升级', '成功', '专业', '联合', '社会', '管理', '票房', '日本', '冠军', '这是', '部队', '作战', '演员', '目标', '明星', '参与', '机会', '春节', '支持', '特别', '过程', '一场', '报道', '代表', '介绍', '销售', '研发', '车型', '体系', '故事', '消费', '首次', '粉丝', '360', '保障', '推动', '经济', '一种', '组织', '春晚', '更是', '基础', '进一步', '持续', '机器人', '影片', '

In [13]:
# CBOW word vectors trained on the same unlabelled corpus.
# Earlier call, kept for reference:
# model = fasttext.cbow('unsupervised_train_data.txt', 'model')
model = fasttext.train_unsupervised(input='unsupervised_train_data.txt', model='cbow')
probe_word = '赛季'
print(model.words)  # list of words in the model's dictionary
print(model[probe_word])  # vector learned for the probe word

['</s>', '中国', '发展', '汽车', '用户', '技术', '比赛', '市场', '平台', '服务', '电影', '产品', '2017', '企业', '数据', '北京', '公司', '互联网', '行业', '手机', '提供', '内容', '美国', '未来', '时间', '工作', '品牌', '日电', '网络', '智能', '国家', '合作', '观众', '能力', '系统', '世界', '全球', '球员', '领域', '直播', '创新', '训练', '提升', '中新网', '国际', '足球', '希望', '国内', '信息', '战略', '节目', '方式', '情况', '活动', '媒体', '生活', '去年', '球队', '视频', '项目', '带来', '科技', '包括', '发布', '消费者', '现场', '产业', '相关', '体验', '俱乐部', '建设', '增长', '体育', '显示', '超过', '全国', '需求', '设计', '人工智能', '百度', '城市', '模式', '赛事', '关注', '打造', '新能源', '上海', '参加', '推出', '文化', '业务', '表现', '功能', '选择', '集团', '导演', '海军', '拥有', '发现', '时代', '网友', '正式', '孩子', '游戏', '音乐', '车辆', '中心', '优势', '最终', '联赛', '计划', '运动', '第一', '航母', '影响', '传统', '全新', '团队', '升级', '成功', '专业', '联合', '社会', '管理', '票房', '日本', '冠军', '这是', '部队', '作战', '演员', '目标', '明星', '参与', '机会', '春节', '支持', '特别', '过程', '一场', '报道', '代表', '介绍', '销售', '研发', '车型', '体系', '故事', '消费', '首次', '粉丝', '360', '保障', '推动', '经济', '一种', '组织', '春晚', '更是', '基础', '进一步', '持续', '机器人', '影片', '

# Gensim Word2Vec

In [14]:
# # -*- coding: utf8 -*-
# import jieba
# import pandas as pd
# import random

# cate_dic = {'technology':1, 'car':2, 'entertainment':3, 'military':4, 'sports':5}

# df_technology = pd.read_csv("./data/technology_news.csv", encoding='utf-8')
# df_technology = df_technology.dropna()

# technology = df_technology.content.values.tolist()[1000:21000]

# stopwords=pd.read_csv("data/stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')
# stopwords=stopwords['stopword'].values

def preprocess_text_unsupervised(content_lines, sentences):
    """Tokenise documents and append each one to ``sentences`` as a token list.

    Each document is segmented with ``jieba.lcut``. Tokens of length <= 1 and
    tokens present in the module-level ``stopwords`` collection are dropped.
    The surviving tokens are appended as one list per document.

    Note: once this cell runs, this definition replaces the earlier
    three-argument function of the same name in this notebook.

    Args:
        content_lines: Iterable of raw document strings.
        sentences: List that receives one list of tokens per document
            (mutated in place).

    Returns:
        None. A document that raises during processing is printed and skipped.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # each token against the stopword array scans the whole array every time.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopword_set
            ]
            sentences.append(tokens)
        except Exception:
            # Best effort: show the offending document and carry on.
            print(line)
# Build the unsupervised training data: one token list per document
sentences = []

for documents in (technology, car, entertainment, military, sports):
    preprocess_text_unsupervised(documents, sentences)

In [15]:
from gensim.models import Word2Vec
# Train Word2Vec on the tokenised documents and save the model to disk.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4 is understood
# to call it `vector_size` — confirm against the installed version.
model = Word2Vec(
    sentences=sentences,
    size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("gensim_word2vec.model")

In [16]:
query_word = '去年'
# Vector lookup; its value is not echoed because it is not the cell's last expression.
model.wv[query_word]
# Nearest neighbours of the query word, shown as the cell output.
model.wv.most_similar(query_word)

[('月份', 0.9055346846580505),
 ('577', 0.8854329586029053),
 ('年初', 0.8834093809127808),
 ('年底', 0.8717650771141052),
 ('月末', 0.8713198900222778),
 ('财年', 0.8599600791931152),
 ('22.3%', 0.8592226505279541),
 ('2012', 0.8577165007591248),
 ('十二五', 0.8533818125724792),
 ('月底', 0.848516583442688)]