# 股市数据可视化

In [None]:
import imp
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

In [None]:
# Load data: the sh/sz index tables and the sh/sz margin-trade tables.
# NOTE(review): the paths are absolute and machine-specific; adjust before running elsewhere.
df_sh_index = pd.read_csv("/Users/mac/Desktop/tianchi_game/data/sh_index.csv")
df_sz_index = pd.read_csv("/Users/mac/Desktop/tianchi_game/data/sz_index.csv")
df_sh_margin_trade = pd.read_csv("/Users/mac/Desktop/tianchi_game/data/sh_margin_trade.csv")
df_sz_margin_trade = pd.read_csv("/Users/mac/Desktop/tianchi_game/data/sz_margin_trade.csv")

In [None]:
# Preview the first rows of the sh margin-trade table.
df_sh_margin_trade.head()

In [None]:
# Build a two-column frame (parsed date, margin balance) from the sh margin-trade table.
margin_balance = df_sh_margin_trade["margin_balance"].to_list()
date = list(map(str, df_sh_margin_trade["date"].to_list()))
# Dates are parsed strictly as YYYY-MM-DD.
date_new = pd.to_datetime(date, format="%Y-%m-%d")
df = pd.DataFrame(
    {
        "date": date_new,
        "margin_balance": margin_balance,
    }
)

In [None]:
# Plotly Express bar chart of margin_balance against date; title and axis labels are in Chinese.
fig = px.bar(df, x ="date", y = "margin_balance", title = "融资融券趋势图", labels={"date":"日期", "margin_balance":"融资融券余额"})
fig.show()

In [None]:
# Create the Dash application.
app = dash.Dash()
# Dash refuses to serve an app whose layout is None, so give it a minimal
# layout that shows the margin-balance bar chart built above (`fig`).
app.layout = html.Div([dcc.Graph(figure=fig)])

In [None]:
# Start the Dash development server with debug mode enabled.
# NOTE(review): inside a notebook cell this call may block and the debug
# reloader may misbehave; confirm, and consider use_reloader=False if so.
app.run_server(debug=True)

In [None]:
# Build a (date, vol) frame from the sh index table.
date = [str(x) for x in df_sh_index["date"].to_list()]
vol = df_sh_index["vol"]
# Parse the dates of THIS table. The previous code reused `date_new`, which
# holds the margin-trade dates and is unrelated to (and may differ in length
# from) the index rows.
# NOTE(review): the date format of sh_index.csv is not visible here, so the
# format is inferred by pandas; pin `format=` once the layout is confirmed.
index_dates = pd.to_datetime(date)
df = pd.DataFrame({"date": index_dates, "vol": vol})

# 用户情感分析可视化

In [None]:
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import wordcloud
import matplotlib.pyplot as plt
from pyecharts.charts import WordCloud, Bar, Line
import pyecharts.options as opts
from collections import Counter

In [None]:
# Load the earphone sentiment data set and preview its first rows.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv("/Users/mac/Documents/同步空间/tianchi_game/data/earphone_sentiment.csv")
df.head()

In [None]:
# Load the stop-word list: one entry per line, surrounding whitespace removed.
# The file is opened in a `with` block so the handle is closed deterministically,
# and the encoding is explicit so reading Chinese text does not depend on the
# platform's default locale.
with open("/Users/mac/Documents/同步空间/python_data/tech_data/jieba_dict/stop_words.txt", encoding="utf-8") as stop_words_file:
    stop_words = [line.strip() for line in stop_words_file]

In [None]:
# Segment every comment with jieba and drop stop words.
# Each entry of word_list is the kept words of one comment, each followed by a
# single space (e.g. "w1 w2 "), exactly one entry per row of df so the list can
# be inserted back as a column.
stop_word_set = set(stop_words)  # O(1) membership tests instead of scanning a list
word_list = []
for content in df["content"]:
    if not isinstance(content, str):
        # Missing / non-text comment: keep an empty placeholder rather than
        # silently skipping the row, which would misalign word_list with df.
        word_list.append("")
        continue
    kept_words = [
        word
        for word in jieba.lcut(content, cut_all=False)
        if word not in stop_word_set
    ]
    word_list.append("".join(word + " " for word in kept_words))

In [None]:
# Add the segmented text as a new "word" column at column position 5, then preview.
# NOTE(review): DataFrame.insert raises if the column already exists, so this
# cell is presumably not re-runnable without reloading df — confirm.
df.insert(5,"word",word_list)
df.head()

## 对正向情感做词云图

In [None]:
# Segmented text of the comments whose sentiment_value equals 1.
data = df.loc[df["sentiment_value"] == 1, "word"]
data

In [None]:
# Merge the segmented text of all selected comments into one space-separated
# string (words_data) and count how often each word occurs (text_count).
#
# `data` is a filtered Series that keeps the original row labels, so it is
# iterated by value: positional-looking `data[i]` lookups are label lookups and
# fail for most rows.
positive_tokens = [token for text in data for token in text.split()]
words_data = " ".join(positive_tokens)
# Count whole words; Counter(words_data) on the string would count characters.
text_count = Counter(positive_tokens)

In [None]:
# Extract up to 100 keywords with their weights from words_data via
# jieba.analyse.extract_tags, restricted to the POS tags ns / n / vn / v.
test = jieba.analyse.extract_tags(words_data, topK=100, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))

In [None]:
# Render the (keyword, weight) pairs as a pyecharts word cloud and write it to
# wordcloud.html; word sizes range from 20 to 100.
(WordCloud()
.add(series_name="wordcloud",data_pair=test,word_size_range=[20,100])
.render("wordcloud.html")
)


In [None]:
# Matplotlib defaults for this section: font family, image interpolation,
# image colormap, and plain (ASCII) minus signs on axes.
plt.rcParams.update(
    {
        'font.family': ['Arial Unicode MS'],
        'image.interpolation': 'nearest',
        'image.cmap': 'gray',
        'axes.unicode_minus': False,
    }
)

In [None]:
# Keyword -> weight mapping for the frequency-based word cloud below.
test = jieba.analyse.extract_tags(words_data, topK=100, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
# extract_tags returns (word, weight) pairs and may return fewer than topK of
# them; building the dict straight from the pairs avoids the IndexError that a
# fixed range(100) loop hits in that case.
word_dict = dict(test)
word_dict

In [None]:
# Draw a word cloud from the keyword weights with the `wordcloud` package and
# show it with matplotlib (axes hidden).
# NOTE(review): no font_path is passed; the package's default font may lack CJK
# glyphs — confirm the Chinese words render legibly.
wc = wordcloud.WordCloud(background_color="gray", max_words =100, max_font_size=50, random_state=42,width=800,height=600)
wc.generate_from_frequencies(word_dict)
plt.imshow(wc)
plt.axis("off")
plt.show()

## 根据情感绘制柱状图

In [None]:
# Segmented text split by sentiment: value 1 -> good_sen, value 0 -> bad_sen.
good_sen = df.loc[df["sentiment_value"] == 1, "word"]
bad_sen = df.loc[df["sentiment_value"] == 0, "word"]


In [None]:
# Frequency of each distinct segmented-comment string among the sentiment-1 rows.
good_sen.value_counts()

In [None]:
# Number of rows with sentiment_value == 0.
bad_sen.size

In [None]:
# Two-row summary table: number of "good" and "bad" comments.
sen_data = pd.DataFrame(
    {
        "sen": ["good", "bad"],
        "count": [good_sen.size, bad_sen.size],
    }
)
sen_data

### 针对所有主题的柱状图

In [None]:
# Bar chart of the number of good vs. bad comments.
plt.bar(sen_data["sen"],sen_data["count"])
plt.show()

In [None]:
# Split the value counts of the positive comments into two parallel lists
# (the counted strings and how often each occurs) and bundle them as good_word.
_good_sen_counts = good_sen.value_counts()
good_word_key = _good_sen_counts.index.tolist()
good_word_value = _good_sen_counts.tolist()
good_word = [good_word_key, good_word_value]

In [None]:
#  Build a frame from the [keys, counts] pair and transpose it so each row is
#  one (word, count) entry; then show the entries seen at least 3 times.
good_words = pd.DataFrame(good_word)
good_words = good_words.T
good_words.columns = ["word","count"]
good_words.query("count >= 3")
#  Caveat: value_counts here compares the whole segmented string of each
#  comment, so this is NOT a per-word frequency count.

### 针对不同主题的评价柱状图

In [None]:
# Split the data into two frames by sentiment (1 -> data_good, 0 -> data_bad)
# and preview the positive one.
data_good = df.query("sentiment_value == 1")
data_bad = df.query("sentiment_value == 0")
data_good.head()

In [None]:
# Split by subject.
# df["subject"].value_counts() shows 7 classes: 其他 配置 音质 价格 外形 功能 舒适

In [None]:
# Subject column split by sentiment, plus the fixed list of subject names used
# as x-axis tick labels in the grouped bar chart.
sub_good = df.query("sentiment_value == 1")["subject"]
sub_bad = df.query("sentiment_value == 0")["subject"]
sub_label = ["其他","配置","音质","价格","外形","功能","舒适"]

In [None]:
# Number of positive comments per subject, as a two-column (sub, count) frame.
_sub_good_counts = sub_good.value_counts()
sub_good_key = _sub_good_counts.index.tolist()
sub_good_value = _sub_good_counts.tolist()
sub_good_data = [sub_good_key, sub_good_value]
# Rows of the raw frame are [keys, values]; transpose to one row per subject.
sub_good_datas = pd.DataFrame(sub_good_data).T
sub_good_datas.columns = ["sub","count"]
sub_good_datas

In [None]:
# Number of negative comments per subject, as a two-column (sub, count) frame.
_sub_bad_counts = sub_bad.value_counts()
sub_bad_key = _sub_bad_counts.index.tolist()
sub_bad_value = _sub_bad_counts.tolist()
sub_bad_data = [sub_bad_key, sub_bad_value]
# Rows of the raw frame are [keys, values]; transpose to one row per subject.
sub_bad_datas = pd.DataFrame(sub_bad_data).T
sub_bad_datas.columns = ["sub","count"]
sub_bad_datas

In [None]:
# Plot data: grouped bars of good vs. bad comment counts per subject.
# NOTE(review): the two imports below look unused (possibly editor auto-imports); kept as-is.
from enum import auto
from matplotlib.pyplot import legend

# value_counts() orders each table by its own frequencies, so the good and bad
# tables may list the subjects in different orders (or omit a subject).
# Re-align both to the tick-label order in sub_label, filling missing subjects
# with 0, so every bar sits under its own label and good/bad bars are paired.
good_counts = sub_good_datas.set_index("sub")["count"].reindex(sub_label, fill_value=0).astype(int)
bad_counts = sub_bad_datas.set_index("sub")["count"].reindex(sub_label, fill_value=0).astype(int)

ind = np.arange(len(sub_label))  # one bar group per subject label
width = 0.35

fig,ax = plt.subplots()
sub_good_1= ax.bar(ind - width/2 ,good_counts.to_numpy(), width,color = "SkyBlue",label = "good")
sub_bad_1 = ax.bar(ind + width/2 ,bad_counts.to_numpy(), width,color = "IndianRed",label = "bad")

ax.set_ylabel("Number of comments")
ax.set_title("主题情感分析")
ax.set_xticks(ind)
ax.set_xticklabels(sub_label)
ax.legend()

def autolabel(rects,xpos='center'):
    """Write each bar's integer height as a text label just above the bar.

    rects: iterable of bar objects exposing get_x(), get_width(), get_height().
    xpos:  'center', 'left' or 'right' (case-insensitive); selects the
           horizontal placement of the label relative to the bar.

    Draws on the module-level ``ax``.
    """
    side = xpos.lower()
    # placement -> (text horizontal alignment, fractional x position inside the bar)
    placement = {
        'center': ('center', 0.5),
        'right': ('left', 0.57),
        'left': ('right', 0.43),
    }

    for bar in rects:
        bar_height = bar.get_height()
        align, fraction = placement[side]
        label_x = bar.get_x() + bar.get_width()*fraction
        # Label sits 1% above the top of the bar.
        ax.text(label_x, 1.01*bar_height, '%s' % int(bar_height), ha=align, va='bottom')

# Annotate the good bars (label placement "left") and the bad bars ("right")
# with their heights.
autolabel(sub_good_1,"left")
autolabel(sub_bad_1,"right")

# Display the finished figure.
plt.show()

## 相关性分析及heatmap图

# 新闻文本分类

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

In [None]:
#  Make matplotlib display Chinese text correctly.
plt.rcParams['font.family'] = ['Arial Unicode MS']
plt.rcParams['image.interpolation'] = 'nearest'
# Was the unterminated literal 'gra中文 (a syntax error); restored to 'gray',
# the value the same settings block uses earlier in this file.
plt.rcParams['image.cmap'] = 'gray'
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Load the tab-separated training set and preview its first rows.
df = pd.read_csv("data/train_set.csv", sep = "\t")
df.head()

In [None]:
# Length of each text in space-separated pieces: pieces = number of single
# spaces + 1. Then summarise the distribution of those lengths.
df['text_len'] = df['text'].apply(lambda x: x.count(' ') + 1)
df['text_len'].describe()

In [None]:
# Bar chart of the number of samples per label, with category names on the x axis.
label_counts = df['label'].value_counts()
label_counts.plot(kind='bar')
x = label_counts.index
x_label =['科技','股票','体育','娱乐','时政','社会','教育','财经','家居','游戏','房产','时尚','彩票','星座']
# pandas draws the bars at positions 0..n-1 in value_counts order, so the ticks
# must go at those positions (not at the label values), each carrying the name
# of the label drawn there.
# NOTE(review): assumes the labels are integer ids 0..13 that index x_label — confirm.
plt.xticks(range(len(x)), [x_label[label] for label in x])
plt.show()


以上是基于数据做的基础的可视化分析


## 基于深度学习-bert进行分类

In [1]:
import joblib
from config import *
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import os
import re



### 加载数据及数据预处理

In [4]:
def data_preprocess(output_path='../alldata.csv'):
    """Build the tokenizer training corpus from the train and test news sets.

    Reads ``./data/NLP_news_train_set.csv`` and ``./data/NLP_news_test_a.csv``
    (tab-separated, UTF-8, each with a ``text`` column), removes the tokens
    3750, 900 and 648 from every text (treated as punctuation, per the
    original comment), and writes the cleaned train texts followed by the
    cleaned test texts as a single-column CSV with the header ``words``.

    Args:
        output_path: Destination CSV path. Defaults to ``'../alldata.csv'``,
            the location this function has always written to.
    """
    # Match the ids only as whole tokens. Without the word boundaries the
    # pattern also deletes them from inside longer ids (e.g. "13750" -> "1").
    punctuation_ids = re.compile(r'\b(?:3750|900|648)\b')

    def _load_words(path):
        # Return the cleaned ``text`` column of one file as a Series named "words".
        frame = pd.read_csv(path, sep='\t', encoding='UTF-8')
        return frame['text'].apply(lambda x: punctuation_ids.sub("", x)).rename('words')

    rawdata_words = _load_words("./data/NLP_news_train_set.csv")
    # Test set used for prediction.
    final_test_words = _load_words('./data/NLP_news_test_a.csv')

    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    all_value = pd.concat([rawdata_words, final_test_words])
    # The former ``all_value.columns = ['text']`` had no effect on a Series
    # (the CSV header stayed "words"), so it is dropped and the output is unchanged.
    all_value.to_csv(output_path, index=False)

In [5]:
# Run the preprocessing step, which writes the merged corpus CSV.
data_preprocess()

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import BPE,WordLevel

In [8]:
# Tokenizer backed by a WordLevel model; '[UNK]' is configured as the unknown token.
tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))

In [9]:
from tokenizers.trainers import BpeTrainer,WordLevelTrainer

In [10]:
# Trainer for the WordLevel model, registering the five special tokens.
trainer = WordLevelTrainer(special_tokens = ['[UNK]','[CLS]','[SEP]','[PAD]','[MASK]'])


In [11]:
# Use the Whitespace pre-tokenizer to split the input before the WordLevel model is applied.
from tokenizers.pre_tokenizers import Whitespace
tokenizer.pre_tokenizer = Whitespace()

In [12]:
# Train the tokenizer on the merged corpus and save it as JSON.
# data_preprocess() writes the corpus to '../alldata.csv' by default, so train
# from that same file (this cell previously read './alldata.csv', which nothing
# visible in this notebook produces).
tokenizer.train(['../alldata.csv'], trainer)
tokenizer.mask_token = '[MASK]'
tokenizer.save('./tokenizer_whitespace.json')

### 设定预训练模型参数，初始化预训练模型

选择XLNet模型作为预训练模型

In [20]:
from transformers import RobertaConfig, AlbertConfig, XLNetConfig

In [None]:
# Model hyper-parameters passed to XLNetConfig.
config_kwargs = {
    "d_model":512,
    "n_head":4,
    "vocab_size":tokenizer.get_vocab_size(), # vocabulary size taken from the trained tokenizer
    # NOTE(review): "embedding_size" does not appear to be an XLNetConfig
    # parameter (it looks like an ALBERT setting) — confirm; kept unchanged.
    "embedding_size":64,
    "bi_data":True,
    # Was "n_player", a typo: XLNetConfig's depth parameter is n_layer, so the
    # misspelled key was stored as an unused attribute and the default depth applied.
    "n_layer":8
}

In [None]:
# Build the XLNet configuration from the hyper-parameters above.
config = XLNetConfig(**config_kwargs)

In [None]:
from transformers import RobertaForMaskedLM, AlbertForMaskedLM, XLNetLMHeadModel

In [None]:
# Instantiate an XLNet language-model-head model from the configuration object.
model = XLNetLMHeadModel(config = config)