In [1]:
import numpy as np
import pandas as pd
import json
import re
import jieba
from gensim.models import Word2Vec
import gensim
import multiprocessing
from gensim.corpora.dictionary import Dictionary
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional,LSTM,Dense,Embedding,Dropout,Activation,Softmax,Flatten,Conv1D, SimpleRNN
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential

import matplotlib.pyplot as plt
from wordcloud import WordCloud


## 处理申请数据

In [3]:
# Load the application ("申请") and funded-project ("立项") sheets of the 2020 workbook
# and split the reviewer feedback column into one column per reviewer.
DATA_PATH = "/Users/yansong/Nutstore Files/基金研究/data/2013--2022面青/2020.xlsx"
FEEDBACK_COL = "反馈评议意见"


def _opinion_names(split_frame):
    """Map the integer columns 1..k of a ``str.split(expand=True)`` result to "意见1".."意见k".

    Column 0 (the text before the first "<") gets no name; the callers drop it.
    Building the mapping from the columns that actually exist avoids hard-coding
    the number of reviewers.
    """
    return {col: f"意见{col}" for col in split_frame.columns if col != 0}


# Read both sheets with a single pass over the workbook.
sheets = pd.read_excel(DATA_PATH, sheet_name=["申请", "立项"])

# Applications: drop the columns that are not used, then attach one column per review.
apps = sheets["申请"].drop(columns=['Unnamed: 0','Unnamed: 2','Unnamed: 5', 'Unnamed: 6','Unnamed: 7', 'Unnamed: 8','项目名称2','姓名','申请单位'])
apps_comments = apps[FEEDBACK_COL].str.split("<", expand = True)
df_app = (
    pd.merge(apps, apps_comments, left_index=True, right_index=True)
    .drop(columns=[0, FEEDBACK_COL])
    .rename(columns=_opinion_names(apps_comments))
)

# Funded projects: same treatment of the feedback column.
grants = sheets["立项"]
grants_split = grants[FEEDBACK_COL].str.split("<", expand = True)
grants_comments = grants_split.drop(columns=0).rename(columns=_opinion_names(grants_split))
df_grants = pd.merge(grants, grants_comments, left_index=True, right_index=True).drop(columns=FEEDBACK_COL)

# Flag which applications were funded.
def check_grant(a):
    """Return 1 if project title ``a`` occurs in ``df_grants["项目名称"]``, otherwise 0.

    The result is a 0/1 flag rather than a match count, so a title that appears
    more than once in the funded sheet cannot push the flag above 1.
    """
    return int(df_grants["项目名称"].isin([a]).any())

# Vectorised form of df_app["项目名称"].apply(check_grant): one membership test for
# the whole column instead of one scan of df_grants per application.
df_app["立项"] = df_app["项目名称"].isin(df_grants["项目名称"]).astype(int)

# Keep only the "面上项目" category; copy so later column assignments on the
# filtered frame do not operate on a view of the unfiltered one.
df_app = df_app[df_app['资助类别']=='面上项目'].copy()

In [4]:
df_app.groupby(['资助类别'])["立项"].mean()

资助类别
面上项目    0.229133
Name: 立项, dtype: float64

## 意见处理
{
	"1": {
		"id": "1",
		"records": {
			"情感": [
				"正"
			]
		},
		"content": "这本书不错啊"
	},
	"2": {
		"id": "2",
		"records": {
			"情感": [
				"负"
			]
		},
		"content": "这个东西评价不行"
	}
}

In [5]:
df_app.head()

Unnamed: 0,项目名称,申请人,申请部门,资助类别,意见1,意见2,意见3,意见4,意见5,立项
41,自守形式和自守L-函数的相关解析问题,徐钊,数学学院,面上项目,1>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,2>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,3>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,4>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,5>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,0
42,有界格上聚合与推理算子的理论与应用,刘华文,数学学院,面上项目,1>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,2>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,3>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,4>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,5>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,1
43,多凯勒参数卡拉比-丘型的纽Gromov-Witten不变量,王新,数学学院,面上项目,1>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,2>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,3>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,4>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,5>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,0
44,有向图的圈划分及相关问题研究,颜谨,数学学院,面上项目,1>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,2>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,3>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,4>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,5>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,1
45,哈密顿偏微分方程的稳定性,李静,数学与统计学院,面上项目,1>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,2>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,3>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,4>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,5>具体评价意见：\n一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理...,1


In [144]:
# Build a long table with one row per (application, review) and export it as JSON.
# Copy the column subset so adding 'id' does not write into a slice of df_app.
df_json_full = df_app[["申请人","意见1","意见2","意见3","意见4","意见5","立项"]].copy()
df_json_full['id'] = df_json_full.index
df_json = df_json_full[['id',"意见1","意见2","意见3","意见4","意见5","立项"]]

# Rows are left in the order wide_to_long produces them: the positional
# train/validation split further down depends on this row order.
# The index is renumbered after dropna so that label i is also position i.
df_json_long = (
    pd.wide_to_long(df_json, stubnames='意见', i=['id'], j='评审')
    .reset_index()
    .rename(columns={"立项":"records","意见":"content"})
    .dropna()
    .reset_index(drop=True)
)
# 0 -> "负" (not funded), 1 -> "正" (funded); any other value is left untouched.
df_json_long["records"] = df_json_long["records"].replace({0: "负", 1: "正"})
df_json_long = df_json_long[["id","records","content"]]


#sample_size = 5000
sample_size = df_json_long.shape[0]

df_json_sample = df_json_long.iloc[0:sample_size,:].copy()
result = df_json_sample.to_json(orient="index",force_ascii=False)

# `result` is already a JSON document; write it as-is. Passing it through
# json.dump would serialise the string a second time and produce a quoted,
# escaped blob instead of a JSON object. UTF-8 is set explicitly because the
# text is Chinese and the platform default encoding may not be able to encode it.
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(result)

df_json_sample.head()


KeyError: "['意见6'] not in index"

## 处理步骤
1. 去除重复词语
2. 去除stop words
3. 分词
4. 生成embedding

## 重复词语

In [None]:
# Boilerplate fragments stripped from every review, in this order. Order matters:
# the long question sentences must go before the single characters they contain.
_CLEAN_PATTERNS = [
    '一、该申请项目所关注的科学问题是否源于多学科领域交叉的共性问题，具有明确的学科交叉特征？请详细阐述判断理由并评价预期成果的科学价值。',
    '二、请针对学科交叉特点评述申请项目研究方案或技术路线的创新性和可行性。',
    '三、请评述申请人的多学科背景、研究专长和创新潜力。',
    '四、其他建议',
    '1>',
    '（1）',
    '（2）',
    '（3）',
    '（4）',
    '（5）',
    '具体评价意见：',
    '一、该申请项目的研究思想或方案是否具有新颖性和独特性？请详细阐述判断理由。',
    '二、请评述申请项目所关注问题的科学价值以及对相关前沿领域的潜在贡献。',
    '三、请评述申请人的创新潜力与研究方案的可行性',
    '一、该申请项目是否面向国家需求并试图解决技术瓶颈背后的基础问题？请结合应用需求详细阐述判断理由。',
    '二、请评述申请项目所提出的科学问题与预期成果的科学价值。',
    '三、请评述申请人的创新潜力及研究方案的创新性和可行性。',
    '。',
    '，',
    '、',
    '的',
    '研究',
    '三请',
    '申请人',
    '拟',
]

# path -> list of stop words, so the file is parsed once instead of once per review.
_STOPWORDS_CACHE = {}


def _load_stopwords(path='stopwords.txt'):
    """Read the first column of ``path`` as a list of non-empty strings (cached per path)."""
    if path not in _STOPWORDS_CACHE:
        # dtype=str / keep_default_na=False keep entries such as "1" or "NA" as text.
        words = pd.read_csv(path, header=None, dtype=str, keep_default_na=False)[0].tolist()
        _STOPWORDS_CACHE[path] = [w for w in words if w]
    return _STOPWORDS_CACHE[path]


def clean_pat(line, stopwords=None):
    """Strip boilerplate and stop words from one review text.

    Parameters
    ----------
    line : str
        Raw review text.
    stopwords : iterable of str, optional
        Stop words to delete. When omitted they are loaded from 'stopwords.txt'
        (first column, no header) and cached.

    Returns
    -------
    str
        ``line`` with newlines, the leading "<number>>" reviewer prefix, every
        entry of ``_CLEAN_PATTERNS`` and every stop word removed.

    Notes
    -----
    All removals are literal substring deletions. Treating the entries as
    regular expressions would make stop words such as "?" or "+" either raise
    ``re.error`` or match unintended text.
    """
    line = line.replace("\n", "")
    # Reviews start with "1>", "2>", ...; drop the prefix whatever the reviewer number.
    line = re.sub(r"^\s*\d+>", "", line)
    for fragment in _CLEAN_PATTERNS:
        line = line.replace(fragment, "")

    # stopwords
    if stopwords is None:
        stopwords = _load_stopwords()
    for word in stopwords:
        line = line.replace(word, "")
    return line

In [None]:
df_json_sample["content_clean"] = df_json_sample["content"].apply(clean_pat)

In [None]:
df_json_sample["content_clean"][0]

## 使用机器学习模型预测评审意见的正负

In [None]:
# Each cleaned review is one document. Segment it with jieba and re-join the
# tokens with spaces so that whitespace-based tools can split it again.
lines = list(df_json_sample['content_clean'])
# Materialise the tokens: jieba.cut returns a generator, which would be empty
# after the join below if it were stored as-is.
seg_lists = [list(jieba.cut(line)) for line in lines]
sentences = [' '.join(tokens) for tokens in seg_lists]
print(sentences[0])

In [None]:
# Numeric targets, one per review, in the same row order as df_json_sample:
# "负" -> 0, "正" -> 1; any other value is passed through unchanged.
# Built as a new list so the "records" column of df_json_sample is not modified.
label_codes = {"负": 0, "正": 1}
labels = [label_codes.get(value, value) for value in df_json_sample["records"]]

In [None]:
# One long string for the word cloud. Documents are joined with a space so the
# last token of one review does not fuse with the first token of the next.
texts = ' '.join(map(str, sentences))


In [None]:
# Word-cloud settings: a font file for rendering the Chinese text, white
# background, at most 80 words, 800x400 image, font size capped at 50.
cloud_options = dict(
    font_path='/System/Library/Fonts/STHeiti Light.ttc',
    background_color="white",
    max_words=80,
    height=400,
    width=800,
    max_font_size=50,
)
wordcloud = WordCloud(**cloud_options).generate(texts)

# Render with matplotlib; without it, wordcloud.to_image().show() is the PIL route.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

In [None]:
# vocab_size: upper bound on the vocabulary, shared by the tokenizer and the
#             embedding layer defined later.
# max_length: every encoded review is brought to this length.
vocab_size = 14000
max_length = 200

# Learn the word index from the space-separated reviews, then turn each review
# into a sequence of integer word ids.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences)
encoded_docs = tokenizer.texts_to_sequences(sentences)

# padding='post': shorter sequences are filled up at the end.
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

In [None]:
# Positional split: the first `train_size` reviews are used for training,
# everything after them for validation (no shuffling).
train_size = 2000

X_train, X_val = padded_docs[:train_size, :], padded_docs[train_size:, :]
y_train = np.array(labels[:train_size])
y_val = np.array(labels[train_size:])

In [None]:
# Binary classifier: 20-dim embedding -> bidirectional LSTM (15 units per
# direction) -> a single sigmoid unit with L2 kernel regularisation.
model = Sequential()
model.add(Embedding(vocab_size, 20, input_length=max_length))  # Embedding layer
model.add(Bidirectional(LSTM(15)))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid',kernel_regularizer='l2'))
model.compile(loss='binary_crossentropy',
                     optimizer='adam', 
                     metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
model.summary()

In [None]:
# Train for 10 epochs. When a validation split exists it is scored after every
# epoch, so history.history also carries the validation metrics that the
# train/test accuracy plot needs; it is not used for weight updates.
validation = (X_val, y_val) if len(X_val) else None
history = model.fit(X_train, y_train, epochs=10, validation_data=validation)

In [None]:
score = model.evaluate(X_val, y_val, verbose=1)

In [None]:
# Score all training reviews with one batched predict call instead of one call
# per row. The model ends in Dense(1), so column 0 holds the sigmoid output.
train_scores = model.predict(X_train)[:, 0] if len(X_train) else np.array([], dtype="float32")
senti_train_num = list(train_scores)

# Predicted score next to the true label, then the mean score per label.
senti_train = pd.DataFrame(list(zip(senti_train_num, y_train)), columns =['Senti', 'Y'])
senti_train.groupby('Y').mean()

In [None]:
# Score densities on the training set, one curve per true label, drawn on the
# same axes and labelled so the two curves can be told apart.
ax = senti_train.loc[senti_train['Y']==0,'Senti'].plot.density(label='Y = 0')
senti_train.loc[senti_train['Y']==1,'Senti'].plot.density(ax=ax, label='Y = 1')
ax.set_xlabel('Senti')
ax.legend()

In [None]:
# Score all validation reviews with one batched predict call instead of one call
# per row. The model ends in Dense(1), so column 0 holds the sigmoid output.
val_scores = model.predict(X_val)[:, 0] if len(X_val) else np.array([], dtype="float32")
senti_val_num = list(val_scores)

# Predicted score next to the true label, then the mean score per label.
senti_val = pd.DataFrame(list(zip(senti_val_num, y_val)), columns =['Senti', 'Y'])
senti_val.groupby('Y').mean()

In [None]:
# Score densities on the validation set, one curve per true label, drawn on the
# same axes and labelled so the two curves can be told apart.
ax = senti_val.loc[senti_val['Y']==0,'Senti'].plot.density(label='Y = 0')
senti_val.loc[senti_val['Y']==1,'Senti'].plot.density(ax=ax, label='Y = 1')
ax.set_xlabel('Senti')
ax.legend()

In [None]:
# Plot accuracy per epoch: the training curve always, the validation curve only
# when fit() recorded one, so the legend never names a line that is not drawn.
plt.plot(history.history['accuracy'], label='Train')
if 'val_accuracy' in history.history:
    plt.plot(history.history['val_accuracy'], label='Test')
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

## Merging the sentiment back to original data

In [None]:
# Stack training and validation scores in that order. DataFrame.append no longer
# exists in pandas 2.x, so pd.concat is used. ignore_index renumbers the rows
# 0..n-1; otherwise the validation rows would reuse the training row labels.
senti = pd.concat([senti_train, senti_val], ignore_index=True)

In [None]:
senti.head()

In [None]:
# Attach each score to the application id of its review. The scores were
# produced in the row order of df_json_sample (training rows first, then
# validation rows), so both sides are renumbered 0..n-1 and joined by position
# rather than by whatever index labels they happen to carry.
senti_id = (
    df_json_sample.reset_index(drop=True)
    .join(senti.reset_index(drop=True))[['id','Senti']]
)

In [None]:
senti_id.dropna(inplace=True)

In [None]:
senti_id.head(1)

In [None]:
senti_id_avg = senti_id.groupby('id')['Senti'].agg('mean')

In [None]:
df_json_full.shape

In [None]:
df_final = df_json_full.merge(senti_id_avg, how='inner', on='id')

In [None]:
df_final.head(1)

In [None]:
df_final.groupby('立项')['Senti'].agg('mean')

In [None]:
# Density of the per-application mean score, one curve per funding outcome,
# drawn on the same axes and labelled so the two curves can be told apart.
ax = df_final.loc[df_final['立项']==0,'Senti'].plot.density(label='立项 = 0')
df_final.loc[df_final['立项']==1,'Senti'].plot.density(ax=ax, label='立项 = 1')
ax.set_xlabel('Senti')
ax.legend()

In [None]:
# Applications whose mean score lies in the closed band [0.2, 0.6], split by
# funding outcome (立项 == 0: not funded, 立项 == 1: funded).
in_band = df_final['Senti'].between(0.2, 0.6)
rd_negative = (df_final['立项']==0) & in_band
rd_positive = (df_final['立项']==1) & in_band
df_rd_neg = df_final[rd_negative]
df_rd_positive = df_final[rd_positive]

In [None]:
df_rd_neg.shape

In [None]:
df_rd_neg['申请人']

In [None]:
df_rd_positive.shape