In [1]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, BatchNormalization, Activation, Input, Add, Concatenate,\
                         Bidirectional, SimpleRNN, LSTM, GRU
from keras_layer_normalization import LayerNormalization
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Activation, Input, \
    Conv1D, MaxPool1D, Flatten, Concatenate, Add, MaxPooling1D,LSTM
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = set(stopwords.words("english"))
ps = PorterStemmer()

Using TensorFlow backend.


In [2]:
def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, 
    
type|: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)

def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results
    
def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

In [3]:
df = pd.read_csv("Quoraquestions.csv")
#answer_count_list = non_anonymous_answer_count+anonymous_answer_count
labels = []

for i in range(df.shape[0]):   
    if(df["anonymous_answer_count"][i] > 0):
        labels.append(1)
        
    else:
        labels.append(0)
Counter(labels)
#Construct the labels of questions: 1 means receiving anonymous answers, 0 means not.
xtrain = df["question_title_list"]
x_train = np.array(xtrain)
y_train = np.array(labels)
#x_train contains question texts, y_train contains labels

In [5]:
Counter(labels)

Counter({0: 10028, 1: 1099})

In [6]:
#Preprocessing the question texts

train_texts = x_train
#print(train_texts[0])
train_tokens = [tokenize(text) for text in train_texts]
#print(train_tokens[0])
tokens = []
for text in train_tokens:
    text2 = [i.lower() for i in text]
    text1 = []
    for i in text2:
        if(i != "?"):
            text1.append(i)
    tokens.append(text1)
train_tokens = tokens
#print(train_tokens[0])
filtered_sentence = [filter_stopwords(text) for text in train_tokens]
#print(filtered_sentence[0])
#print(len(train_tokens))
train_stemmed = [stem(tokens) for tokens in filtered_sentence]
#print(train_stemmed[0])
#print(len(train_stemmed))
train_2_gram = [n_gram(tokens, 2) for tokens in train_stemmed]
#print(train_2_gram[0])
train_3_gram = [n_gram(tokens, 3) for tokens in train_stemmed]
#print(train_3_gram[0])
train_feats = list()
for i in range(len(train_texts)):
    train_feats.append(
        train_stemmed[i])


In [7]:
def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    :param data: a list of features, type: list(list)
    :param min_freq: the lowest fequency that the fequency of a feature smaller than it will be filtered out
    :param max_freq: the highest fequency that the fequency of a feature larger than it will be filtered out
    return a feature dict that maps features to indices, sorted by frequencies
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    # count all features
    feat_cnt = Counter(feats) # ["text", "text", "mine"] --> {"text": 2, "mine": 1}
    if max_size > 0 and min_freq == -1 and max_freq == -1:
        valid_feats = [f for f, cnt in feat_cnt.most_common(max_size)]
    else:
        valid_feats = list()
        for f, cnt in feat_cnt.most_common():
            if (min_freq == -1 or cnt >= min_freq) and \
                (max_freq == -1 or cnt <= max_freq):
                valid_feats.append(f)
    if max_size > 0 and len(valid_feats) > max_size:
        valid_feats = valid_feats[:max_size]        
    print("Size of features:", len(valid_feats))
    
    # build a mapping from features to indices
    feats_dict = dict(zip(valid_feats, range(len(valid_feats))))
    return feats_dict

def get_idf_dict(feats_list):
    """
    :param feats_list: a list of lists of features, type: list(list)
    return an idf vector,
    """
    N = len(feats_list)
    df_dict = Counter()
    for feats in feats_list:
        df_dict.update(set(feats))
    # IDF: log(1 + N/n)
    idf_dict = {f: math.log2(1+N/cnt) for f, cnt in df_dict.items()}
    
    return idf_dict

def get_tfidf_vector(feats, feats_dict, idf_dict):
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    :param idf_dict: a dict from features to idf, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    # TF: 1 + log(f)
    tf_dict = {f: 1+math.log2(cnt) for f, cnt in Counter(feats).items()}
    for f in tf_dict:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            tf = tf_dict[f]
            idf = idf_dict[f]
            # set the corresponding element as tf*idf
            vector[f_idx] = tf*idf
    return vector

In [8]:
feats_dict = get_feats_dict(
    chain.from_iterable(train_feats))

idf_dict = get_idf_dict(train_feats)
train_tfidf_feats_matrix = np.vstack([
    get_tfidf_vector(f, feats_dict, idf_dict) for f in train_feats])

Size of features: 10703


In [10]:
from imblearn.over_sampling import SMOTE, ADASYN

#Using smote to solve the unbalanced data problems
X_resampled_smote, y_resampled_smote = SMOTE().fit_sample(train_tfidf_feats_matrix,y_train)
print(len(y_resampled_smote))
Counter(y_resampled_smote)
x_train = X_resampled_smote
y_train = y_resampled_smote
data_num= x_train.shape[0]
index = np.arange(data_num)  # 生成下标  
np.random.shuffle(index)
print(index)
x_train = x_train[index]
y_train = y_train[index]

20056
[11518 18115 19550 ... 16468 12261   299]


In [11]:
from sklearn.metrics import roc_auc_score
def computeAUC(y_true,y_score):
    auc = roc_auc_score(y_true,y_score)
    print("auc = ",auc)
    return auc

In [12]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_train,y_train, random_state=0)




In [13]:
#Using logistic regression to evaluate the classification model.
from sklearn.linear_model.logistic import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
predicted_probs_train = lr.predict_proba(x_train)
predicted_probs_train = [x[1] for  x in predicted_probs_train]
computeAUC(y_train, predicted_probs_train)

predicted_probs_test_new = lr.predict_proba(x_test)
predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
computeAUC(y_test, predicted_probs_test_new)


auc =  0.9972859963457588
auc =  0.9253589548361739


0.9253589548361739

In [14]:
#Using the RandomForest Model to evaluate the classification model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,
                                oob_score= True,
                                min_samples_split=2,
                                min_samples_leaf=50,
                                n_jobs=-1,
                                class_weight='balanced_subsample',
                                bootstrap=True)

rf.fit(x_train, y_train)

predicted_probs_train = rf.predict_proba(x_train)
predicted_probs_train = [x[1] for x in predicted_probs_train]
computeAUC(y_train, predicted_probs_train)
#使用训练的模型来预测test_new数据（validataion data）
predicted_probs_test_new = rf.predict_proba(x_test)
predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
computeAUC(y_test, predicted_probs_test_new)

  from numpy.core.umath_tests import inner1d


auc =  0.9356448463312719
auc =  0.9377458901292293


0.9377458901292293

In [15]:
def build_classifier(input_size, output_size, learning_rate=0.1,
                     l2_reg=0.0,
                     loss="binary_crossentropy",
                     optimizer="SGD",
                     metric="accuracy"):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param learning_rate: the learning rate for the optimizer
    :param l2_reg: the weight for the L2 regularizer
    :param loss: the training loss
    :param optimizer: the optimizer
    :param metric: the metric
    return a 1-layer perceptron,
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    model = Sequential()
    
    # the projection layer
    model.add(Dense(output_size,
                    activation="softmax",
                    input_dim=input_size,
                    kernel_initializer=keras.initializers.he_normal(seed=0),
                    bias_initializer='zeros',
                    kernel_regularizer=keras.regularizers.l2(l2_reg)))
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

In [16]:
num_classes = max(y_train)
# convert each label to a one-hot vector, and then stack vectors as a matrix
train_label_matrix = keras.utils.to_categorical(y_train-1, num_classes=num_classes)
test_label_matrix = keras.utils.to_categorical(y_test-1, num_classes=num_classes)
model = build_classifier(len(feats_dict), num_classes,loss="binary_crossentropy")

# train the model
np.random.seed(0)
tf.random.set_seed(0)
history = model.fit(x_train, train_label_matrix,
    epochs=5, batch_size=100, verbose=0)

In [17]:
train_score = model.evaluate(x_train, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(x_test, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

training loss: 0.0 training accuracy 1.0
test loss: 0.0 test accuracy 1.0


In [None]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(x_train, y_train)

In [None]:
from sklearn import svm
clf = svm.SVC(kernel='linear') # Linear Kernel
#Train the model using the training sets
clf.fit(x_train, y_train)
print(clf.score(x_train, y_train)) # 精度
#Predict the response for test dataset
y_pred = clf.predict(x_test)

In [None]:
from sklearn import svm
clf = svm.SVC(C=0.9, kernel='rbf', gamma=20, decision_function_shape='ovo')
clf.fit(x_train, y_train.ravel())
print(clf.score(x_train, y_train)) # 精度


In [None]:
y_hat = clf.predict(x_train)
show_accuracy(y_hat, y_train, '训练集')
print(clf.score(x_test, y_test))
y_hat = clf.predict(x_test)
show_accuracy(y_hat, y_test, '测试集')
from sklearn.metrics import classification_report

target_names = ['class 0', 'class 1']
print(classification_report(y_train, y_hat, target_names=target_names))

In [None]:
print(1)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
#XGBoost Model
model = XGBClassifier(learning_rate=0.1,
                      n_estimators=1000,         # 树的个数--1000棵树建立xgboost
                      max_depth=6,               # 树的深度
                      min_child_weight = 1,      # 叶子节点最小权重
                      gamma=0.,                  # 惩罚项中叶子结点个数前的参数
                      subsample=0.8,             # 随机选择80%样本建立决策树
                      colsample_btree=0.8,       # 随机选择80%特征建立决策树
                      objective='multi:softmax', # 指定损失函数
                      scale_pos_weight=1,        # 解决样本个数不平衡的问题
                      random_state=27            # 随机数
                      )

In [None]:
model.fit(x_train,
          y_train,
          eval_set = [(x_test,y_test)],
          eval_metric = "mlogloss",
          early_stopping_rounds = 10,
          verbose = True)

### make prediction for test data
y_pred = model.predict(x_test)
computeAUC(y_test,y_pred)

In [18]:
#DNN model
def build_Res_Net(input_size, output_size, num_layers, hidden_size,
              activation="relu",
              dropout_rate=0.0,
              batch_norm=False,
              layer_norm=False,
              l2_reg=0.0,
              loss="binary_crossentropy",
              optimizer="SGD",
              learning_rate=0.1,
              metric="accuracy"):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_layers: the number of layers, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param activation: the activation type, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param layer_norm: whether to enable layer normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: str
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a multi-layer network with residual connections,
    # activation
    # dropout document: https://keras.io/layers/core/#dropout
    # batch normalization document: https://keras.io/layers/normalization/
    # layer normalization: https://github.com/CyberZHG/keras-layer-normalization
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_size,))
    
    if num_layers == 1:
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=input_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros",
                  kernel_regularizer=keras.regularizers.l2(l2_reg))(x)
    else:
        h = x
        for i in range(num_layers-1):
            if i == 0:
                # fitst layer: input -> hidden
                new_h = Dense(hidden_size,
                          input_dim=input_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            else:
                new_h = Dense(hidden_size,
                          input_dim=hidden_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            # add layer_norm
            if layer_norm:
                new_h = LayerNormalization()(new_h)
            # add batch_norm
            if batch_norm:
                new_h = BatchNormalization()(new_h)
            # residual connection
            if i == 0:
                h = new_h
            else:
                h = Add()([h, new_h])
            # add activation
            h = Activation(activation)(h)
            # add dropout here (set seed as 0 in order to reproduce)
            if dropout_rate > 0.0:
                h = Dropout(dropout_rate, seed=0)(h)
        # last layer: hidden -> class
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=hidden_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros")(h)
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

In [20]:
num_classes = max(y_train)
model = build_Res_Net(input_size=len(feats_dict), output_size=num_classes,
                  num_layers=3, hidden_size=100, activation="relu",
                  l2_reg=0.005, dropout_rate=0.1)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_seed(0)

res_history = model.fit(x_train,train_label_matrix,
                        validation_split=0.1,
                        epochs=5, batch_size=100, verbose=0,
                        callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(x_train, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(x_test, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])


training loss: 1.5248525142669678 training accuracy 1.0
test loss: 1.5248525142669678 test accuracy 1.0
