# セットアップ

In [None]:
import os
import random as rn
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
import collections
from keras import backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics import confusion_matrix

# Mount Google Drive so files under /content/drive can be accessed
from google.colab import drive
drive.mount("/content/drive")

# Make training reproducible (avoid getting different results on every run)
def seed_everything(SEED=42):
    """Seed the Python, NumPy and TensorFlow RNGs and set determinism flags.

    Args:
        SEED: Seed value applied to every random number generator.
    """
    # Environment switches: hash seed plus the TensorFlow / cuDNN determinism flags
    os.environ.update({
        'PYTHONHASHSEED': str(SEED),
        'TF_DETERMINISTIC_OPS': 'true',
        'TF_CUDNN_DETERMINISTIC': 'true',
    })

    # Seed NumPy, the stdlib `random` module and TensorFlow, in that order
    np.random.seed(SEED)
    rn.seed(SEED)
    tf.random.set_seed(SEED)

    # Register a session limited to one intra-op and one inter-op thread
    single_thread_config = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    K.set_session(
        tf.compat.v1.Session(
            graph=tf.compat.v1.get_default_graph(),
            config=single_thread_config,
        )
    )


seed_everything(777)

# データの前処理

In [None]:
# Load the data (the CSV is read without a header row)
df = pd.read_csv(
    "./drive/MyDrive/Sentiment_Analysis/training_cleaned.csv",
    header=None,
)
df.head()

# Keep only the ground-truth label (column 0) and the tweet text (column 5)
corpus = df[[0, 5]]
corpus.info()

# Tweets are taken as-is; label 4 is remapped to 1, any other label is kept
sentences = list(corpus[5])
labels = [1 if raw_label == 4 else raw_label for raw_label in corpus[0]]

# Split into training, validation and test data:
# 10% is held out for testing, then 20% of the remainder for validation
x_train_full, x_test, y_train_full, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=3407,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_full,
    y_train_full,
    test_size=0.2,
    random_state=3407,
)

print(f"train_data_size : {len(x_train)}")
print(f"valid_data_size : {len(x_valid)}")
print(f"test_data_size : {len(x_test)}")

# Convert words to IDs and pad the sequences
oov_tok = "<OOV>"
padding_type = "post"
max_length = 16

# The vocabulary is fitted on the training split only
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
)

print(f"tokenの種類：{len(word_index)}")
print(f"パディング前のシーケンスの長さ：{len(train_sequences[0])}")
print(f"パディング後のシーケンスの長さ：{len(train_padded[0])}")

valid_sequences = tokenizer.texts_to_sequences(x_valid)
valid_padded = pad_sequences(
    valid_sequences,
    maxlen=max_length,
    padding=padding_type,
)

test_sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
)

# Convert the ground-truth labels to ndarrays
y_train, y_valid, y_test = (
    np.array(split_labels) for split_labels in (y_train, y_valid, y_test)
)

# ベースモデル構築、学習、評価

In [None]:
# Build the model
vocab_size = len(word_index) + 1  # one more than the number of entries in word_index
embedding_dim = 16

# Embedding -> Flatten -> two ReLU dense layers -> single sigmoid output
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation="relu"))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

# Train the model
num_epochs = 5
num_batchs = 100
history = model.fit(
    train_padded,
    y_train,
    epochs=num_epochs,
    batch_size=num_batchs,
    validation_data=(valid_padded, y_valid),
)

# Evaluate the model on the test data
model.evaluate(test_padded, y_test)

# 学習プロセス可視化（ベースモデル）

In [None]:
# Visualize the training process: pull the per-epoch metrics out of the history
acc, val_acc, loss, val_loss = (
    history.history[metric_key]
    for metric_key in ("accuracy", "val_accuracy", "loss", "val_loss")
)
epochs = range(len(acc))

# Titles, axis labels and legends for the accuracy and loss figures
acc_title_n, loss_title_n = (
    "Training and validation accuracy",
    "Training and validation loss",
)
acc_label_n, loss_label_n = "Accuracy", "Loss"
acc_legend_n = ["Accuracy", "Validation Accuracy"]
loss_legend_n = ["Loss", "Validation Loss"]

def performance_plot(train, valid, title_n, label_n, legend_n):
    """Plot a training curve (red) and a validation curve (blue) per epoch.

    The x-axis values are read from the module-level ``epochs`` variable.

    Args:
        train: Per-epoch values for the training data.
        valid: Per-epoch values for the validation data.
        title_n: Figure title.
        label_n: Y-axis label.
        legend_n: Legend entries, in the order (train, valid).
    """
    for series, line_format in ((train, "r"), (valid, "b")):
        plt.plot(epochs, series, line_format)

    plt.title(title_n)
    plt.xlabel("Epochs")
    plt.ylabel(label_n)
    plt.legend(legend_n)
    plt.show()

# Accuracy curves of the base model
performance_plot(acc, val_acc, acc_title_n, acc_label_n, acc_legend_n)
# Loss curves of the base model; use the loss legend so the curves are not
# labelled as "Accuracy" / "Validation Accuracy"
performance_plot(loss, val_loss, loss_title_n, loss_label_n, loss_legend_n)

# 過学習対策モデルの構築、学習、評価

In [None]:
# Build the model
vocab_size = len(word_index) + 1  # one more than the number of entries in word_index
embedding_dim = 16

# Same layout as the base model, with Dropout after the embedding and
# smaller dense layers (64 and 32 units)
model2 = tf.keras.models.Sequential()
model2.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model2.add(tf.keras.layers.Dropout(0.2))
model2.add(tf.keras.layers.Flatten())
model2.add(tf.keras.layers.Dense(64, activation="relu"))
model2.add(tf.keras.layers.Dense(32, activation="relu"))
model2.add(tf.keras.layers.Dense(1, activation="sigmoid"))
model2.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model2.summary()

# Train the model
num_epochs = 5
num_batchs = 100
history2 = model2.fit(
    train_padded,
    y_train,
    epochs=num_epochs,
    batch_size=num_batchs,
    validation_data=(valid_padded, y_valid),
)

# Evaluate the model on the test data
model2.evaluate(test_padded, y_test)

# 学習プロセス可視化（過学習対策モデル）

In [None]:
# Per-epoch metrics of the base model
acc, val_acc, loss, val_loss = (
    history.history[metric_key]
    for metric_key in ("accuracy", "val_accuracy", "loss", "val_loss")
)

# Per-epoch metrics of the dropout model
d_acc, d_val_acc, d_loss, d_val_loss = (
    history2.history[metric_key]
    for metric_key in ("accuracy", "val_accuracy", "loss", "val_loss")
)

# X-axis values for each training run
epochs = range(len(acc))
epochs2 = range(len(d_acc))

def acc_plot(epochs, epochs2, acc, val_acc, d_acc, d_val_acc):
    """Plot training/validation accuracy of two training runs on one figure.

    Args:
        epochs: X-axis values for the first run.
        epochs2: X-axis values for the second (dropout) run.
        acc: Training accuracy of the first run (red).
        val_acc: Validation accuracy of the first run (blue).
        d_acc: Training accuracy of the second run (magenta).
        d_val_acc: Validation accuracy of the second run (cyan).
    """
    curves = (
        (epochs, acc, "r"),
        (epochs, val_acc, "b"),
        (epochs2, d_acc, "m"),
        (epochs2, d_val_acc, "c"),
    )
    for x_values, y_values, line_format in curves:
        plt.plot(x_values, y_values, line_format)

    plt.title("Training and validation accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    # Legend order matches the plotting order above
    plt.legend(["Acc", "Valid_Acc", "Dropout_Acc", "Dropout_Val_Acc"])
    plt.show()

def loss_plot(epochs, epochs2, loss, val_loss, d_loss, d_val_loss):
    """Plot training/validation loss of two training runs on one figure.

    Args:
        epochs: X-axis values for the first run.
        epochs2: X-axis values for the second (dropout) run.
        loss: Training loss of the first run (red).
        val_loss: Validation loss of the first run (blue).
        d_loss: Training loss of the second run (magenta).
        d_val_loss: Validation loss of the second run (cyan).
    """
    curves = (
        (epochs, loss, "r"),
        (epochs, val_loss, "b"),
        (epochs2, d_loss, "m"),
        (epochs2, d_val_loss, "c"),
    )
    for x_values, y_values, line_format in curves:
        plt.plot(x_values, y_values, line_format)

    plt.title("Training and validation loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    # Legend order matches the plotting order above
    plt.legend(["Loss", "Validation Loss", "Dropout_Loss", "Dropout_Val_Loss"])
    plt.show()

# Compare the base model and the dropout model: accuracy first, then loss
acc_plot(
    epochs=epochs,
    epochs2=epochs2,
    acc=acc,
    val_acc=val_acc,
    d_acc=d_acc,
    d_val_acc=d_val_acc,
)
loss_plot(
    epochs=epochs,
    epochs2=epochs2,
    loss=loss,
    val_loss=val_loss,
    d_loss=d_loss,
    d_val_loss=d_val_loss,
)

# モデルの評価

In [None]:
# Confusion matrix
def pred_mat(test_padded, y_test, model):
    """Predict binary labels for the test inputs and build a confusion matrix.

    Args:
        test_padded: Inputs passed to ``model.predict``.
        y_test: Ground-truth labels, one per input.
        model: Object with a ``predict`` method returning one score per input
            (presumably a sigmoid probability -- confirm against the model).

    Returns:
        A ``(y_pred, c_matrix)`` tuple: the list of predicted labels (0 or 1)
        and the confusion matrix with rows/columns ordered as ``[0, 1]``.
    """
    predict_prob = model.predict(test_padded)
    # Scores below 0.5 are negative (0); everything else is positive (1).
    # Every score must map to a label -- skipping a score of exactly 0.5 would
    # make y_pred shorter than y_test and misalign predictions with labels.
    y_pred = [0 if score < 0.5 else 1 for score in np.ravel(predict_prob)]
    # Fix the label order of the confusion matrix
    c_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    return y_pred, c_matrix

# Confusion matrix as a DataFrame
def make_cm(matrix, columns):
    """Wrap a confusion matrix in a DataFrame with two-level labels.

    Args:
        matrix: Confusion-matrix values (rows: actual, columns: predicted).
        columns: Class names used for both the rows and the columns.

    Returns:
        A DataFrame whose index and columns each have two levels: a
        top-level heading and the class names.
    """
    size = len(columns)
    row_labels = [["正解データ"] * size, columns]
    col_labels = [["予測結果"] * size, columns]
    return pd.DataFrame(matrix, index=row_labels, columns=col_labels)

# Predictions and confusion matrix of the dropout model on the test data
y_pred2, c_matrix2 = pred_mat(
    test_padded=test_padded,
    y_test=y_test,
    model=model2,
)
cm = make_cm(matrix=c_matrix2, columns=["NEGATIVE", "POSITIVE"])
cm

# サンプリング

In [None]:
# False-negative texts: ground truth is 1 but the prediction is 0
samp_index = [
    position
    for position, (label, pred) in enumerate(zip(y_test, y_pred2))
    if label == 1 and pred == 0
]

# False-negative texts as a DataFrame
x_test_numpy = np.array(x_test)
x_test_samples = x_test_numpy[samp_index]
sample_df = pd.DataFrame(x_test_samples, columns=["sample_text"])
sample_df.head(20)

In [None]:
# False-positive texts: ground truth is 0 but the prediction is 1
samp_index2 = [
    position
    for position, (label, pred) in enumerate(zip(y_test, y_pred2))
    if label == 0 and pred == 1
]

# False-positive texts as a DataFrame
x_test_numpy2 = np.array(x_test)
x_test_samples2 = x_test_numpy2[samp_index2]
sample_df2 = pd.DataFrame(x_test_samples2, columns=["sample_text"])
sample_df2.head(20)

# エラーにおける頻出単語Top20

## 前処理

In [None]:
# Extract the words of the false-negative texts
# Fetch the NLTK "stopwords" resource before stopwords.words() is called below
nltk.download("stopwords")

def filter_stop_words(sentences, stop_words):
    """Remove stop words from every sentence.

    Each sentence is split on whitespace, words contained in ``stop_words``
    are dropped (exact, case-sensitive match) and the remaining words are
    joined back with single spaces.

    Args:
        sentences: Iterable of strings.
        stop_words: Container of words to drop; tested with ``in``.

    Returns:
        A new list with the filtered sentences, in input order. The input is
        not modified, so the caller's data (e.g. a NumPy string array) is
        left intact and immutable inputs such as tuples are accepted.
    """
    return [
        " ".join(word for word in sentence.split() if word not in stop_words)
        for sentence in sentences
    ]

# Remove stop words
stop_words = set(stopwords.words("english"))
sw_x_test = filter_stop_words(x_test_samples, stop_words)

# Stemming: merge all texts, split on whitespace and stem every word
snowball = SnowballStemmer(language="english")
sw_x_test_texts = list(sw_x_test)
x_test_words = " ".join(sw_x_test_texts).split()
clean_test_words = [snowball.stem(t) for t in x_test_words]

# Tokenization: tokenize each stemmed word and flatten into one list
nltk.download("punkt")
w_list = [
    token
    for stemmed in clean_test_words
    for token in nltk.word_tokenize(stemmed)
]

In [None]:
# Extract the words of the false-positive texts
def filter_stop_words(sentences, stop_words):
    """Remove stop words from every sentence.

    Each sentence is split on whitespace, words contained in ``stop_words``
    are dropped (exact, case-sensitive match) and the remaining words are
    joined back with single spaces.

    Args:
        sentences: Iterable of strings.
        stop_words: Container of words to drop; tested with ``in``.

    Returns:
        A new list with the filtered sentences, in input order. The input is
        not modified, so the caller's data (e.g. a NumPy string array) is
        left intact and immutable inputs such as tuples are accepted.
    """
    return [
        " ".join(word for word in sentence.split() if word not in stop_words)
        for sentence in sentences
    ]

# Remove stop words
stop_words2 = set(stopwords.words("english"))
sw_x_test2 = filter_stop_words(x_test_samples2, stop_words2)

# Stemming: merge all texts, split on whitespace and stem every word
snowball2 = SnowballStemmer(language="english")
sw_x_test_texts2 = list(sw_x_test2)
x_test_words2 = " ".join(sw_x_test_texts2).split()
clean_test_words2 = [snowball2.stem(t) for t in x_test_words2]

# Tokenization: tokenize each stemmed word and flatten into one list
w_list2 = [
    token
    for stemmed in clean_test_words2
    for token in nltk.word_tokenize(stemmed)
]

## 可視化

In [None]:
# Word frequencies of the false-negative (c) and false-positive (c2) texts
c = collections.Counter(w_list)
c2 = collections.Counter(w_list2)

# The 20 most frequent words of each group, most frequent first
top_words = [word for word, _ in c.most_common(20)]
top_words2 = [word for word, _ in c2.most_common(20)]

# Side-by-side count plots, one per error type
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 8))
sns.countplot(y=w_list, order=top_words, ax=ax1)
sns.countplot(y=w_list2, order=top_words2, ax=ax2)
ax1.set_title("Pred_Negative", fontsize=20)
ax2.set_title("Pred_Positive", fontsize=20)

# 感情スコア算出

In [None]:
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (upper bound for NEGATIVE, lower bound for POSITIVE) when neutral is enabled
SENTIMENT_THRESHOLDS = (0.4, 0.7)


def decode_sentiment(score, include_neutral=True):
    """Translate a prediction score into a sentiment label.

    Args:
        score: Prediction score to classify.
        include_neutral: When true, scores strictly between the two
            ``SENTIMENT_THRESHOLDS`` values are labelled NEUTRAL; when false,
            the score is split at 0.5 into NEGATIVE / POSITIVE only.

    Returns:
        One of the ``NEGATIVE``, ``NEUTRAL`` or ``POSITIVE`` label strings.
    """
    if not include_neutral:
        # Two-class mode: anything not below 0.5 is positive
        if score < 0.5:
            return NEGATIVE
        return POSITIVE

    # Three-class mode: both thresholds are inclusive
    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

def predict(text, model, include_neutral=True):
    """Score a single text with ``model`` and format the result.

    The text is converted with the module-level ``tokenizer`` and padded
    with the module-level ``max_length`` / ``padding_type`` settings.

    Args:
        text: Raw text to classify.
        model: Object with a ``predict`` method; assumed to return exactly
            one score for the single input (TODO confirm for other models).
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        A string of the form ``"label : ..., score : ..., text : ..."`` with
        the score formatted to two decimals.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_length,
        padding=padding_type,
    )
    # Extract the single score as a plain Python float: calling float() on a
    # multi-dimensional array is deprecated in recent NumPy releases, and
    # decode_sentiment should compare a scalar rather than an array.
    score = float(np.ravel(model.predict(padded))[0])
    label = decode_sentiment(score, include_neutral=include_neutral)
    return f"label : {label}, score : {score:.2f}, text : {text}"

# 検証

## 記号の有無

In [None]:
# With / without symbols (ground-truth label: positive)
for text in (
    "@amandadoan you'll get the job",
    "amandadoan you'll get the job",
    "you'll get the job",
):
    print(predict(text, model2))

## URLの有無

In [None]:
# (1) With / without URL (ground-truth label: positive)
for text in (
    "i always dance alone in my room http://tumblr.com/xyx1xdji5",
    "i always dance alone in my room",
):
    print(predict(text, model2))

# (2) With / without URL (ground-truth label: negative)
for text in (
    "Cherry tree update: But the first day of full bloom also brings the first falling blossom http://twitpic.com/3phky",
    "Cherry tree update: But the first day of full bloom also brings the first falling blossom",
):
    print(predict(text, model2))

## 'mの有無

In [None]:
# With / without 'm (ground-truth label: positive)
for text in (
    "Oh man oh man. I found my old CD's. I'm listening to Underoath and Saosin's old schtuff",
    "Oh man oh man. I found my old CD's. I listening to Underoath and Saosin's old schtuff",
):
    print(predict(text, model2))

## quotの有無

In [None]:
# With / without &quot; (ground-truth label: negative)
for text in (
    "I AM going to bed this time.. Apologies for the many &quot;colourful&quot; tweets, gona stick some Wilco or Grizzly Bear on to calm down..",
    "I AM going to bed this time.. Apologies for the many colourful tweets, gona stick some Wilco or Grizzly Bear on to calm down..",
):
    print(predict(text, model2))

## 数字の有無

In [None]:
# (1) With / without digits (ground-truth label: positive)
for text in (
    "will be moving on june 10.",
    "will be moving on june.",
):
    print(predict(text, model2))

# (2) With / without digits (ground-truth label: negative)
for text in (
    "I cannot wait til summer. 14 more days",
    "I cannot wait til summer. more days",
):
    print(predict(text, model2))

## 文脈を考慮した予測かどうか

In [None]:
# Context check (ground-truth label: positive)
for text in (
    "cannot wait for my date tonight... and this weekend. I so bad",
    "cannot wait for my date tonight... and this weekend. I so",
):
    print(predict(text, model2))

# モデルの保存と読み込み

In [None]:
#モデルの保存と読み込み
#model2.save("./model.h5")
#model2 = tf.keras.models.load_model("./drive/MyDrive/Sentiment_Analysis/model.h5")