In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [83]:
#imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [3]:
# Read the headlines dataset; lines=True parses the file as one JSON record per line
dataset_path = "../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"
data = pd.read_json(dataset_path, lines=True)

In [4]:
# Preview only the first few rows instead of rendering the whole DataFrame
data.head()

In [86]:
# Hyperparameters shared by the tokenizer, the padding step and the embedding layer

# Passed as `num_words` to the Tokenizer and as the input dimension of the Embedding layer
vocab_size = 10_000

# Every headline sequence is padded or truncated to this many tokens
max_length = 100

In [87]:
# Create the tokenizer: `num_words` is set from vocab_size and
# "<OOV>" is the placeholder token for out-of-vocabulary words
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
)

# Build the tokenizer's vocabulary from every headline in the dataset
tokenizer.fit_on_texts(data['headline'])

In [88]:
# Word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

# Report how many distinct words the tokenizer has indexed
print(f"length of word index is {len(word_index)}")

# Convert each headline into its list of integer token ids
sequences = tokenizer.texts_to_sequences(data['headline'])

# Bring every sequence to exactly `max_length` tokens:
# short ones are padded at the end, long ones are cut at the end
padded_seq = pad_sequences(
    sequences,
    maxlen=max_length,
    padding="post",
    truncating="post",
)



In [89]:
# Hold out 20% of the padded sequences (and their labels) for evaluation;
# random_state is fixed so the split is the same on every run
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(
    padded_seq,
    data['is_sarcastic'],
    test_size=0.2,
    random_state=4,
)

## Modeling using neural networks

In [90]:
# Assemble the network one layer at a time:
# embedding -> flatten -> 3-unit ReLU hidden layer -> single sigmoid output
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 16, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Display the layer-by-layer summary of the model
model.summary()

In [91]:
# Train for 10 epochs, evaluating on the held-out split after each one.
# `mod` keeps the object returned by fit(); its `.history` dict is what gets plotted.
mod = model.fit(
    X_train_nn,
    y_train_nn,
    epochs=10,
    validation_data=(X_test_nn, y_test_nn),
)

As we can see, we are getting a testing accuracy of 85.94%, which is a good starting point. However, the model is currently overfitting, because the training accuracy is nearly 1. Let's visualize this.

In [104]:
def plot_graphs(mod, string):
    """Plot a training metric and its validation counterpart against the epoch index.

    Args:
        mod: Object exposing a ``history`` mapping that contains the keys
            ``string`` and ``'val_' + string``.
        string: Name of the metric to plot (also used as the y-axis label).
    """
    train_key = string
    val_key = 'val_' + string

    # Draw the training curve first, then the validation curve, on the same axes
    for key in (train_key, val_key):
        plt.plot(mod.history[key])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([train_key, val_key])
    plt.show()


In [105]:
# One figure per tracked metric: accuracy first, then loss
for metric in ("accuracy", "loss"):
    plot_graphs(mod, metric)

### Visualizing the word embeddings in the TensorFlow Embedding Projector

In [62]:
# The embedding layer is the first layer of the model
embedding_layer = model.layers[0]

# Take the first entry of the layer's weight list
layer_weights = embedding_layer.get_weights()
embedding_weights = layer_weights[0]

# Sanity check: the expected shape is (vocab_size, embedding_dim)
print(embedding_weights.shape)

# Integer index -> word lookup provided by the tokenizer
reverse_word_index = tokenizer.index_word

In [66]:
import io

# Export the learned embeddings as two line-aligned TSV files:
#   vecs.tsv - one tab-separated embedding vector per line
#   meta.tsv - the word belonging to the vector on the same line
# The `with` block guarantees both files are flushed and closed even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:

    # Stop at whichever is smallest: vocab_size, the number of embedding rows,
    # or the size of the tokenizer vocabulary. Without this bound a corpus with
    # fewer than vocab_size distinct words raises KeyError on the word lookup.
    last_index = min(vocab_size, len(embedding_weights), len(reverse_word_index) + 1)

    # Start counting at `1` because `0` is just for the padding
    for word_num in range(1, last_index):

        # Word associated with the current index
        word_name = reverse_word_index[word_num]

        # Embedding vector associated with the current index
        word_embedding = embedding_weights[word_num]

        # One word per line in the metadata file
        out_m.write(word_name + "\n")

        # Matching vector, tab-separated, on the same line number of the vectors file
        out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

In [68]:
# Offer the exported TSV files for download when the Colab `files` helper is importable
try:
    from google.colab import files
except ImportError:
    # google.colab cannot be imported here, so there is nothing to download through it
    pass
else:
    for tsv_name in ('vecs.tsv', 'meta.tsv'):
        files.download(tsv_name)

## Modeling Using Naive Bayes Algorithm

In [22]:
# Turn each headline into a TF-IDF feature vector.
# The result is kept as the sparse matrix returned by fit_transform: calling
# .toarray() would materialise a dense (n_headlines x n_terms) array that is almost
# entirely zeros, and MultinomialNB, LogisticRegression and train_test_split all
# accept sparse input directly.
# NOTE(review): the vectorizer is fitted on all headlines before the split, so
# test-set vocabulary/IDF statistics leak into training - consider fitting on the
# training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['headline'])

# Hold out 20% of the rows for testing; random_state is fixed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, data['is_sarcastic'], test_size=0.2, random_state=4)

In [10]:
# Initialize the model
nb = MultinomialNB()

# Fit once on the training split; the same fitted model serves both predictions
# (the original refitted the identical model a second time for no benefit)
nb.fit(X_train, y_train)

# Predictions on the held-out test split
y_test_predict = nb.predict(X_test)

# Predictions on the training split (used to compare train vs. test accuracy)
y_train_predict = nb.predict(X_train)

In [11]:
# Report accuracy on both splits; the second line is the TRAINING accuracy
# (it was previously mislabelled as "Testing")
print(f'Testing accuracy using Naive Bayes: {accuracy_score(y_test,y_test_predict)}')

print(f'Training accuracy using Naive Bayes: {accuracy_score(y_train,y_train_predict)}')


## Modeling with Logistic regression



In [12]:
# Create the model
lr = LogisticRegression()

# Fit once on the training split; the same fitted model serves both predictions
# (the original refitted the identical model a second time for no benefit)
lr.fit(X_train, y_train)

# Predictions on the held-out test split
y_test_pred_lr = lr.predict(X_test)

# Predictions on the training split (used to compare train vs. test accuracy)
y_train_pred_lr = lr.predict(X_train)

In [14]:
# Report accuracy on both splits; the second line is the TRAINING accuracy
# (it was previously mislabelled as "Testing")
print(f'Testing accuracy using Logistic Regression: {accuracy_score(y_test,y_test_pred_lr)}')

print(f'Training accuracy using Logistic Regression: {accuracy_score(y_train,y_train_pred_lr)}')

## Conclusion


We got higher accuracy using the neural network than with Naive Bayes or Logistic Regression. However, the neural network model is overfitting, with a training accuracy of nearly 1, which is not desirable.