## Surestart Action Item Day 5
This notebook will train and test a basic neural network, using Keras, to determine whether a headline is sarcastic or not.  

In [12]:
#Necessary packages
import pandas as pd
import numpy as np5
import os
import json
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score as cvs
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import keras
import tensorflow as tf
from keras import models
from keras import layers
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError as mse
import sklearn.metrics
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import precision_score, recall_score

In [13]:
#Getting pathnames for each file in the input folder
for dirname, _, filenames in os.walk('/kaggle/input/news-headlines-dataset-for-sarcasm-detection'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#Function to parse data
def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

#Taking in the data in one of the json files (I'm using the slightly larger one)
data = list(parse_data("/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"))
data[0]

/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json
/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json


{'is_sarcastic': 1,
 'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
 'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'}

****We need to separate the data from this list into X and y****  
Our X is going to be a vectorized list of words from the headline.  
Our y is going to be "is_sarcastic".  
To do this, we're going to use some NLP packages  

In [28]:
#Creating our X variable
vectorizer = TfidfVectorizer(max_features=50, use_idf=False)
headlines = [i['headline'] for i in data]
X = vectorizer.fit_transform(headlines).toarray()

#Creating our y variable
y = np.ravel([i['is_sarcastic'] for i in data])

#Creating a train and test split
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2, random_state = 1693)

In [37]:
#Now we're going to build the model with its layers

#Initialize the model
model = Sequential()

#Add the input layer
model.add(Dense(24, activation = 'softmax', input_shape = (50,)))

#Add first hidden layer
model.add(Dense(12, activation = 'softmax'))

#Add second hidden layer
model.add(Dense(8, activation = 'softmax'))

#Tried adding third layer, didn't change much
model.add(Dense(4, activation = 'softmax'))

#Add output layer
model.add(Dense(1, activation='sigmoid'))

In [38]:
#Now we're going to compile the model
#Our loss function is binary crossentropy
#Our optimizer is adam

model.compile(loss = 'binary_crossentropy', 
              optimizer = 'adam',
              metrics = ['accuracy', 'mse'])

#We're going to also fit the model
#We're going to do 20 epochs
#The batch size will be 224 to get ~100 iterations per epoch
model.fit(X_train, y_train, epochs = 20,
          batch_size = 224, verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4ff271ff50>

In [39]:
#Next, we'll test the model on the test dataset we set aside

#Prediction on the X_test data, round each to an integer (either 0 or 1)
y_pred = np.around(model.predict(X_test))

#We're going to now look at the accuracy and loss
score = model.evaluate(X_test, y_test, verbose=1)
print(score)

#We'll print precision and recall too
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")

[0.566223680973053, 0.7071977853775024, 0.19168050587177277]
Precision: 0.6524353577871317
Recall: 0.8066914498141264


In [40]:
#Now we're going to make a confusion matri
#The rows are the known labels, the columns are the predicted labels
matrix = cm(y_test, y_pred)
df = pd.DataFrame(columns = ['', 'is_sarcastic', 'not_sarcastic'])
df.loc[len(df)] = ['is_sarcastic', matrix[0][0], matrix[0][1]]
df.loc[len(df)] = ['not_sarcastic', matrix[1][0], matrix[1][1]]
print(df)

                 is_sarcastic not_sarcastic
0   is_sarcastic         1878          1156
1  not_sarcastic          520          2170
