In [None]:
#@title MIT License
#
# Copyright (c) 2020 Balázs Pintér 
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Downloading the datasets

In [1]:
# run this only once and download "book"
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Training a simple neural network

In [5]:
# this is a code cell that contains Python code
# we usually start with the imports
# these are the imports we usually use for machine learning
import numpy as np
import scipy
import scipy.sparse as sps
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import tensorflow as tf
from nltk.corpus import movie_reviews

In [28]:
num_of_features = 5000

## Loading the dataset

Dataset URL: https://www.kaggle.com/neiljs/all-shark-tank-us-pitches-deals

In [20]:
df = pd.read_csv('Sharktankpitchesdeals.csv')
df.head()

Unnamed: 0,Season_Epi_code,Pitched_Business_Identifier,Pitched_Business_Desc,Deal_Status,Deal_Shark
0,826,Bridal Buddy,a functional slip worn under a wedding gown th...,1,KOL+LG
1,826,Laid Brand,hair-care products made with pheromones . Laid...,0,
2,826,Rocketbook,a notebook that can scan contents to cloud ser...,0,
3,826,Wine & Design,painting classes with wine served . Wine & Des...,1,KOL
4,824,Peoples Design,a mixing bowl with a built-in scoop . Peoples ...,1,LG


In [21]:
for pitch in df.loc[:3, 'Pitched_Business_Desc']:
    print(pitch)
    print('-----------------------')

a functional slip worn under a wedding gown that allows the wearer to use the restroom on their own . Bridal Buddy is a lightweight slip worn under the gown that lets brides go to the bathroom while wearing it. When nature calls, the bride can bag up her bustle to safely relieve herself without making a mess.
-----------------------
hair-care products made with pheromones . Laid brand is a Â pheromone-enriched hair care product thatÂ enhances color, in addition to protecting and hydrating hair.Â The pheromones help girls â€œexude confidenceâ€ wherever she goes.
-----------------------
a notebook that can scan contents to cloud services via an app and can be erased by being microwaved . Rocketbook is an intelligent reusable notebook that allows users to write with a traditional pen and notebook. The notebook is erasable and reusable by allowing users to send notes to the cloud. Simply use a smartphone to send writings to the cloud and a microwave oven to erase for future use.
---------

In [36]:
corpus = [pitch for pitch in df.loc[:, 'Pitched_Business_Desc']]
corpus[:1]

['a functional slip worn under a wedding gown that allows the wearer to use the restroom on their own . Bridal Buddy is a lightweight slip worn under the gown that lets brides go to the bathroom while wearing it. When nature calls, the bride can bag up her bustle to safely relieve herself without making a mess.']

In [23]:
targets = [deal for deal in df.loc[:, 'Deal_Status']]
targets[:5]

[1, 0, 0, 1, 1]

In [24]:
set(targets)

{0, 1}

### Bag of words representation

In [None]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=20)
bows = count_vectorizer.fit_transform(corpus)
pd.DataFrame(bows.toarray(), columns=count_vectorizer.get_feature_names()).head()

In [None]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=num_of_features)
bows = count_vectorizer.fit_transform(corpus)
print("We have {} pitches.".format(bows.shape[0]))

### Producing training and test data

In [None]:
# the problem: we have sparse arrays, but neural network need dense arrays!
# the solution will be word embeddings, here we just convert to dense arrays
bows = bows.toarray().astype(np.float32)
targets = np.array(targets, dtype=np.float32)

In [None]:
num_of_train = 600
X_train, y_train = bows[:num_of_train], targets[:num_of_train]
X_test, y_test = bows[num_of_train:], targets[num_of_train:]

In [None]:
X_train

In [None]:
X_train[0]

In [None]:
# the sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

### The weights of a single neuron are in a vector

![title](nn_vector.png)

In [None]:
w = np.array([1, 2, 3])
x = np.array([1, 2, 3])
w @ x

### The weights of a layer of neurons are in a matrix
![title](nn_matrix.png)

In [None]:
w = np.array([[1, 2, 3], [1, 1, 1], [2, 2, 2]])
x = np.array([1, 2, 3])
w @ x

In [None]:
w

#### bias

In [None]:
b = [3, 4, 5]
w @ x + b

In [None]:
w @ np.array([0, 0, 0])

In [None]:
w @ np.array([0, 0, 0]) + b

#### activation function

In [None]:
sigmoid(w @ x + b)

In [None]:
sigmoid(w @ sigmoid(w @ x + b) + b)

In [None]:
x = np.arange(-7, 7, 0.01)
fix, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.plot(x, sigmoid(x))

In [None]:
# the relu activation function
x = np.arange(-7, 7, 0.01)
fix, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.plot(x, [max(xe, 0) for xe in x])

### Optimization algorithm: some kind of gradient descent

![title](Gradient_descent.gif)

### Loss function: binary crossentropy

If $y_i$ are the true labels, and $\hat{y}_i$ are the predictions of the network:

$- \frac{1}{N} \sum_{i=1}^{N} y_i * log(\hat{y}_i) + (1-y_i)*log(1-\hat{y}_i)$

In [None]:
x = np.arange(0.001, 1.0, 0.001)
fix, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.plot(x, -np.log(x))

## Computational graph
![title](tensors_flowing.gif)

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(20, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(20, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid)
])

In [None]:
# we compile our neural network model
# we also have to choose an optimizer and a loss function
# for a binary classification task usually binary cross-entropy is fine
# we use accuracy as the metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# training or in other words, fitting the model to the data
model.fit(X_train, y_train, epochs=10)

In [None]:
# looks very good, but
# let's evaluate on the test set
model.evaluate(X_test, y_test)

### Overfitting
![title](Overfitting.png)

## Let's try another dataset!
Movie reviews - positive or negative

In [None]:
print(movie_reviews.raw('neg/cv000_29416.txt'))

In [30]:
corpus, targets = zip(*[(movie_reviews.raw(fileid), category)
                         for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)])

In [33]:
print(corpus[:1])
print(targets[:1])

('plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience memb

In [26]:
targets[:10]

('neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg')

In [27]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=num_of_features)
bows = count_vectorizer.fit_transform(corpus)
print("We have {} documents.".format(bows.shape[0]))

NameError: name 'num_of_features' is not defined

In [None]:
set(targets)

In [None]:
# convert targets to numbers
targets = np.array([0 if target == 'neg' else 1 for target in targets])
targets[:30]

In [None]:
# we need to shuffle
perm = np.random.permutation(len(targets))
bows = bows[perm]
targets = targets[perm].astype(np.float32)

In [None]:
bows = bows.toarray().astype(np.float32)

In [None]:
num_of_train = 1800
X_train, y_train = bows[:num_of_train], targets[:num_of_train]
X_test, y_test = bows[num_of_train:], targets[num_of_train:]

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(20, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid)
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_split=0.1)

In [None]:
model.evaluate(X_test, y_test)

NameError: name 'category' is not defined

In [29]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [46]:
corpus = newsgroups_train.data
corpus_test = newsgroups_test.data

In [47]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=num_of_features)
bows = count_vectorizer.fit_transform(corpus)
print("We have {} documents.".format(bows.shape[0]))

We have 11314 documents.


In [48]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=num_of_features)
bows_test = count_vectorizer.fit_transform(corpus_test)
print("We have {} documents.".format(bows.shape[0]))

We have 11314 documents.


In [49]:
bows_train = bows.toarray().astype(np.float32)
targets_train = np.array(newsgroups_train.target, dtype=np.float32)

bows_test = bows_test.toarray().astype(np.float32)
targets_test = np.array(newsgroups_test.target, dtype=np.float32)

In [50]:
X_train, y_train = bows_train, targets_train
X_test, y_test = bows_test, targets_test

In [51]:
X_train[:1]

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [52]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(20, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(20, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(1, activation=tf.keras.activations.softmax)
])

In [53]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10)
model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.0, 0.051646310836076736]