In [0]:
!pip install kaggle --u


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

ambiguous option: --u (--upgrade, --upgrade-strategy, --use-pep517, --user?)


## Mount Google Drive to Google's Linux VM (Colab)

In [0]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
#Check whether Google Drive is connected

with open('/gdrive/My Drive/test.txt', 'w') as f:
  f.write('Hello Google Drive!')
!cat '/gdrive/My Drive/test.txt'

Hello Google Drive!

In [0]:
#Connecting to Kaggle API and showing all available dataset

!pip install -U -q kaggle
!mkdir -p ~/.kaggle

!cp "/gdrive/My Drive/Deep Learning Workshop/kaggle.json" ~/.kaggle/

In [0]:
!kaggle datasets list

ref                                                          title                                                size  lastUpdated          downloadCount  
-----------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  
ronitf/heart-disease-uci                                     Heart Disease UCI                                     3KB  2018-06-25 11:33:56           9170  
lava18/google-play-store-apps                                Google Play Store Apps                                2MB  2019-02-03 13:55:47          37627  
karangadiya/fifa19                                           FIFA 19 complete player dataset                       2MB  2018-12-21 03:52:59           9804  
vjchoudhary7/customer-segmentation-tutorial-in-python        Mall Customer Segmentation Data                       2KB  2018-08-11 07:23:02           2850  
russellyates88/suicide-rates-overview-1985-to-2016        

In [0]:
!kaggle competitions download -c quora-insincere-questions-classification

train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
embeddings.zip: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [0]:
!ls -al

!unzip embeddings.zip
!unzip train.csv.zip

In [0]:
!ls -al

total 6443288
drwxr-xr-x 1 root root       4096 Feb 24 01:09 .
drwxr-xr-x 1 root root       4096 Feb 24 01:01 ..
drwxr-xr-x 4 root root       4096 Feb 15 17:21 .config
-rw-r--r-- 1 root root 6395920052 Feb 24 01:04 embeddings.zip
drwxrwxr-x 2 root root       4096 Oct 31 19:53 glove.840B.300d
drwxrwxr-x 2 root root       4096 Oct 31 20:04 GoogleNews-vectors-negative300
drwxr-xr-x 2 root root       4096 Aug 25  2015 paragram_300_sl999
drwxr-xr-x 1 root root       4096 Feb 15 17:21 sample_data
-rw-r--r-- 1 root root    4282631 Feb 24 01:04 sample_submission.csv.zip
-rw-r--r-- 1 root root   16426497 Feb 24 01:04 test.csv.zip
---------- 1 root root  124206772 Oct 30 16:56 train.csv
-rw-r--r-- 1 root root   57047694 Feb 24 01:01 train.csv.zip
drwxrwxr-x 2 root root       4096 Oct 31 19:58 wiki-news-300d-1M


In [0]:
import pandas as pd

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split

In [0]:
quora_df = pd.read_csv('train.csv')

X = quora_df.question_text
y = quora_df.target

In [0]:
#Split the dataset into train and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

In [0]:
max_words = 100000
max_len = 150
embedding_dims = 50

#Convrt the text into tokens

tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

In [0]:
#Convert the test tokens to sequence + pad the sequence

sequences = tok.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences,maxlen=max_len)

In [0]:
#Defining the neural network architecture

Inp = Input(name='inputs',shape=[max_len])
x = Embedding(max_words,embedding_dims,input_length=max_len)(Inp)
x = LSTM(64,name='LSTM_01')(x)
x = Dropout(0.5,name='Dropout_LSTM')(x)
x = Dense(256,activation='relu',name='Dense_01')(x)
x = Dropout(0.5,name='Dropout_Dense')(x)
out = Dense(1,activation='sigmoid', name='output')(x)

#On a multi-class classification problem, we should be using a sigmoid activation instead of softmax.

In [0]:
#Compiling the model

model = Model(inputs=Inp,outputs=out)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [0]:
#Print the model summary

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 150, 50)           5000000   
_________________________________________________________________
LSTM_01 (LSTM)               (None, 64)                29440     
_________________________________________________________________
Dropout_LSTM (Dropout)       (None, 64)                0         
_________________________________________________________________
Dense_01 (Dense)             (None, 256)               16640     
_________________________________________________________________
Dropout_Dense (Dropout)      (None, 256)               0         
_________________________________________________________________
output (Dense)               (None, 1)                 257       
Total para

In [0]:
#Define an early stopping mechanism: where validation loss is not improving

early_stop = EarlyStopping(monitor='val_loss',min_delta=0.0001)

In [0]:
#Training the single layer LSTM model 

model.fit(X_train,y_train,
          batch_size=2048,
          epochs=10,
          validation_split=0.2,
          callbacks=[early_stop])

Train on 731428 samples, validate on 182857 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x7f56b646efd0>

In [0]:
#Training the single layer LSTM model 

test_sequences = tok.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [0]:
#Evaluate the test set 

model.evaluate(X_test,y_test)



[0.11913513604582843, 0.95370525]

In [0]:
`