In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found beneath it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
# Read the competition CSVs: train.csv supplies the `text` and `target` columns used below,
# test.csv supplies the `text` to predict on and the `id` column written to the submissions.
pd_train = pd.read_csv('../input/nlp-getting-started/train.csv')
pd_test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Pull the raw texts and the labels out of the frames as plain arrays.
X_train = pd_train['text'].values
Y_train = pd_train['target'].values
X_test = pd_test['text'].values

In [None]:
import string
import nltk
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once.

    The list is read from NLTK on the first call and cached on the function
    object, so later calls cost a single attribute lookup.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(line, stopwords=None):
    """Lower-case a text, strip ASCII punctuation and drop stop words.

    Args:
        line: the text to clean.
        stopwords: optional collection of words to remove. When omitted, NLTK's
            English stop-word list is used (loaded once and cached).

    Returns:
        The remaining words joined by single spaces.

    Note:
        Punctuation is stripped before the stop-word filter runs, so a
        stop-word entry that itself contains punctuation can never match.
    """
    if stopwords is None:
        # Previously the list was re-read from NLTK for every single word.
        stopwords = _english_stopwords()
    # One translate pass removes every character in string.punctuation.
    line = line.lower().translate(str.maketrans('', '', string.punctuation))
    return ' '.join(word for word in line.split() if word not in stopwords)

In [None]:
import tqdm
# Run clean_text over every training and test text, with a progress bar for each pass.
X_train_clean = list(map(clean_text, tqdm.tqdm(X_train)))
X_test_clean = list(map(clean_text, tqdm.tqdm(X_test)))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts: learn the vocabulary from the training texts only,
# then encode the test texts with that same vocabulary.
vectorizer = CountVectorizer()
X_train_clean_cv = vectorizer.fit_transform(X_train_clean)
X_test_clean_cv = vectorizer.transform(X_test_clean)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the count-vector features for validation.
# The seed is fixed so the reported accuracy is reproducible across kernel restarts.
X_train_cv, X_val_cv, Y_train_cv, Y_val_cv = train_test_split(
    X_train_clean_cv, Y_train, test_size=0.2, random_state=42
)

In [None]:
# Display the training-split labels returned by train_test_split.
Y_train_cv

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression fitted on the count-vector training split.
clf = LogisticRegression(random_state=0).fit(X_train_cv, Y_train_cv)

In [None]:
# Predict labels for the held-out count-vector validation split.
Y_pred_cv = clf.predict(X_val_cv)

In [None]:
from sklearn.metrics import accuracy_score
# Validation accuracy of the count-vector logistic regression.
print(accuracy_score(Y_val_cv, Y_pred_cv))

In [None]:
# Predict the test set with the count-vector logistic regression and
# write the (id, target) pairs to ./cv.csv as a submission file.
Y_test_pred_cv = clf.predict(X_test_clean_cv)
id1 = pd_test['id'].values
df = pd.DataFrame({"id": id1, "target": Y_test_pred_cv})
df.to_csv("./cv.csv", index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Word-level TF-IDF over 1- to 3-grams; terms appearing in fewer than 3 documents are dropped.
# The on/off switches are passed as real booleans: newer scikit-learn releases validate
# parameter types and reject the integer 1 that was used here before.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)
# Fit on the training texts only, then encode the test texts with the learned vocabulary and IDF weights.
X_train_clean_tf = tfv.fit_transform(X_train_clean)
X_test_clean_tf = tfv.transform(X_test_clean)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the TF-IDF features for validation.
# The seed is fixed so the reported accuracy is reproducible across kernel restarts.
X_train_tf, X_val_tf, Y_train_tf, Y_val_tf = train_test_split(
    X_train_clean_tf, Y_train, test_size=0.2, random_state=42
)

In [None]:
# Same logistic-regression baseline, this time fitted on the TF-IDF training split.
clf_tf = LogisticRegression(random_state=0).fit(X_train_tf, Y_train_tf)

In [None]:
# Predict the TF-IDF validation split and print its accuracy.
Y_pred_tf = clf_tf.predict(X_val_tf)
print(accuracy_score(Y_val_tf, Y_pred_tf))

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes on both feature sets, scored on their respective validation splits.
# NOTE: Y_pred_cv / Y_pred_tf are rebound here and no longer hold the logistic-regression predictions.

# Count-vector features.
nb_cv = MultinomialNB()
nb_cv.fit(X_train_cv, Y_train_cv)

# TF-IDF features.
nb_tf = MultinomialNB()
nb_tf.fit(X_train_tf, Y_train_tf)

Y_pred_cv = nb_cv.predict(X_val_cv)
Y_pred_tf = nb_tf.predict(X_val_tf)

# Accuracy is printed for the count model first, then for the TF-IDF model.
for y_true, y_hat in ((Y_val_cv, Y_pred_cv), (Y_val_tf, Y_pred_tf)):
    print(accuracy_score(y_true, y_hat))

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index from the cleaned training texts, then turn both splits
# into lists of integer word indices using that same index.
t = Tokenizer()
t.fit_on_texts(X_train_clean)
X_train_token, X_test_token = (
    t.texts_to_sequences(texts) for texts in (X_train_clean, X_test_clean)
)

In [None]:
# Bring every sequence to a fixed length of 50 indices, padding at the front ('pre').
sent_length = 50
X_train_token_pad, X_test_token_pad = (
    pad_sequences(sequences, padding='pre', maxlen=sent_length)
    for sequences in (X_train_token, X_test_token)
)

In [None]:
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
# Baseline network: trainable 100-dimensional embedding -> LSTM(100) -> one sigmoid unit.
embedding_vector_features = 100
# One embedding row per tokenizer index plus one extra row
# (presumably for index 0, which the padding uses — confirm).
voc_size = len(t.word_index) + 1
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Re-pad from the raw token sequences so this cell does not consume its own output.
# The split below rebinds X_train_token_pad to the training portion only; re-running
# the cell on that smaller array would no longer line up with Y_train and would fail.
X_all_token_pad = pad_sequences(X_train_token, padding='pre', maxlen=sent_length)
X_train_token_pad, X_val_token_pad, Y_train_token_pad, Y_val_token_pad = train_test_split(
    X_all_token_pad, Y_train, test_size=0.33, random_state=42
)
# Train for 10 epochs, reporting loss/accuracy on the held-out 33% after each epoch.
model.fit(
    X_train_token_pad,
    Y_train_token_pad,
    validation_data=(X_val_token_pad, Y_val_token_pad),
    epochs=10,
    batch_size=64,
)

In [None]:
# Sequential.predict_classes was removed from recent Keras/TensorFlow releases.
# For a single sigmoid output it amounted to thresholding predict() at 0.5, which is done
# explicitly here and works on both old and new versions.
Y_pred_token = (model.predict(X_test_token_pad) > 0.5).astype('int32')

In [None]:
# Flatten the (n, 1) LSTM predictions to one label per row and write them
# next to the test ids as ./token.csv.
id1 = pd_test['id'].values
flat_predictions = Y_pred_token.reshape(-1)
df = pd.DataFrame({"id": id1, "target": flat_predictions})
df.to_csv("./token.csv", index=False)

In [None]:
def get_vocab(data):
    """Count how often each whitespace-separated token occurs across ``data``.

    Args:
        data: iterable of strings; each one is split on whitespace.

    Returns:
        dict mapping each token to its total number of occurrences.
    """
    vocab = {}
    for line in tqdm.tqdm(data):
        for word in line.split():
            # dict.get replaces the former bare `except:`, which would also have
            # swallowed unrelated errors such as KeyboardInterrupt.
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

In [None]:
# Token counts over the raw (uncleaned) training texts.
vocab = get_vocab(X_train)

In [None]:
import gensim.downloader as api
# Fetch the Google News word2vec vectors via gensim's downloader; with return_path=True
# the result is used as a file path (it is passed to load_word2vec_format in the next cell).
path = api.load("word2vec-google-news-300", return_path=True)

In [None]:
from gensim.models import KeyedVectors
# Load the downloaded vectors from their binary word2vec file.
model_google = KeyedVectors.load_word2vec_format(path, binary = True);

In [None]:
import operator 

def check_coverage(voc, model):
    """Report how much of a vocabulary has an entry in ``model``.

    Prints the share of distinct words that were found and the share of all
    word occurrences they account for.

    Args:
        voc: mapping of word -> occurrence count.
        model: object supporting ``model[word]`` that raises ``KeyError`` for
            words it does not know.

    Returns:
        List of ``(word, count)`` tuples for the words missing from ``model``,
        ordered by descending count.
    """
    not_present = {}
    found_words = 0
    found_occurrences = 0
    missing_occurrences = 0
    for word, count in voc.items():
        try:
            # Only the lookup outcome matters; the vector itself is not kept.
            model[word]
        except KeyError:
            not_present[word] = count
            missing_occurrences += count
        else:
            found_words += 1
            found_occurrences += count

    # Guard both ratios so an empty vocabulary (or all-zero counts) reports 0% instead of raising.
    total_occurrences = found_occurrences + missing_occurrences
    vocab_share = found_words / len(voc) if voc else 0.0
    text_share = found_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # Ascending sort followed by a reversal keeps the original ordering of ties.
    return sorted(not_present.items(), key=operator.itemgetter(1))[::-1]

In [None]:
# Coverage of the raw-text vocabulary by the word2vec vectors; keeps the uncovered
# words (most frequent first) for inspection.
not_present = check_coverage(vocab,model_google)

In [None]:
# Repeat the coverage check on the cleaned texts.
# NOTE: `vocab` and `not_present` are rebound here and now describe the cleaned corpus.
vocab = get_vocab(X_train_clean)
not_present = check_coverage(vocab, model_google)
# Show only the most frequent uncovered words; the full list is still available
# in `not_present` and would otherwise flood the cell output.
print(not_present[:50])

In [None]:
# Build the initial weights for a frozen embedding layer: row i holds the word2vec
# vector of the word with tokenizer index i, or stays all-zero when word2vec has no entry.
EMBEDDING_DIM = 300
word_index = t.word_index
# One row per tokenizer index plus one extra row
# (presumably for index 0, which the padding uses — confirm).
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    try:
        embedding_vector = model_google[word]
    except KeyError:
        # No pretrained vector for this word: leave its row at zero.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten,Dropout,LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.initializers import Constant
# LSTM classifier on top of frozen, pretrained word2vec embeddings.
# max_length has to equal the length the token sequences were padded to (sent_length = 50).
max_length = 50
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    trainable=False,  # keep the pretrained vectors fixed during training
)

model = Sequential([
    embedding_layer,
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

# Fit the model. The original called `model1.fit`, but no `model1` exists — the
# network built above is bound to `model`, which the prediction cell also uses.
model.fit(
    X_train_token_pad,
    Y_train_token_pad,
    batch_size=128,
    epochs=32,
    validation_data=(X_val_token_pad, Y_val_token_pad),
    verbose=2,
)

In [None]:
# Sequential.predict_classes was removed from recent Keras/TensorFlow releases.
# For a single sigmoid output it amounted to thresholding predict() at 0.5, which is done
# explicitly here and works on both old and new versions.
Y_pred_token_pre = (model.predict(X_test_token_pad) > 0.5).astype('int32')

In [None]:
# Flatten the (n, 1) predictions of the pretrained-embedding model to one label per row
# and write them next to the test ids as ./token_pre2.csv.
id1 = pd_test['id'].values
flat_predictions = Y_pred_token_pre.reshape(-1)
df = pd.DataFrame({"id": id1, "target": flat_predictions})
df.to_csv("./token_pre2.csv", index=False)