In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('datathon_train (1).csv')
df.head()

Unnamed: 0,URL,Label
0,https://www.peoplescollection.wales/discover/w...,0
1,http://yasli-sad.ru/css/chase/chaseall%20newin...,1
2,http://denizkent.net/wp-admin/js/login.alibaba...,1
3,http://www.marketbiz.net/mbz/wp-includes/js/jq...,1
4,http://guardiaoitau30horas.uniclassdispositivo...,1


In [4]:
df['URL'][0]

'https://www.peoplescollection.wales/discover/what/70/query/Merthyr'

In [3]:
df.isnull().sum()

URL      0
Label    0
dtype: int64

In [4]:
from nltk.tokenize import RegexpTokenizer  
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [5]:
tokenizer.tokenize(df.URL[0])

['https',
 'www',
 'peoplescollection',
 'wales',
 'discover',
 'what',
 'query',
 'Merthyr']

In [6]:
df['text_tokenized'] = df.URL.map(lambda t: tokenizer.tokenize(t))

In [7]:
df['text_sent'] = df['text_tokenized'].map(lambda l : ' '.join(l))

In [8]:
df.head()

Unnamed: 0,URL,Label,text_tokenized,text_sent
0,https://www.peoplescollection.wales/discover/w...,0,"[https, www, peoplescollection, wales, discove...",https www peoplescollection wales discover wha...
1,http://yasli-sad.ru/css/chase/chaseall%20newin...,1,"[http, yasli, sad, ru, css, chase, chaseall, n...",http yasli sad ru css chase chaseall newinfo a...
2,http://denizkent.net/wp-admin/js/login.alibaba...,1,"[http, denizkent, net, wp, admin, js, login, a...",http denizkent net wp admin js login alibaba com
3,http://www.marketbiz.net/mbz/wp-includes/js/jq...,1,"[http, www, marketbiz, net, mbz, wp, includes,...",http www marketbiz net mbz wp includes js jque...
4,http://guardiaoitau30horas.uniclassdispositivo...,1,"[http, guardiaoitau, horas, uniclassdispositiv...",http guardiaoitau horas uniclassdispositivos c...


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer().fit(df['text_sent'])

In [10]:
import pickle
filename = 'cv_transformer.pkl'
pickle.dump(cv,open(filename,'wb'))

In [11]:
vector=cv.transform(df['text_sent'])

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(vector,df['Label'],test_size=0.25,random_state=45)

In [13]:
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB().fit(X_train,y_train)

In [14]:
y_pred=nb.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.925639500297442
              precision    recall  f1-score   support

           0       0.97      0.92      0.95     18116
           1       0.83      0.93      0.88      7099

    accuracy                           0.93     25215
   macro avg       0.90      0.93      0.91     25215
weighted avg       0.93      0.93      0.93     25215



In [16]:
import pickle
filename = 'nb_model.pkl'
pickle.dump(nb,open(filename,'wb'))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [18]:
vector1=tfidf.fit_transform(df['text_sent'])

In [19]:
from sklearn.model_selection import train_test_split
X_train1,X_test1,y_train1,y_test1=train_test_split(vector1,df['Label'],test_size=0.25,random_state=45)

In [20]:
from sklearn.naive_bayes import MultinomialNB
nb1=MultinomialNB().fit(X_train1,y_train1)

In [21]:
y_pred1=nb1.predict(X_test1)

In [22]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test1,y_pred1))
print(classification_report(y_test1,y_pred1))

0.9235772357723577
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17191
           1       0.97      0.79      0.87      8024

    accuracy                           0.92     25215
   macro avg       0.94      0.89      0.91     25215
weighted avg       0.93      0.92      0.92     25215



In [23]:
voc_size=10000

In [24]:
from tensorflow.keras.preprocessing.text import one_hot
one_hot_reps=[one_hot(word,voc_size) for word in df['text_sent']]

In [25]:
one_hot_reps

[[6731, 5935, 211, 6148, 7011, 3483, 5873, 2735],
 [32, 9491, 9872, 2637, 5444, 6348, 3574, 1730, 3070, 4611, 455, 8258, 3074],
 [32, 3563, 2269, 2190, 5727, 9939, 5547, 4609, 5481],
 [32, 5935, 5682, 2269, 1689, 2190, 5487, 9939, 9597, 858, 486, 7504, 2107],
 [32, 9556, 2623, 3826, 5481, 4694, 2107],
 [32, 5214, 5481, 7567, 7567, 4673, 6370, 2107],
 [32, 5935, 607, 2548],
 [32, 5935, 5322, 5042, 2637],
 [6731, 1356, 3629, 5481],
 [6731, 5699, 8213, 5481, 7319, 5018, 8414],
 [32, 5935, 8066, 8574, 3299, 7218, 8897],
 [6731, 6339, 5481],
 [32, 1158, 5481, 6791, 8974, 5462, 4990],
 [32, 5935, 7857, 9445, 2473, 7060, 9495, 1783, 7509, 7924, 6172, 6621, 9730],
 [32, 8707, 6472, 874, 6172, 6172],
 [6731, 5935, 753, 5481, 5168],
 [32,
  4023,
  5770,
  3250,
  5654,
  5893,
  8521,
  1970,
  1878,
  1084,
  9701,
  5219,
  5448,
  8643,
  3113],
 [32,
  1558,
  3076,
  2190,
  5487,
  9939,
  8170,
  9157,
  4984,
  927,
  5208,
  6172,
  7243,
  9157,
  6172,
  4240,
  8258],
 [32,
  1167,


In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
sent_length=50
embedded_docs=pad_sequences(one_hot_reps,padding='pre',maxlen=sent_length)
embedded_docs

array([[   0,    0,    0, ..., 3483, 5873, 2735],
       [   0,    0,    0, ...,  455, 8258, 3074],
       [   0,    0,    0, ..., 5547, 4609, 5481],
       ...,
       [   0,    0,    0, ..., 4899,  549, 1322],
       [   0,    0,    0, ..., 7659, 8437, 1649],
       [   0,    0,    0, ..., 5248, 2906, 5481]])

In [28]:
x=np.array(embedded_docs)
y=np.array(df['Label'])

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

In [30]:
import warnings
warnings.filterwarnings('ignore')
embedded_feature_vector=300
nn=Sequential([
    Embedding(voc_size,embedded_feature_vector,input_length=sent_length),
    Dropout(0.5),
    LSTM(199),
    Dropout(0.4),
    Dense(399,activation='relu'),
    Dense(43,activation='relu'),
    Dense(1,activation='sigmoid')
    ])

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
nn.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])
nn.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=50,callbacks=EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=50))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

In [None]:
from tensorflow.keras.layers import Bidirectional
import warnings
warnings.filterwarnings('ignore')
embeding_feature_vector=40
nn1=Sequential([
    Embedding(voc_size,embeding_feature_vector,input_length=sent_length),
    Dropout(0.4),
    Bidirectional(LSTM(100)),
    Dropout(0.3),
    Dense(399,activation='relu'),
    Dense(43,activation='relu'),
    Dense(1,activation='sigmoid')
])

In [None]:
nn1.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])
nn1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=50)