In [180]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import svm, ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
reddit_data = pd.read_excel('cleaned_reddit_india.xlsx')

In [5]:
feature_cols = ('is_orig', 'n_comm', 'total_awards', 'up_count', 'upvote_rat', 'calc_down_count')

In [31]:
mod_flair_vc = reddit_data.mod_flair.value_counts()

In [37]:
reddit_data['double_mod_flair'] = np.where(reddit_data['mod_flair'].isin(mod_flair_vc.index[mod_flair_vc>=10]), reddit_data['mod_flair'], 'other')

In [38]:
mod_flair_vc

Non-Political                109
Coronavirus                  108
Politics                      76
AskIndia                      31
[R]eddiquette                 10
Sports                         8
Photography                    8
Science/Technology             8
Policy/Economy                 7
r/all                          4
Business/Finance               3
Food                           3
other                          2
Scheduled                      2
CAA-NRC                        2
Unverified                     2
| Not specific to India |      1
Totally real                   1
Official Sadness Thread        1
Demonetization                 1
Zoke Tyme                      1
Name: mod_flair, dtype: int64

In [39]:
reddit_data.double_mod_flair.value_counts()

Non-Political    109
Coronavirus      108
Politics          76
other             54
AskIndia          31
[R]eddiquette     10
Name: double_mod_flair, dtype: int64

In [22]:
reddit_data.mod_flair.fillna('other', inplace=True)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(reddit_data[list(feature_cols)], reddit_data.double_mod_flair, test_size=0.2)

In [98]:
rf_clf = ensemble.RandomForestClassifier()
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [99]:
accuracy_score(y_train, rf_clf.predict(X_train))

0.9967741935483871

In [100]:
accuracy_score(y_test, rf_clf.predict(X_test))

0.47435897435897434

In [50]:
y_test.value_counts().index.sort_values()

Index(['AskIndia', 'Coronavirus', 'Non-Political', 'Politics', '[R]eddiquette',
       'other'],
      dtype='object')

In [64]:
confusion_matrix(y_test, rf_clf.predict(X_test))

array([[5, 0, 0, 0, 0, 1],
       [2, 7, 2, 3, 0, 2],
       [0, 2, 9, 2, 0, 2],
       [0, 3, 2, 6, 0, 0],
       [0, 1, 1, 2, 0, 0],
       [1, 1, 4, 0, 0, 1]], dtype=int64)

In [101]:
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [102]:
accuracy_score(y_train, svm_clf.predict(X_train))

0.45483870967741935

In [87]:
accuracy_score(y_test, svm_clf.predict(X_test))

0.3898305084745763

In [93]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [95]:
accuracy_score(y_train, xgb_clf.predict(X_train))

0.9969604863221885

In [96]:
accuracy_score(y_test, xgb_clf.predict(X_test))

0.4915254237288136

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [108]:
text_rf_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', ensemble.RandomForestClassifier())])

In [114]:
reddit_data.clean_comments.fillna('', inplace=True)

In [115]:
text_X_train, text_X_test, text_y_train, text_y_test = train_test_split(reddit_data.clean_comments, reddit_data.mod_flair, test_size=0.2)

In [117]:
text_rf_clf.fit(text_X_train, text_y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [119]:
accuracy_score(text_y_test, text_rf_clf.predict(text_X_test))

0.5

In [120]:
text_svm_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', svm.LinearSVC())])

In [452]:
text_X_train

999     police arrested jignesh patel bjp councillor f...
1326    hes complaining noise time hotel management ze...
4       wtf rape goes delhi reported rape cases lot ra...
888                             far 70 debunked fake news
1674    far ask modis birth certificate ask doesnt mat...
                              ...                        
772     watching tv series playing old smackdown vs ra...
465     songs good love particular subtitles toonn n k...
1242    ive heard joke comic pov actor pov past real i...
525     totally sucks im losing motivation day day wor...
968     shes attacked worknn police said masrat zahra ...
Name: clean_comments, Length: 1376, dtype: object

In [121]:
text_svm_clf.fit(text_X_train, text_y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [122]:
accuracy_score(text_y_test, text_svm_clf.predict(text_X_test))

0.5256410256410257

In [261]:
reddit_data_v2 = pd.read_excel('cleaned_reddit_india_v2.xlsx')

In [262]:
mod_flair_v2_vc = reddit_data_v2.mod_flair.value_counts()

In [269]:
reddit_data_v2['double_mod_flair'] = np.where(reddit_data_v2['mod_flair'].isin(mod_flair_v2_vc.index[mod_flair_v2_vc>=60]), reddit_data_v2['mod_flair'], 'Other')

In [270]:
mod_flair_v2_vc

Politics                         401
Coronavirus                      401
Non-Political                    401
AskIndia                         144
Policy/Economy                    70
Business/Finance                  55
Photography                       55
Sports                            36
[R]eddiquette                     34
Science/Technology                26
Food                              19
Unverified                        16
CAA-NRC                           14
Scheduled                         12
r/all                              5
Demonetization                     2
AMA                                2
Policy & Economy                   2
Lifehacks                          1
Policy/Economy -2017 Article       1
Zoke Tyme                          1
Goal Achieved!!!                   1
Original Comics                    1
| Not specific to India |          1
Totally real                       1
Misleading                         1
Politics Source in comments        1
E

In [271]:
reddit_data_v2.double_mod_flair.value_counts()

Non-Political     401
Politics          401
Coronavirus       401
Other             303
AskIndia          144
Policy/Economy     70
Name: double_mod_flair, dtype: int64

In [272]:
X_train, X_test, y_train, y_test = train_test_split(reddit_data_v2[list(feature_cols)], reddit_data_v2.double_mod_flair, test_size=0.2)

In [273]:
rf_clf = ensemble.RandomForestClassifier()
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [274]:
accuracy_score(y_train, rf_clf.predict(X_train))

0.9847383720930233

In [275]:
accuracy_score(y_test, rf_clf.predict(X_test))

0.4796511627906977

In [277]:
confusion_matrix(y_test, rf_clf.predict(X_test))

array([[ 9,  6,  2,  5,  0,  5],
       [ 4, 49, 12,  3,  3, 10],
       [ 4, 15, 37,  7,  1,  9],
       [ 4, 11, 21, 14,  0,  9],
       [ 1,  4,  5,  1,  1,  1],
       [ 1, 13, 10, 10,  2, 55]], dtype=int64)

In [278]:
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [279]:
accuracy_score(y_train, svm_clf.predict(X_train))

0.4258720930232558

In [280]:
accuracy_score(y_test, svm_clf.predict(X_test))

0.436046511627907

In [281]:
l_svm_clf = svm.LinearSVC()
l_svm_clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [282]:
accuracy_score(y_train, l_svm_clf.predict(X_train))

0.3008720930232558

In [283]:
accuracy_score(y_test, l_svm_clf.predict(X_test))

0.27325581395348836

In [284]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [285]:
accuracy_score(y_train, xgb_clf.predict(X_train))

0.9498546511627907

In [286]:
accuracy_score(y_test, xgb_clf.predict(X_test))

0.5087209302325582

In [287]:
text_rf_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', ensemble.RandomForestClassifier())])

In [288]:
reddit_data_v2.clean_comments.fillna('', inplace=True)

In [289]:
text_X_train, text_X_test, text_y_train, text_y_test = train_test_split(reddit_data_v2.clean_comments, reddit_data_v2.double_mod_flair, test_size=0.2)

In [385]:
text_X_train

999     police arrested jignesh patel bjp councillor f...
1326    hes complaining noise time hotel management ze...
4       wtf rape goes delhi reported rape cases lot ra...
888                             far 70 debunked fake news
1674    far ask modis birth certificate ask doesnt mat...
                              ...                        
772     watching tv series playing old smackdown vs ra...
465     songs good love particular subtitles toonn n k...
1242    ive heard joke comic pov actor pov past real i...
525     totally sucks im losing motivation day day wor...
968     shes attacked worknn police said masrat zahra ...
Name: clean_comments, Length: 1376, dtype: object

In [290]:
text_rf_clf.fit(text_X_train, text_y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [291]:
accuracy_score(text_y_test, text_rf_clf.predict(text_X_test))

0.4680232558139535

In [292]:
text_svm_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', svm.LinearSVC())])

In [293]:
text_svm_clf.fit(text_X_train, text_y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [294]:
accuracy_score(text_y_test, text_svm_clf.predict(text_X_test))

0.4941860465116279

In [455]:
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping

In [295]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(reddit_data_v2.clean_comments)
text_X_train_seq = tokenizer.texts_to_sequences(text_X_train)
text_X_test_seq = tokenizer.texts_to_sequences(text_X_test)

In [478]:
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [296]:
encoder = LabelEncoder()
encoder.fit(text_y_train)
text_y_train_enc = encoder.transform(text_y_train)
text_y_test_enc = encoder.transform(text_y_test)


In [481]:
np.save('model_classes.npy', encoder.classes_)

In [297]:
maxlen=60

tf_text_X_train = sequence.pad_sequences(text_X_train_seq, maxlen=maxlen)
tf_text_X_test = sequence.pad_sequences(text_X_test_seq, maxlen=maxlen)

In [299]:
len(tokenizer.word_index)

20818

In [459]:
model = Sequential()
model.add(Embedding(20819, 128))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(6, activation='sigmoid'))

In [460]:
adam = Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'],)
cb = EarlyStopping(monitor='val_loss', mode='min', patience=7, verbose=1)

In [461]:
from tensorflow.keras.utils import to_categorical

In [462]:
model.fit(tf_text_X_train, to_categorical(text_y_train_enc),
          batch_size=32,
          epochs=25,
          validation_data=(tf_text_X_test, to_categorical(text_y_test_enc)),)

Train on 1376 samples, validate on 344 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x23d01fc95c0>

In [219]:
import codecs, tqdm

In [220]:
f = codecs.open('wiki-news-300d-1M.vec', encoding='utf-8')

In [223]:
embeddings_index={}

In [224]:
for line in tqdm.tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print(f'found {len(embeddings_index)} word vectors')

999994it [02:45, 6055.22it/s]

found 999994 word vectors





In [317]:
word_index = tokenizer.word_index

In [318]:
words_not_found = []
nb_words = min(100000, len(word_index))
embedding_matrix = np.zeros((nb_words+1, 300))

In [319]:
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
        t_shape = embedding_vector.shape
    else:
        words_not_found.append(word)
        embedding_matrix[i] = np.zeros(t_shape)
        print(word, i, t_shape)



covid19 127 (300,)
nnthe 296 (300,)
covid 325 (300,)
bhakts 520 (300,)
rindia 658 (300,)
arnab 673 (300,)
nnthis 742 (300,)
kejriwal 802 (300,)
bhakt 857 (300,)
zomato 866 (300,)
nnif 956 (300,)
nnedit 1020 (300,)
mamata 1077 (300,)
nnwe 1102 (300,)
aadhar 1104 (300,)
oneplus 1241 (300,)
isro 1282 (300,)
jio 1293 (300,)
chaiwala 1301 (300,)
nnyou 1331 (300,)
nnni 1348 (300,)
wfh 1370 (300,)
nnalso 1414 (300,)
icmr 1448 (300,)
nnbut 1555 (300,)
nn1 1607 (300,)
modiji 1635 (300,)
nnthey 1655 (300,)
nnits 1664 (300,)
nnand 1689 (300,)
ambani 1694 (300,)
nnnow 1718 (300,)
aadhaar 1721 (300,)
subredditmessagecomposetorindia 1763 (300,)
sanghis 1767 (300,)
nnthere 1816 (300,)
bjps 1857 (300,)
anticaa 1890 (300,)
hotstar 1984 (300,)
nnnot 1987 (300,)
nnin 2038 (300,)
nnhow 2049 (300,)
tablighi 2063 (300,)
swiggy 2109 (300,)
nnso 2120 (300,)
nnjust 2127 (300,)
jnu 2210 (300,)
nnwhat 2252 (300,)
nnhe 2272 (300,)
sanghi 2291 (300,)
nnall 2309 (300,)
narendramodi 2321 (300,)
lodha 2323 (300,)
jum

9rnit 7045 (300,)
kamra 7062 (300,)
gaumutra 7071 (300,)
cowswami 7075 (300,)
ipdr 7086 (300,)
2g3g4g 7090 (300,)
gprs 7091 (300,)
becil 7094 (300,)
u0001f97a 7098 (300,)
fdroid 7114 (300,)
epass 7131 (300,)
tharoor 7139 (300,)
dwarka 7143 (300,)
waali 7151 (300,)
tunak 7152 (300,)
urbanclap 7161 (300,)
donennthe 7186 (300,)
rajdhani 7189 (300,)
itnnim 7191 (300,)
201011 7192 (300,)
azaan 7208 (300,)
tweetnn 7210 (300,)
nnnthis 7220 (300,)
nnplease 7226 (300,)
kitne 7228 (300,)
countrynnedit 7237 (300,)
ignou 7245 (300,)
nnhowever 7249 (300,)
byjus 7264 (300,)
raghuram 7267 (300,)
selfstyled 7277 (300,)
rehne 7281 (300,)
nnjournalism 7289 (300,)
nnnthey 7292 (300,)
todaynnps 7295 (300,)
viruss 7301 (300,)
nnlockdown 7323 (300,)
mayaram 7356 (300,)
mujhko 7362 (300,)
phirse 7364 (300,)
addressnn 7381 (300,)
saavn 7384 (300,)
chhod 7394 (300,)
nnafter 7395 (300,)
nnperhaps 7402 (300,)
otp 7415 (300,)
satyabhama 7422 (300,)
rindianstreetbets 7427 (300,)
akhlaq 7431 (300,)
nnany 7435 (300,

rathees 9009 (300,)
accountnns 9010 (300,)
causebut 9012 (300,)
believenas 9013 (300,)
publicnnno 9015 (300,)
lockand 9017 (300,)
majha 9020 (300,)
proshiv 9021 (300,)
okthis 9022 (300,)
effectivebut 9023 (300,)
pooldoes 9024 (300,)
hinduisms 9026 (300,)
rocksnnat 9028 (300,)
pddp 9029 (300,)
leechesnni 9033 (300,)
chuts 9035 (300,)
survivennits 9037 (300,)
itnnor 9045 (300,)
mkbhd 9047 (300,)
exynos 9048 (300,)
cameranncheck 9050 (300,)
statesnn 9051 (300,)
bingeablennive 9052 (300,)
goodnncurrently 9054 (300,)
didwhats 9066 (300,)
ntldr 9068 (300,)
nnc 9069 (300,)
nnexplanation 9071 (300,)
nntldr 9075 (300,)
2nnmarkaz 9076 (300,)
nnnps 9078 (300,)
nnimportant 9082 (300,)
yearnnhopefully 9087 (300,)
normalnnbut 9088 (300,)
liyo 9090 (300,)
hydroxhchloroquine 9092 (300,)
snnhopefully 9094 (300,)
nghanta 9098 (300,)
dgca 9099 (300,)
mattersnnmillions 9101 (300,)
screwednnif 9103 (300,)
flls 9104 (300,)
badnnin 9111 (300,)
screwednnin 9114 (300,)
screwednnso 9115 (300,)
malaysiannfor 911

dothraki 10647 (300,)
antinashnul 10648 (300,)
agla 10650 (300,)
jayenga 10652 (300,)
rmadlads 10655 (300,)
doode 10666 (300,)
nnp 10667 (300,)
meinnnkisi 10672 (300,)
hindostaan 10673 (300,)
namecaste 10674 (300,)
rightsnni 10678 (300,)
nnnin 10679 (300,)
kalpana 10684 (300,)
homelander 10691 (300,)
menngandhi 10693 (300,)
baalak 10694 (300,)
windownwhy 10695 (300,)
kasam 10696 (300,)
onesnnbut 10699 (300,)
monstersthe 10700 (300,)
kurzgesagtnnyou 10702 (300,)
lolnnedit 10704 (300,)
elopednncontext 10705 (300,)
sanskaar 10706 (300,)
gradewe 10707 (300,)
donewe 10709 (300,)
booksi 10710 (300,)
bulma 10712 (300,)
sketchbookid 10713 (300,)
inappropriateit 10715 (300,)
censornnaround 10716 (300,)
thoughwe 10717 (300,)
drawingsi 10718 (300,)
mountainsome 10720 (300,)
pennnthe 10721 (300,)
tumne 10723 (300,)
ladte 10724 (300,)
monitorfor 10730 (300,)
thoughi 10731 (300,)
ameerpet 10737 (300,)
venkataramana 10741 (300,)
ngovinda 10742 (300,)
neveryone 10743 (300,)
pudhu 10744 (300,)
ponnu 10

richernnx200bnninfosys 11617 (300,)
scientistsnnall 11618 (300,)
forwardsnn 11619 (300,)
coronovirus 11620 (300,)
functionnnthey 11624 (300,)
smarajit 11628 (300,)
nnnnmamata 11631 (300,)
saidnnnnbengal 11632 (300,)
73rs 11636 (300,)
nnsocial 11644 (300,)
nownnncue 11645 (300,)
nntruly 11652 (300,)
covid19nn 11655 (300,)
nnbajaj 11656 (300,)
saidnndumbass 11661 (300,)
rammifications 11662 (300,)
virusnnagree 11664 (300,)
nnclicked 11665 (300,)
worknnnhe 11670 (300,)
recentlynbill 11672 (300,)
taiwans 11673 (300,)
newsn 11675 (300,)
countriesnn 11676 (300,)
nnnnits 11677 (300,)
nadella 11678 (300,)
addednnit 11685 (300,)
bangalorennthey 11687 (300,)
nhad 11689 (300,)
npradhan 11691 (300,)
bikennour 11695 (300,)
courtsnnmainstream 11696 (300,)
justicennfurther 11697 (300,)
victimsnnthis 11698 (300,)
younnso 11699 (300,)
leastnnif 11702 (300,)
gadgetsnnx200bnnthey 11703 (300,)
differentlynnyes 11710 (300,)
bannon 11715 (300,)
debatennsticking 11716 (300,)
hostelwalo 11720 (300,)
bhookha 1

nnbesa 12439 (300,)
bothernnthese 12441 (300,)
covid19nni 12444 (300,)
farnnnot 12449 (300,)
nnseems 12450 (300,)
aate 12452 (300,)
bhayya 12453 (300,)
kitte 12454 (300,)
rupae 12455 (300,)
aaram 12457 (300,)
banata 12458 (300,)
fromnngoddamnit 12460 (300,)
3rdnnpersonally 12471 (300,)
31stnnpending 12472 (300,)
interviewsnnall 12474 (300,)
midmay 12475 (300,)
casesif 12477 (300,)
lockdownnndate 12478 (300,)
nnpublic 12479 (300,)
leastnnsome 12480 (300,)
maynnand 12481 (300,)
mayfor 12482 (300,)
postponednngoing 12483 (300,)
examsnnfor 12485 (300,)
assemblymovement 12487 (300,)
crimennnyour 12489 (300,)
fictionwriting 12494 (300,)
muslimsnnnow 12497 (300,)
manyndid 12500 (300,)
fadnavis 12501 (300,)
lagegi 12502 (300,)
ayenge 12503 (300,)
hhnnrahat 12507 (300,)
indori 12508 (300,)
1mgcom 12509 (300,)
healthmugcom 12510 (300,)
medlifecom 12511 (300,)
experiencenbut 12512 (300,)
began17th 12514 (300,)
dunzo 12515 (300,)
pharmeasy 12516 (300,)
23999 12518 (300,)
6gb 12521 (300,)
ramnnyou 

sakhyam 13220 (300,)
astu 13221 (300,)
caur 13225 (300,)
capal 13226 (300,)
varjy 13227 (300,)
etad 13228 (300,)
yaasya 13229 (300,)
svargya 13232 (300,)
atrunibarhaa 13234 (300,)
mahrhamlybharagarg 13235 (300,)
bhartram 13236 (300,)
rdhaya 13237 (300,)
puyagandhnn224nnvai 13238 (300,)
mrkaeydhibhir 13239 (300,)
viprai 13240 (300,)
pavai 13241 (300,)
mahtmabhi 13242 (300,)
kathbhir 13243 (300,)
anuklbhi 13244 (300,)
sahsitv 13245 (300,)
janrdana 13246 (300,)
savida 13247 (300,)
ktv 13248 (300,)
yathvan 13249 (300,)
madhusdana 13250 (300,)
ruruk 13251 (300,)
satym 13253 (300,)
hvaym 13254 (300,)
keava 13255 (300,)
satyabhm 13256 (300,)
svajitv 13258 (300,)
drupadtmajm 13259 (300,)
uvca 13260 (300,)
vacana 13261 (300,)
hdya 13262 (300,)
yath 13263 (300,)
bhvasamhitamnnnn 13264 (300,)
sidennjanamejaya 13267 (300,)
nvaishampayana 13268 (300,)
newsnnyou 13269 (300,)
torturennit 13270 (300,)
modia 13271 (300,)
world1944 13273 (300,)
undernourished208 13275 (300,)
underweight379 13276 (300,)


centernn 14044 (300,)
zokes 14045 (300,)
kumaraswamyi 14051 (300,)
ruleswhile 14052 (300,)
powernncompletely 14053 (300,)
nakama 14054 (300,)
yesnnfull 14056 (300,)
randiannpeace 14060 (300,)
examsnnin 14061 (300,)
covid20 14066 (300,)
auranachal 14067 (300,)
toofani 14068 (300,)
shouldersnn 14072 (300,)
feastnn 14074 (300,)
naivennit 14077 (300,)
cowbelt 14081 (300,)
addressesnnanother 14082 (300,)
goofup 14083 (300,)
xinping 14086 (300,)
washermans 14087 (300,)
spokennnbut 14094 (300,)
loliya 14097 (300,)
anomalynnits 14098 (300,)
systemnnand 14099 (300,)
spreadnnalso 14105 (300,)
delhivery 14106 (300,)
coolwinks 14110 (300,)
morningnn 14113 (300,)
nnlots 14116 (300,)
bhaktjan 14119 (300,)
pinrayi 14120 (300,)
modas 14121 (300,)
wifennmy 14124 (300,)
mominlaw 14125 (300,)
fatherinlaw 14126 (300,)
usnnme 14127 (300,)
betwe 14128 (300,)
cookingnnfather 14130 (300,)
jharu 14131 (300,)
pochanni 14132 (300,)
utensilnand 14134 (300,)
timesnnwife 14135 (300,)
ldrnall 14136 (300,)
constantly

globallynni 14919 (300,)
thatnnxa0 14920 (300,)
redplague 14922 (300,)
usjapan 14923 (300,)
maybennvenmo 14924 (300,)
transferwise 14926 (300,)
venmo 14928 (300,)
indiannalso 14929 (300,)
worknn 14930 (300,)
zahras 14931 (300,)
saidnnx200bnn 14933 (300,)
nwmi 14934 (300,)
nespecially 14940 (300,)
homennimagine 14941 (300,)
skillsnnfor 14945 (300,)
theren 14947 (300,)
cait 14952 (300,)
bornnnessentials 14953 (300,)
somethingnni 14955 (300,)
serviceslocal 14957 (300,)
chomu 14958 (300,)
failuredisappointment 14963 (300,)
csgo 14971 (300,)
xps 14972 (300,)
reasonsn1 14975 (300,)
freeeen2 14976 (300,)
pcn3 14977 (300,)
ratesn4 14978 (300,)
hypenni 14979 (300,)
duhnncod 14980 (300,)
defusal 14982 (300,)
csgonnmobile 14983 (300,)
divisionsnnnothing 14990 (300,)
wapasi 14992 (300,)
udhar 14994 (300,)
gandh 14995 (300,)
faila 14996 (300,)
nnswades 14998 (300,)
gaane 14999 (300,)
cartoonnnn 15001 (300,)
transmissionnndo 15009 (300,)
termitesnni 15010 (300,)
economyand 15019 (300,)
aaditya 15025

startupsnnwhy 15690 (300,)
donennif 15692 (300,)
theirsnnyou 15693 (300,)
vijayan 15696 (300,)
nprovide 15697 (300,)
answernn1 15700 (300,)
countrynn2 15701 (300,)
colloseum 15703 (300,)
stabilizennin 15705 (300,)
timennncoming 15706 (300,)
politicalideological 15711 (300,)
misleadingoutright 15712 (300,)
vahini 15716 (300,)
bajrang 15717 (300,)
wikipediawith 15725 (300,)
editsis 15726 (300,)
loweffort 15729 (300,)
bakwaas 15731 (300,)
professionallooking 15733 (300,)
24x7n3 15736 (300,)
nnfeb 15737 (300,)
revathi 15739 (300,)
krishnappa 15742 (300,)
bengaluru78 15743 (300,)
20209 15744 (300,)
ramanagara10 15747 (300,)
highprofit 15749 (300,)
throughoutnlast 15751 (300,)
settingslanguageschange 15755 (300,)
ponda 15757 (300,)
hernn 15758 (300,)
iift 15761 (300,)
sunitha 15763 (300,)
raveendra 15764 (300,)
saradhi 15765 (300,)
thatxa0concerns 15766 (300,)
chinaxa0may 15767 (300,)
misplacednnthey 15768 (300,)
200708 15769 (300,)
201516 15770 (300,)
industriessteam 15771 (300,)
sectornnco

nnlike 16880 (300,)
exhibitionnnwell 16885 (300,)
eesalacupnamde 16890 (300,)
bhencho 16892 (300,)
jeetegi 16893 (300,)
kidsscooter 16896 (300,)
piche 16897 (300,)
sbican 16902 (300,)
sarvar 16903 (300,)
xnstatement 16905 (300,)
mileganwrite 16906 (300,)
applicationnline 16907 (300,)
accountn2 16908 (300,)
problemn3 16909 (300,)
itn4 16910 (300,)
branchn5 16911 (300,)
branchn6 16912 (300,)
530am 16913 (300,)
pleasurenn 16914 (300,)
darrow 16916 (300,)
sengar 16917 (300,)
milkar 16920 (300,)
hamko 16921 (300,)
shariqpp 16924 (300,)
bhais 16927 (300,)
bch 16928 (300,)
ajla 16943 (300,)
kwayie 16947 (300,)
secondsnnyou 16949 (300,)
congratulationsnnproud 16951 (300,)
onein 16956 (300,)
khaate 16957 (300,)
goldnnshe 16958 (300,)
nationnnshe 16959 (300,)
lesbiannnshes 16960 (300,)
halfexpected 16963 (300,)
rsoccer 16964 (300,)
1982xa0guinness 16968 (300,)
homosexualsxa0u2063nu2063ndevi 16970 (300,)
acceptancenu2063non 16973 (300,)
deviu2063nn 16974 (300,)
camscanner 16976 (300,)
bbcnetflix 

yearnnfinal 18050 (300,)
43nnfinal 18051 (300,)
45nn1 18052 (300,)
49n2 18054 (300,)
22n3 18056 (300,)
69n4 18057 (300,)
87n5 18059 (300,)
83n6 18060 (300,)
69n7 18061 (300,)
70n8 18062 (300,)
86nnps 18064 (300,)
recessioncontraction 18065 (300,)
prettyn 18066 (300,)
dabana 18069 (300,)
npolitics 18070 (300,)
vishwash 18071 (300,)
wellnnjust 18072 (300,)
equinoxnnearths 18077 (300,)
planeor 18082 (300,)
shadowtheoretically 18089 (300,)
liveeveryone 18092 (300,)
rthewaywewere 18093 (300,)
rcolorizationrequests 18095 (300,)
prepartition 18096 (300,)
daenerys 18097 (300,)
lavni 18098 (300,)
paithani 18099 (300,)
dholsnnwho 18101 (300,)
bandnnwell 18102 (300,)
areannbut 18108 (300,)
chaturthi 18110 (300,)
rehears 18112 (300,)
pandols 18114 (300,)
rgameofthrones 18116 (300,)
rfreefolk 18117 (300,)
chickpet 18118 (300,)
usnn810 18123 (300,)
rtheydidntdothemath 18124 (300,)
nndriven 18128 (300,)
nnofficials 18133 (300,)
goodin 18135 (300,)
politicsbut 18136 (300,)
jehan 18138 (300,)
dehlavi 1

possiblenn 18990 (300,)
recordsnn 18993 (300,)
kidsnn 18996 (300,)
cylindersday 18997 (300,)
216nn 18998 (300,)
deliverynn 19000 (300,)
suraksha 19001 (300,)
refillnn 19005 (300,)
hindnn 19007 (300,)
juniorsenior 19008 (300,)
lifenn 19010 (300,)
yelledscreamed 19013 (300,)
situationnn 19016 (300,)
13082017nn 19017 (300,)
130817 19019 (300,)
sirnn 19020 (300,)
itselfnn 19021 (300,)
wordsnn 19023 (300,)
dussehra 19024 (300,)
samenn 19026 (300,)
familynn 19031 (300,)
vainnn 19032 (300,)
runningnn 19035 (300,)
wrongnn 19039 (300,)
prabhari 19040 (300,)
100bed 19041 (300,)
aeh 19043 (300,)
08082016 19044 (300,)
nrhm 19046 (300,)
purchasetenderordermaintenancesupplypayment 19049 (300,)
oxygenjumbo 19050 (300,)
oxygennn 19051 (300,)
dgme 19052 (300,)
duesnn 19054 (300,)
jailnn 19055 (300,)
againnn 19058 (300,)
waitingnn 19059 (300,)
servednn 19061 (300,)
friendnn 19062 (300,)
khann18042018 19063 (300,)
instigatorsi 19068 (300,)
innedit 19069 (300,)
hmingteichhangte 19070 (300,)
subthank 19072

itselfnnits 20034 (300,)
badnnmy 20037 (300,)
traditionalconservative 20040 (300,)
agonnmy 20043 (300,)
brokennnwe 20045 (300,)
naipaul 20046 (300,)
indiandesi 20052 (300,)
shredsnnit 20054 (300,)
baatnngg 20059 (300,)
nndrunk 20061 (300,)
doesntnneid 20068 (300,)
stopkillinganimals 20069 (300,)
todaynneid 20070 (300,)
sarkari 20072 (300,)
moholla 20073 (300,)
bribesnnone 20077 (300,)
samennthe 20084 (300,)
armyofficer 20086 (300,)
chattarpur 20088 (300,)
then0809 20091 (300,)
hardearned 20092 (300,)
bucksnnimo 20093 (300,)
promotionnnill 20096 (300,)
gallilevel 20099 (300,)
goonsnncan 20100 (300,)
1820k 20101 (300,)
78yrs 20102 (300,)
flagsnnthe 20104 (300,)
nishaan 20106 (300,)
nirvava 20109 (300,)
tadipaar 20113 (300,)
antihindu 20115 (300,)
basednrindia 20116 (300,)
betternim 20118 (300,)
notnhang 20119 (300,)
functiontomorrow 20121 (300,)
lucknthere 20122 (300,)
everyonennthe 20125 (300,)
fudgingkeeping 20126 (300,)
meaninglessnnis 20128 (300,)
stingwhistleblower 20129 (300,)
ncuz

In [465]:
model = Sequential()
model.add(Embedding(20819, 300, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.3))
model.add(Dense(6, activation='sigmoid'))

In [466]:
model.summary()

Model: "sequential_43"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_39 (Embedding)     (None, None, 300)         6245700   
_________________________________________________________________
bidirectional_20 (Bidirectio (None, 300)               541200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 6)                 1806      
Total params: 6,788,706
Trainable params: 543,006
Non-trainable params: 6,245,700
_________________________________________________________________


In [467]:
adam = Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'],)

In [468]:
model.fit(tf_text_X_train, to_categorical(text_y_train_enc),
          batch_size=32,
          epochs=25,
          validation_data=(tf_text_X_test, to_categorical(text_y_test_enc)),)

Train on 1376 samples, validate on 344 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x23d0ac978d0>

In [477]:
model.save('BLSTM_model.h5')

In [469]:
model2 = Sequential()
model2.add(Embedding(20819, 300, weights=[embedding_matrix], trainable=False))
model2.add(Conv1D(64, 5, activation='relu'))
model2.add(GlobalMaxPooling1D())
#model2.add(Dense(10, activation='relu'))
model2.add(Dense(6, activation='sigmoid'))

In [471]:
model2.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'],)

In [472]:
model2.fit(tf_text_X_train, to_categorical(text_y_train_enc),
          batch_size=32,
          epochs=15,
          validation_data=(tf_text_X_test, to_categorical(text_y_test_enc)),)

Train on 1376 samples, validate on 344 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x23d0aef8320>

In [476]:
model2.save('CNN_model.h5')

In [428]:
import tensorflow_hub as tf_hub

In [448]:
from tensorflow.compat import v1
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

In [435]:
embed = tf_hub.Module('https://tfhub.dev/google/elmo/2', trainable=True,
                               name="elmo_embed")


In [436]:
def ELMoEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

In [449]:
class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = tf_hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K. .tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

In [450]:
input_text = Input(shape=(1,), dtype="string")
embedding = ElmoEmbeddingLayer()(input_text)
dense = Dense(256, activation='relu')(embedding)
pred = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[input_text], outputs=pred)

AttributeError: module 'tensorflow_core.keras.backend' has no attribute 'tf'