In [180]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import svm, ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
reddit_data = pd.read_excel('cleaned_reddit_india.xlsx')

In [4]:
reddit_data.columns

Index(['comments', 'flair', 'id', 'title', 'title_text', 'url', 'category',
       'domain', 'down_count', 'is_orig', 'n_comm', 'total_awards', 'up_count',
       'upvote_rat', 'views', 'calc_down_count', 'mod_domain', 'mod_flair',
       'clean_title', 'clean_title_text', 'clean_comments'],
      dtype='object')

In [5]:
feature_cols = ('is_orig', 'n_comm', 'total_awards', 'up_count', 'upvote_rat', 'calc_down_count')

In [11]:
onehe = OneHotEncoder()
flair_oneh = pd.get_dummies(reddit_data.flair.values)

In [12]:
flair_oneh

Unnamed: 0,/r/all,AskIndia,Business/Finance,CAA-NRC,CAA-NRC-NPR,Coronavirus,Demonetization,Food,Non-Political,Official Sadness Thread,...,Politics [Megathread],Scheduled,Science/Technology,Sports,Totally real,Unverified,Zoke Tyme,[R]eddiquette,r/all,| Not specific to India |
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
384,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
mod_flair_vc = reddit_data.mod_flair.value_counts()

In [37]:
reddit_data['double_mod_flair'] = np.where(reddit_data['mod_flair'].isin(mod_flair_vc.index[mod_flair_vc>=10]), reddit_data['mod_flair'], 'other')

In [38]:
mod_flair_vc

Non-Political                109
Coronavirus                  108
Politics                      76
AskIndia                      31
[R]eddiquette                 10
Sports                         8
Photography                    8
Science/Technology             8
Policy/Economy                 7
r/all                          4
Business/Finance               3
Food                           3
other                          2
Scheduled                      2
CAA-NRC                        2
Unverified                     2
| Not specific to India |      1
Totally real                   1
Official Sadness Thread        1
Demonetization                 1
Zoke Tyme                      1
Name: mod_flair, dtype: int64

In [39]:
reddit_data.double_mod_flair.value_counts()

Non-Political    109
Coronavirus      108
Politics          76
other             54
AskIndia          31
[R]eddiquette     10
Name: double_mod_flair, dtype: int64

In [22]:
reddit_data.mod_flair.fillna('other', inplace=True)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(reddit_data[list(feature_cols)], reddit_data.double_mod_flair, test_size=0.2)

In [98]:
rf_clf = ensemble.RandomForestClassifier()
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [99]:
accuracy_score(y_train, rf_clf.predict(X_train))

0.9967741935483871

In [100]:
accuracy_score(y_test, rf_clf.predict(X_test))

0.47435897435897434

In [50]:
y_test.value_counts().index.sort_values()

Index(['AskIndia', 'Coronavirus', 'Non-Political', 'Politics', '[R]eddiquette',
       'other'],
      dtype='object')

In [64]:
confusion_matrix(y_test, rf_clf.predict(X_test))

array([[5, 0, 0, 0, 0, 1],
       [2, 7, 2, 3, 0, 2],
       [0, 2, 9, 2, 0, 2],
       [0, 3, 2, 6, 0, 0],
       [0, 1, 1, 2, 0, 0],
       [1, 1, 4, 0, 0, 1]], dtype=int64)

In [101]:
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [102]:
accuracy_score(y_train, svm_clf.predict(X_train))

0.45483870967741935

In [87]:
accuracy_score(y_test, svm_clf.predict(X_test))

0.3898305084745763

In [93]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [95]:
accuracy_score(y_train, xgb_clf.predict(X_train))

0.9969604863221885

In [96]:
accuracy_score(y_test, xgb_clf.predict(X_test))

0.4915254237288136

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [108]:
text_rf_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', ensemble.RandomForestClassifier())])

In [114]:
reddit_data.clean_comments.fillna('', inplace=True)

In [115]:
text_X_train, text_X_test, text_y_train, text_y_test = train_test_split(reddit_data.clean_comments, reddit_data.mod_flair, test_size=0.2)

In [117]:
text_rf_clf.fit(text_X_train, text_y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [119]:
accuracy_score(text_y_test, text_rf_clf.predict(text_X_test))

0.5

In [120]:
text_svm_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', svm.LinearSVC())])

In [121]:
text_svm_clf.fit(text_X_train, text_y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [122]:
accuracy_score(text_y_test, text_svm_clf.predict(text_X_test))

0.5256410256410257

In [211]:
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam

In [173]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(reddit_data.clean_comments)
text_X_train_seq = tokenizer.texts_to_sequences(text_X_train)
text_X_test_seq = tokenizer.texts_to_sequences(text_X_test)

In [184]:
encoder = LabelEncoder()
encoder.fit(text_y_train)
text_y_train_enc = encoder.transform(text_y_train)
text_y_test_enc = encoder.transform(text_y_test)

In [185]:
maxlen=60

tf_text_X_train = sequence.pad_sequences(text_X_train_seq, maxlen=maxlen)
tf_text_X_test = sequence.pad_sequences(text_X_test_seq, maxlen=maxlen)

In [175]:
max(text_X_train.apply(lambda x: len(x)))

6383

In [193]:
len(tokenizer.word_index)

8318

In [215]:
model = Sequential()
model.add(Embedding(8319, 128))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

In [216]:
adam = Adam(lr=0.001)
model.compile(loss='binary_crossentropy',
              optimizer=adam,
              metrics=['accuracy'],)

In [217]:
model.fit(tf_text_X_train, text_y_train_enc,
          batch_size=32,
          epochs=15,
          validation_data=(tf_text_X_test, text_y_test_enc),)

Train on 310 samples, validate on 78 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x23c4e5dce80>