<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/Ngram_selection_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


## Load the data and get basic features

In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('./train.csv',engine='python')
test = pd.read_csv('./test.csv',engine='python')

In [4]:
X_train = train.body  # train texts
y_train = train.subreddit # train subreddits
X_test = test.body  # test texts

In [5]:
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [6]:
# transform target labels to values
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values) # convert category from string to numerical (!!!!! update the variables in kcross fold)

# vectorize word count
vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(X_train)
vectors_test = vectorizer.transform(X_test)

normalizer_train = Normalizer()
vectors_train= normalizer_train.transform(vectors_train)
vectors_test= normalizer_train.transform(vectors_test)

# print(vectorizer.get_feature_names())
print(vectors_train.shape)
print(vectors_test.shape)

(1999, 15365)
(1378, 15365)


In [7]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [8]:
# put it all together: remove stop words and punctuation, tfidf, lemmatization, normalization
stop_words = text.ENGLISH_STOP_WORDS

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

class New_LemmaTokenizer:
     def __init__(self):
       self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
       return [self.wnl.lemmatize(t,pos =get_wordnet_pos(t)) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer = New_LemmaTokenizer(), ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop_tfidf_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)

#print(vectorizer.get_feature_names())
print(vectors_train_stop_tfidf_Lemma.shape)
print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 70414)
(1378, 70414)


In [9]:
# remove stopwords and punctuation, tfidf, stemming, normalization
stop_words = text.ENGLISH_STOP_WORDS

class StemTokenizer:
     def __init__(self):
       self.wnl =PorterStemmer()
     def __call__(self, doc):
       return [self.wnl.stem(t) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer=StemTokenizer(),ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 73597)
(1378, 73597)


## Feature selection

In [10]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, mutual_info_classif, f_classif, SelectFpr, SelectFwe, SelectFdr, RFE, RFECV, SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

### 13.2.1 chi2

In [11]:
# choose one: SelectPercentile percent%, selectkbest abolute number
select = SelectPercentile(chi2, percentile=60)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 42248)


### 13.2.2 mutual info

In [None]:
select = SelectKBest(mutual_info_classif, k=6000)
vectors_train_Lemma_mutual = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_mutual = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_mutual.shape)

(1999, 6000)


### 13.2.3 F score

In [None]:
select = SelectPercentile(f_classif, percentile=60)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 42248)


In [None]:
select = SelectKBest(f_classif, k=6000)
vectors_train_Lemma_F = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_F = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_F.shape)

(1999, 6000)


In [None]:
select = SelectPercentile(f_classif, percentile=60)
vectors_train_stem_X2 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 44158)


In [None]:
select = SelectKBest(f_classif, k=5000)
vectors_train_stem_F = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_F = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_F.shape)

(1999, 5000)


### 13.2.4 FPR

In [None]:
select = SelectFpr()
vectors_train_Lemma_FPR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FPR = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FPR.shape

(1999, 2602)

In [None]:
select = SelectFpr()
vectors_train_stem_FPR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FPR = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FPR.shape

(1999, 2770)

### 13.2.5 FDR

In [None]:
select = SelectFdr()
vectors_train_Lemma_FDR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FDR = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FDR.shape

(1999, 1013)

In [None]:
select = SelectFdr()
vectors_train_stem_FDR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FDR = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FDR.shape

(1999, 1064)

### 13.2.6 FWE

In [None]:
select = SelectFwe()
vectors_train_Lemma_FWE = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FWE = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FWE.shape

(1999, 507)

In [None]:
select = SelectFwe()
vectors_train_stem_FWE = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FWE = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FWE.shape

(1999, 523)

### 13.3 Recursive feature elimination (runs super slow, maybe run this after setting max_features)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import KFold, cross_val_score

In [None]:
estimator = LinearSVC()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFESVC = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFESVC = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFESVC.shape)

NameError: ignored

In [None]:
estimator = LinearSVC()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFESVC = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFESVC = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFESVC.shape)

In [None]:
estimator = LogisticRegression()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFELR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFELR = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFELR.shape)

In [None]:
estimator = LogisticRegression()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFELR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFELR = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFELR.shape)

In [None]:
estimator = MultinomialNB()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFEMNB = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFEMNB.shape)

In [None]:
estimator = MultinomialNB()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFEMNB = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFEMNB.shape)

In [None]:
estimator = DecisionTreeClassifier()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFEDT = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFEDT = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFEDT.shape)

In [None]:
estimator = DecisionTreeClassifier()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFEDT = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFEDT = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFEDT.shape)

In [None]:
'''estimator = LinearSVC()
select = RFE(estimator, n_features_to_select=5000, step=0.1)
vectors_train_Lemma_RFE = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFE = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFE.shape)

In [None]:
'''estimator = LinearSVC()
select = RFE(estimator, n_features_to_select=5000, step=0.1)
vectors_train_stem_RFESVC = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFESVC = select.transform(vectors_test_stop_tfidf_stem)

### 13.4.1 selectfrommodel L1 norm

In [None]:
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator,max_features=5000)
vectors_train_Lemma_SFML1 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_SFML1 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_SFML1.shape)

(1999, 2091)




In [None]:
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator,max_features=5000)
vectors_train_stem_SFML1 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_SFML1 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_SFML1.shape)

(1999, 2064)




### 13.4.2 selectfrommodel tree

In [None]:
clf = ExtraTreesClassifier()
clf = clf.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
model = SelectFromModel(clf, prefit=True)
vectors_train_Lemma_SFMtree = model.transform(vectors_train_stop_tfidf_Lemma)
vectors_train_Lemma_SFMtree.shape

(1999, 9466)

In [None]:
clf = ExtraTreesClassifier()
clf = clf.fit(vectors_train_stop_tfidf_stem, y_train_num)
model = SelectFromModel(clf, prefit=True)
vectors_train_stem_SFMtree = model.transform(vectors_train_stop_tfidf_stem)
vectors_train_stem_SFMtree.shape

(1999, 9776)

## Experiment on sklearn models (with NGRAM)

### Find the best set of features
We have 16 sets in total


The best are 

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_Lemma_SFML1

In [None]:
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_X2, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_mutual, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_F, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_FPR, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_FPR, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_FDR, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_FDR, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_FWE, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_FWE, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_SFML1, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_SFMtree, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_SFMtree, y_train_num, cv=10)
print(scores.mean())

0.9459798994974875
0.9449748743718593
0.934464824120603
0.930462311557789
0.9469748743718593
0.9449773869346734
0.9384698492462311
0.9424723618090451
0.9214572864321608
0.927467336683417
0.8869422110552764
0.8869422110552764
0.9484748743718594
0.9464723618090451
0.938464824120603
0.9339623115577889


In [None]:
model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_X2, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_mutual, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_F, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_FPR, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_FPR, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_FDR, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_FDR, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_FWE, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_FWE, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_SFML1, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_SFMtree, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_SFMtree, y_train_num, cv=10)
print(scores.mean())

0.9364798994974877
0.9354798994974877
0.9289748743718593
0.9289723618090452
0.9374773869346734
0.9309748743718593
0.9329723618090451
0.927467336683417
0.9079522613065327
0.9109572864321608
0.8764422110552765
0.8764422110552765
0.9274698492462312
0.9264698492462312
0.9304723618090451
0.926469849246231


In [None]:
model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_X2, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_mutual, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_F, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_FPR, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_FPR, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_FDR, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_FDR, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_FWE, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_FWE, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_SFML1, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_SFMtree, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_SFMtree, y_train_num, cv=10)
print(scores.mean())

0.9179673366834171
0.9209648241206029
0.9164673366834171
0.9144648241206029
0.9329723618090451
0.928967336683417
0.9159698492462311
0.9214623115577888
0.8879497487437185
0.8814522613065326
0.839929648241206
0.839929648241206
0.926469849246231
0.9324673366834171
0.9229698492462312
0.9234648241206029


### Grid search
The best features:

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_Lemma_SFML1

#### vectors_train_Lemma_X2
Best models: LinearSVM, MultiNB, BernouliNB

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9459798994974875
C: 1


In [None]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9554849246231155
alpha: 1e-10


In [None]:
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9469773869346734
alpha: 1e-10


#### vectors_train_Lemma_F
Best model: MultiNB, LinearSVM, BernouliNB

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9469748743718593
C: 1


In [None]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9639874371859296
alpha: 1e-05


In [None]:
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9594824120603015
alpha: 1e-10


#### vectors_train_Lemma_SFML1
Best model: MultiNB, LinearSVM, BernouliNB

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_SFML1, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9594874371859297
C: 10


In [None]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_SFML1, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9439773869346734
alpha: 0.1


In [None]:
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_SFML1, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9519874371859297
alpha: 0.1


### Best model: LinearSVM on vectors_train_Lemma_SFML1

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
model = LinearSVC(C=10)
model.fit(vectors_train_Lemma_SFML1, y_train_num)
cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10)

array([0.94      , 0.97      , 0.96      , 0.955     , 0.945     ,
       0.96      , 0.985     , 0.955     , 0.95      , 0.97487437])

In [None]:
model = LinearSVC(C=10)
model.fit(vectors_train_Lemma_SFML1, y_train_num)
y_pred = model.predict(vectors_test_Lemma_SFML1)
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [None]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,anime
3,3,science
4,4,science


### Second Best model: MultiNB on vectors_train_Lemma_F

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
model = MultinomialNB(alpha=1e-5)
# model.fit(vectors_train_Lemma_F, y_train_num)
cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)

array([0.96      , 0.985     , 0.95      , 0.975     , 0.94      ,
       0.965     , 0.98      , 0.955     , 0.955     , 0.97487437])

In [None]:
model = MultinomialNB(alpha=1e-5)
model.fit(vectors_train_Lemma_F, y_train_num)
y_pred = model.predict(vectors_test_Lemma_F)
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [None]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,laptop
3,3,science
4,4,science


## Keras

In [13]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import utils

In [86]:
# x_train = vectors_train_Lemma_X2
# x_test = vectors_test_Lemma_X2

# x_train = vectors_train_Lemma_mutual
# x_test = vectors_test_Lemma_mutual

# x_train = vectors_train
# x_test = vectors_test

x_train = vectors_train_stop_tfidf_Lemma
x_test = vectors_test_stop_tfidf_Lemma

x_train = x_train.toarray()
x_test = x_test.toarray()

num_classes = len(np.unique(y_train_num))
y_train = utils.to_categorical(y_train_num, num_classes)

In [116]:
batch_size = 32
epochs = 4

In [117]:
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(x_train.shape[1],)))
model.add(Activation('relu'))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [118]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [119]:
y_test = model.predict(x_test, batch_size, verbose=1)
y_classes = [np.argmax(y, axis=None, out=None) for y in y_test]
y_pred = le.inverse_transform(y_classes)
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)



## Estimate test Accuracy

In [120]:
result_df = pd.read_csv('result.csv')
result_df = result_df['subreddit']
error = 0
for i in range(1, len(result_df)-1):
  if result_df[i]==result_df[i-1] or result_df[i]==result_df[i+1]:
    continue
  error = error + 1
accu = (len(result_df)-error) / len(result_df)
accu

0.9346879535558781