<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/submit/feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature selection after N-grams
This notebook is for investigating the use of N-gram and various feature selection strategies. N-grams introduces a lot of redundant features, so feature selection is necessary in making the model efficient. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


## Load the data and get basic features

In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('./train.csv',engine='python')
test = pd.read_csv('./test.csv',engine='python')

In [4]:
X_train = train.body  # train texts
y_train = train.subreddit # train subreddits
X_test = test.body  # test texts

In [5]:
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [6]:
# transform target labels to values
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values) # convert category from string to numerical

# vectorize word count
vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(X_train)
vectors_test = vectorizer.transform(X_test)

normalizer_train = Normalizer()
vectors_train= normalizer_train.transform(vectors_train)
vectors_test= normalizer_train.transform(vectors_test)

# print(vectorizer.get_feature_names())
print(vectors_train.shape)
print(vectors_test.shape)

(1999, 15365)
(1378, 15365)


In [7]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [8]:
# remove stop words and punctuation, tfidf, lemmatization, normalization
stop_words = text.ENGLISH_STOP_WORDS

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

class New_LemmaTokenizer:
     def __init__(self):
       self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
       return [self.wnl.lemmatize(t,pos =get_wordnet_pos(t)) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer = New_LemmaTokenizer(), ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop_tfidf_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)

#print(vectorizer.get_feature_names())
print(vectors_train_stop_tfidf_Lemma.shape)
print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 70414)
(1378, 70414)


After adding bi-gram to unigram, we end up over 70,000 features.

In [9]:
# remove stopwords and punctuation, tfidf, stemming, normalization
stop_words = text.ENGLISH_STOP_WORDS

class StemTokenizer:
     def __init__(self):
       self.wnl =PorterStemmer()
     def __call__(self, doc):
       return [self.wnl.stem(t) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer=StemTokenizer(),ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 73597)
(1378, 73597)


## Feature selection

In [12]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, mutual_info_classif, f_classif, SelectFpr, SelectFwe, SelectFdr, RFE, RFECV, SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

### 13.2.1 chi2

In [13]:
# choose one: SelectPercentile percent%, selectkbest abolute number
select = SelectPercentile(chi2, percentile=60)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 42248)


In [14]:
select = SelectKBest(chi2, k=6000)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 6000)


In [15]:
select = SelectPercentile(chi2, percentile=60)
vectors_train_stem_X2 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 44158)


In [16]:
select = SelectKBest(chi2, k=5000)
vectors_train_stem_X2 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 5000)


### 13.2.2 mutual info

In [17]:
select = SelectPercentile(mutual_info_classif, percentile=60)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 42248)


In [18]:
select = SelectKBest(mutual_info_classif, k=6000)
vectors_train_Lemma_mutual = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_mutual = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_mutual.shape)

(1999, 6000)


In [19]:
select = SelectPercentile(mutual_info_classif, percentile=60)
vectors_train_stem_X2 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 44158)


In [20]:
select = SelectKBest(mutual_info_classif, k=6000)
vectors_train_stem_mutual = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_mutual = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_mutual.shape)

(1999, 6000)


### 13.2.3 F score

In [21]:
select = SelectPercentile(f_classif, percentile=60)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 42248)


In [22]:
select = SelectKBest(f_classif, k=6000)
vectors_train_Lemma_F = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_F = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_F.shape)

(1999, 6000)


In [23]:
select = SelectPercentile(f_classif, percentile=60)
vectors_train_stem_X2 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 44158)


In [24]:
select = SelectKBest(f_classif, k=6000)
vectors_train_stem_F = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_F = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_F.shape)

(1999, 6000)


### 13.2.4 FPR

In [25]:
select = SelectFpr()
vectors_train_Lemma_FPR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FPR = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FPR.shape

(1999, 2602)

In [26]:
select = SelectFpr()
vectors_train_stem_FPR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FPR = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FPR.shape

(1999, 2770)

### 13.2.5 FDR

In [27]:
select = SelectFdr()
vectors_train_Lemma_FDR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FDR = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FDR.shape

(1999, 1013)

In [28]:
select = SelectFdr()
vectors_train_stem_FDR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FDR = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FDR.shape

(1999, 1064)

### 13.2.6 FWE

In [29]:
select = SelectFwe()
vectors_train_Lemma_FWE = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FWE = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FWE.shape

(1999, 507)

In [30]:
select = SelectFwe()
vectors_train_stem_FWE = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FWE = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FWE.shape

(1999, 523)

### 13.3 Recursive feature elimination

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import KFold, cross_val_score

In [32]:
estimator = LinearSVC()
select = RFECV(estimator, step=700,scoring='accuracy')
vectors_train_Lemma_RFESVC = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFESVC = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFESVC.shape)

(1999, 18614)


In [33]:
estimator = LogisticRegression()
select = RFECV(estimator, step=700,scoring='accuracy')
vectors_train_Lemma_RFELR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFELR = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFELR.shape)

(1999, 19314)


In [34]:
estimator = MultinomialNB()
select = RFECV(estimator, step=700,scoring='accuracy')
vectors_train_Lemma_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFEMNB = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFEMNB.shape)

(1999, 70414)


## Experiment on sklearn models (with NGRAM)

### Find the best set of features
As stem has basically the same effect as Lemma, we only keep Lemma for further experiments


The best are 

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_Lemma_RFESVC/RFELR/RFEMNB

In [35]:
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=5)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=5)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=5)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_FPR, y_train_num, cv=5)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_FDR, y_train_num, cv=5)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_FWE, y_train_num, cv=5)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_RFESVC, y_train_num, cv=5)
print(scores.mean())

0.9399736842105263
0.9319636591478696
0.9464736842105262
0.9359674185463659
0.9199523809523811
0.8864335839598997
0.9384661654135338


In [36]:
model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_FPR, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_FDR, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_FWE, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_RFELR, y_train_num, cv=10)
print(scores.mean())


0.9364798994974877
0.9289748743718593
0.9374773869346734
0.9329723618090451
0.9079522613065327
0.8809472361809044
0.9319773869346735


In [37]:
model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_FPR, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_FDR, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_FWE, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_RFEMNB, y_train_num, cv=10)
print(scores.mean())


0.9179673366834171
0.9164673366834171
0.9329723618090451
0.9159698492462311
0.8879497487437185
0.8354246231155777
0.9204698492462311


### Grid search
The best features:

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_Lemma_RFESVC/RFELR/RFEMNB

#### vectors_train_Lemma_X2

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9459798994974875
C: 1


In [40]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9554849246231155
alpha: 1e-10


In [41]:
model = LogisticRegression()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9399773869346735
C: 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### vectors_train_Lemma_F

In [None]:
from sklearn.model_selection import GridSearchCV

In [42]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9469748743718593
C: 1


In [43]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9639874371859296
alpha: 1e-05


In [44]:
model = LogisticRegression()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9399773869346735
C: 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### vectors_train_Lemma_RFE

In [None]:
from sklearn.model_selection import GridSearchCV

In [45]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_RFESVC, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9439723618090451
C: 1


In [46]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_RFEMNB, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9304723618090451
alpha: 0.1


In [47]:
model = LogisticRegression()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_RFELR, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9414773869346735
C: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Best model: LinearSVM on vectors_train_Lemma_X2

In [None]:
from sklearn.model_selection import cross_val_score

In [55]:
model = LinearSVC(C=1)
cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)

array([0.92      , 0.97      , 0.95      , 0.94      , 0.92      ,
       0.935     , 0.965     , 0.955     , 0.945     , 0.95979899])

In [58]:
model = LinearSVC(C=1)
model.fit(vectors_train_Lemma_X2, y_train_num)
y_pred = model.predict(vectors_test_Lemma_X2)
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [59]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [60]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,anime
3,3,science
4,4,science


### Second Best model: MultiNB on vectors_train_Lemma_F

In [48]:
from sklearn.model_selection import cross_val_score

In [49]:
model = MultinomialNB(alpha=1e-5)
cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)

array([0.96      , 0.985     , 0.95      , 0.975     , 0.94      ,
       0.965     , 0.98      , 0.955     , 0.955     , 0.97487437])

In [50]:
model = MultinomialNB(alpha=1e-5)
model.fit(vectors_train_Lemma_F, y_train_num)
y_pred = model.predict(vectors_test_Lemma_F)
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [51]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [52]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,laptop
3,3,science
4,4,science
