<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


## Load the data and get basic features

In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('./train.csv',engine='python')
test = pd.read_csv('./test.csv',engine='python')

In [6]:
X_train = train.body  # train texts
y_train = train.subreddit # train subreddits
X_test = test.body  # test texts

In [7]:
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [8]:
# transform target labels to values
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values) # convert category from string to numerical (!!!!! update the variables in kcross fold)

# vectorize word count
vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(X_train)
vectors_test = vectorizer.transform(X_test)

normalizer_train = Normalizer()
vectors_train= normalizer_train.transform(vectors_train)
vectors_test= normalizer_train.transform(vectors_test)

# print(vectorizer.get_feature_names())
print(vectors_train.shape)
print(vectors_test.shape)

(1999, 15365)
(1378, 15365)


In [9]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [10]:
# put it all together: remove stop words and punctuation, tfidf, lemmatization, normalization
stop_words = text.ENGLISH_STOP_WORDS

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

class New_LemmaTokenizer:
     def __init__(self):
       self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
       return [self.wnl.lemmatize(t,pos =get_wordnet_pos(t)) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer = New_LemmaTokenizer())
vectors_train_stop_tfidf_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)

# print(vectorizer.get_feature_names())
print(vectors_train_stop_tfidf_Lemma.shape)
print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 9779)
(1378, 9779)


In [17]:
# remove stopwords and punctuation, tfidf, stemming, normalization
stop_words = text.ENGLISH_STOP_WORDS

class StemTokenizer:
     def __init__(self):
       self.wnl =PorterStemmer()
     def __call__(self, doc):
       return [self.wnl.stem(t) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer=StemTokenizer())
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 8522)
(1378, 8522)


## Feature selection

In [68]:
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, f_classif, mutual_info_classif, SelectFpr, SelectFwe, SelectFdr, RFE, RFECV, SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

### 13.2.1 chi2

In [None]:
select = SelectKBest(chi2, k=6000)
vectors_train_Lemma_X2 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 5000)


In [None]:
select = SelectKBest(chi2, k=5000)
vectors_train_stem_X2 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 4000)


### 13.2.2 mutual info

In [27]:
select = SelectKBest(mutual_info_classif, k=6000)
vectors_train_Lemma_mutual = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_mutual = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_mutual.shape)

(1999, 6000)


In [28]:
select = SelectKBest(mutual_info_classif, k=5000)
vectors_train_stem_mutual = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_mutual = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_mutual.shape)

(1999, 5000)


### 13.2.3 F score

In [None]:
select = SelectKBest(f_classif, k=6000)
vectors_train_Lemma_F = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_F = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_F.shape)

(1999, 6000)


In [None]:
select = SelectKBest(f_classif, k=5000)
vectors_train_stem_F = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_stem_F = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_stem_F.shape)

(1999, 5000)


### 13.2.4 FPR

In [76]:
select = SelectFpr()
vectors_train_Lemma_FPR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FPR = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FPR.shape

(1999, 1598)

In [78]:
select = SelectFpr()
vectors_train_stem_FPR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_FPR = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_stem_FPR.shape

(1999, 1585)

### 13.2.5 FDR

In [84]:
select = SelectFdr()
vectors_train_Lemma_FDR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FDR = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FDR.shape

(1999, 841)

In [85]:
select = SelectFdr()
vectors_train_Lemma_FDR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_Lemma_FDR = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_Lemma_FDR.shape

(1999, 891)

### 13.2.6 FWE

In [86]:
select = SelectFwe()
vectors_train_Lemma_FWE = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_FWE = select.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_Lemma_FWE.shape

(1999, 359)

In [87]:
select = SelectFwe()
vectors_train_Lemma_FWE = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_Lemma_FWE = select.transform(vectors_test_stop_tfidf_stem)
vectors_train_Lemma_FWE.shape

(1999, 362)

### 13.3 Recursive feature elimination

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import KFold, cross_val_score

In [39]:
estimator = LinearSVC()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFESVC = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFESVC = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFECV.shape)

(1999, 4291)


In [41]:
estimator = LinearSVC()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFESVC = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFESVC = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFESVC.shape)

(1999, 6222)


In [43]:
estimator = LogisticRegression()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFELR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFELR = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFELR.shape)

(1999, 4095)


In [None]:
estimator = LogisticRegression()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFELR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFELR = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFELR.shape)

In [47]:
estimator = MultinomialNB()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFEMNB = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFEMNB.shape)

(1999, 9681)


In [51]:
estimator = MultinomialNB()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFEMNB = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFEMNB.shape)

(1999, 8437)


In [49]:
estimator = DecisionTreeClassifier()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFEDT = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFEDT = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFEDT.shape)

(1999, 6349)


In [48]:
estimator = DecisionTreeClassifier()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFEDT = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFEDT = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFEDT.shape)

(1999, 8097)


In [24]:
'''estimator = LinearSVC()
select = RFE(estimator, n_features_to_select=5000, step=0.1)
vectors_train_Lemma_RFE = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFE = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFE.shape)

(1999, 5000)


In [26]:
'''estimator = LinearSVC()
select = RFE(estimator, n_features_to_select=5000, step=0.1)
vectors_train_stem_RFESVC = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_RFESVC = select.transform(vectors_test_stop_tfidf_stem)

### 13.4.1 selectfrommodel L1 norm

In [54]:
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator,max_features=5000)
vectors_train_Lemma_SFML1 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_SFML1 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_SFML1.shape)

(1999, 1285)




In [23]:
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator,max_features=5000)
vectors_train_stem_SFML1 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_SFML1 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_SFML1.shape)

(1999, 1257)




### 13.4.2 selectfrommodel tree

In [57]:
clf = ExtraTreesClassifier()
clf = clf.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
model = SelectFromModel(clf, prefit=True)
vectors_train_stem_SFMtree = model.transform(vectors_train_stop_tfidf_Lemma)
vectors_train_stem_SFMtree.shape

(1999, 1419)

In [59]:
clf = ExtraTreesClassifier()
clf = clf.fit(vectors_train_stop_tfidf_stem, y_train_num)
model = SelectFromModel(clf, prefit=True)
vectors_train_stem_SFMtree = model.transform(vectors_train_stop_tfidf_stem)
vectors_train_stem_SFMtree.shape

(1999, 1283)

## Experiment on sklearn models

### Find the best set of features
We have 8 sets in total

The best are 

**1** 2 3 **5** 6 7 for SVM

1 **5 6** for logistic regression

In [None]:
# 0.9414
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

# 0.9389
model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_X2, y_train_num, cv=10)
print(scores.mean())

#0.9329
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_mutual, y_train_num, cv=10)
print(scores.mean())

# 0.9414
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

# 0.9409
model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_F, y_train_num, cv=10)
print(scores.mean())

# 0.9364
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_RFE, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_stem_RFE, y_train_num, cv=10)
print(scores.mean())

0.9414773869346733
0.9389773869346735
0.932964824120603
0.926964824120603
0.9414773869346735
0.9409773869346735
0.9364698492462311
0.9314648241206029
0.3436809045226131
0.3436809045226131


In [None]:
# 0.9349
model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_X2, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_mutual, y_train_num, cv=10)
print(scores.mean())

# 0.9364
model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

# 0.9369
model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_F, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_Lemma_RFE, y_train_num, cv=10)
print(scores.mean())

model = LogisticRegression()
scores = cross_val_score(model, vectors_train_stem_RFE, y_train_num, cv=10)
print(scores.mean())

0.934969849246231
0.9309723618090452
0.9304698492462309
0.9284723618090451
0.9364723618090451
0.9369723618090451
0.929969849246231
0.9284698492462311


In [None]:
# 0.9324
model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_X2, y_train_num, cv=10)
print(scores.mean())

# 0.9319
model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_X2, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_mutual, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_mutual, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_F, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_Lemma_RFE, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB()
scores = cross_val_score(model, vectors_train_stem_RFE, y_train_num, cv=10)
print(scores.mean())

0.9324698492462311
0.931964824120603
0.920967336683417
0.9244597989949748
0.924964824120603
0.9214648241206029
0.922967336683417
0.9209597989949749


### Grid search
The best features:

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_stem_F

#### vectors_train_Lemma_X2
Best models: LinearSVM, MultiNB, Logistic

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9414773869346733
C: 1


In [None]:
model = SVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000),
    'gamma': (1e-3, 1e-4)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9239723618090452
C: 1000
gamma: 0.001


In [None]:
model = LogisticRegression()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000),
    'max_iter': (1000, 5000, 10000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9379773869346735
C: 100
max_iter: 1000


In [None]:
model = KNeighborsClassifier()
parameters = {
    'n_neighbors': (3, 5, 10, 20, 40), 
    'p': (1, 2, 3),
    'leaf_size': (10, 20, 30, 50)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.6668366834170854
leaf_size: 10
n_neighbors: 40
p: 1


In [None]:
model = DecisionTreeClassifier()
parameters = {
    'max_depth': (10, 100, 1000, 10000), 
    'min_samples_leaf': (1, 5, 10)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.7748944723618091
max_depth: 1000
min_samples_leaf: 1


In [None]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9409748743718594
alpha: 0.1


In [None]:
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9354773869346735
alpha: 0.1


#### vectors_train_Lemma_F
Best model: MultiNB, LinearSVM, Logistic regression

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9414773869346735
C: 1


In [None]:
model = SVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000),
    'gamma': (1e-3, 1e-4)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.925469849246231
C: 1000
gamma: 0.001


In [None]:
model = LogisticRegression()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000),
    'max_iter': (1000, 5000, 10000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9394773869346735
C: 10
max_iter: 1000


In [None]:
model = KNeighborsClassifier()
parameters = {
    'n_neighbors': (3, 5, 10, 20, 40), 
    'p': (1, 2, 3),
    'leaf_size': (10, 20, 30, 50)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.611819095477387
leaf_size: 10
n_neighbors: 40
p: 1


In [None]:
model = DecisionTreeClassifier()
parameters = {
    'max_depth': (10, 100, 1000, 10000), 
    'min_samples_leaf': (1, 5, 10)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.7729045226130653
max_depth: 100
min_samples_leaf: 5


In [None]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9439723618090451
alpha: 1e-05


In [None]:
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.934472361809045
alpha: 1e-05


#### vectors_train_stem_F
Best model: MultiNB, LinearSVM, Logistic regression

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LinearSVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9409773869346735
C: 1


In [None]:
model = SVC()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000),
    'gamma': (1e-3, 1e-4)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9254673366834171
C: 1000
gamma: 0.001


In [None]:
model = LogisticRegression()
parameters = {
    'C': (0.01, 0.1, 1, 10, 100, 1000),
    'max_iter': (1000, 5000, 10000)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9399748743718593
C: 10
max_iter: 1000


In [None]:
model = KNeighborsClassifier()
parameters = {
    'n_neighbors': (3, 5, 10, 20, 40), 
    'p': (1, 2, 3),
    'leaf_size': (10, 20, 30, 50)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.48522613065326636
leaf_size: 10
n_neighbors: 40
p: 2


In [None]:
model = DecisionTreeClassifier()
parameters = {
    'max_depth': (10, 100, 1000, 10000), 
    'min_samples_leaf': (1, 5, 10)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.7749045226130653
max_depth: 1000
min_samples_leaf: 5


In [None]:
model = MultinomialNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2), 
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9429698492462311
alpha: 1e-05


In [None]:
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stem_F, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.935472361809045
alpha: 1e-05


### Best model: Multinomial NB on vectors_train_Lemma_F

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
model = MultinomialNB(alpha=1e-05)
model.fit(vectors_train_Lemma_F, y_train_num)
cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)

array([0.95      , 0.955     , 0.95      , 0.95      , 0.91      ,
       0.945     , 0.96      , 0.935     , 0.94      , 0.94472362])

In [None]:
model = MultinomialNB(alpha=1e-05)
model.fit(vectors_train_Lemma_F, y_train_num)
y_pred = model.predict(vectors_test_Lemma_F)
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [None]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,science
3,3,science
4,4,science


### Second Best model: LinearSVC on vectors_train_Lemma_F 

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
model = LinearSVC(C=1)
# model.fit(vectors_train_Lemma_F, y_train_num)
cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)

array([0.92      , 0.96      , 0.945     , 0.93      , 0.93      ,
       0.95      , 0.965     , 0.945     , 0.915     , 0.95477387])

In [None]:
model = LinearSVC(C=1)
model.fit(vectors_train_Lemma_F, y_train_num)
y_pred = model.predict(vectors_test_Lemma_F)
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [None]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,science
3,3,science
4,4,science
