<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/submit/First_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# First experiments
This notebook includes basic text feature design steps as given in the tutorial, and experiments with basic classifiers from sklearn. 

In [None]:
# Colab only: mount Google Drive so files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the Drive root (IPython line magic).
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


#### Load the data

In [None]:
from functools import lru_cache

import numpy as np
import pandas as pd

In [None]:
# Read the train and test CSVs from the current working directory using
# pandas' pure-Python parser engine.
train, test = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in ('./train.csv', './test.csv')
)

In [None]:
# Raw inputs: the comment body is the feature text, the subreddit name is the label.
# Only the body column is taken from the test frame here.
X_train, y_train = train['body'], train['subreddit']
X_test = test['body']

## Feature extraction

### sk-learn processing

In [None]:
from sklearn.preprocessing import Normalizer, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [None]:
# Encode the subreddit names as integer class ids; `le` is kept so predictions
# can be mapped back to names with `le.inverse_transform`.
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values)

# Bag-of-words counts; the vocabulary is learned from the training texts only.
vectorizer = CountVectorizer()
# Densify with toarray() (plain ndarray). todense() returns np.matrix, which
# recent scikit-learn releases refuse as estimator input.
vectors_train = vectorizer.fit_transform(X_train).toarray()
vectors_test = vectorizer.transform(X_test).toarray()

# One-hot encode every count column: each distinct count value of a word
# becomes its own indicator feature. Values unseen during fit are ignored.
onehot = OneHotEncoder(handle_unknown='ignore')
vectors_train = onehot.fit_transform(vectors_train)
vectors_test = onehot.transform(vectors_test)

# Row normalizer shared by the feature cells that follow. Normalizer is
# stateless, so calling transform() without fit() is valid.
normalizer_train = Normalizer()

print(vectors_train.shape)
print(vectors_test.shape)

(1999, 35729)
(1378, 35729)


#### Binary



In [None]:
# Presence/absence bag-of-words: every non-zero count is stored as 1.
vectorizer = CountVectorizer(binary=True)
vectors_train_binary, vectors_test_binary = (
    vectorizer.fit_transform(X_train),  # evaluated first: learns the vocabulary
    vectorizer.transform(X_test),
)


In [None]:
# TF-IDF weighted bag-of-words, fitted on the training texts only, then passed
# through the shared row normalizer.
tf_idf_vectorizer = TfidfVectorizer()
vectors_train_idf = normalizer_train.transform(tf_idf_vectorizer.fit_transform(X_train))
vectors_test_idf = normalizer_train.transform(tf_idf_vectorizer.transform(X_test))
print(vectors_train_idf.shape, vectors_test_idf.shape, sep='\n')

(1999, 15365)
(1378, 15365)


### nltk processing

In [None]:
import nltk
# Download the NLTK resources the tokenizers below rely on: the Punkt models
# (word_tokenize), the WordNet corpus (WordNetLemmatizer) and the perceptron
# POS tagger (nltk.pos_tag).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


#### Stemming
features: `vectors_train_stem`, `vectors_test_stem`

In [None]:
# Stemmed bag-of-words counts.
class StemTokenizer:
    """Callable tokenizer for CountVectorizer: Porter stems of alphabetic tokens."""

    def __init__(self):
        # Despite its name, `wnl` holds a PorterStemmer.
        self.wnl = PorterStemmer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens and stem the rest."""
        stems = []
        for token in word_tokenize(doc):
            if token.isalpha():
                stems.append(self.wnl.stem(token))
        return stems


vectorizer = CountVectorizer(tokenizer=StemTokenizer())
# Counts are row-normalized with the shared Normalizer right after vectorizing.
vectors_train_stem = normalizer_train.transform(vectorizer.fit_transform(X_train))
vectors_test_stem = normalizer_train.transform(vectorizer.transform(X_test))
print(vectors_train_stem.shape, vectors_test_stem.shape, sep='\n')

(1999, 8727)
(1378, 8727)


#### Lemmatization
features: `vectors_train_Lemma`, `vectors_test_Lemma`

In [None]:
# Lemmatized bag-of-words counts.
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing `word`.

    The word is tagged on its own (as a one-element list) and the first letter
    of the resulting tag selects adjective, noun, verb or adverb; any other tag
    falls back to noun. Because the result depends only on the word itself, it
    is memoized so the tagger runs once per distinct token instead of once per
    occurrence.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


class New_LemmaTokenizer:
    """Callable tokenizer for CountVectorizer: WordNet lemmas of alphabetic tokens."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens, lemmatize the rest."""
        return [
            self.wnl.lemmatize(t, pos=get_wordnet_pos(t))
            for t in word_tokenize(doc)
            if t.isalpha()
        ]


vectorizer = CountVectorizer(tokenizer=New_LemmaTokenizer())
vectors_train_Lemma = vectorizer.fit_transform(X_train)
vectors_test_Lemma = vectorizer.transform(X_test)
# Row-normalize with the shared Normalizer.
vectors_train_Lemma = normalizer_train.transform(vectors_train_Lemma)
vectors_test_Lemma = normalizer_train.transform(vectors_test_Lemma)
print(vectors_train_Lemma.shape)
print(vectors_test_Lemma.shape)

(1999, 10045)
(1378, 10045)


#### 6 feature sets

1. features: vectors_train_stop, vectors_test_stop

In [None]:
# Feature set 1: counts with stop words and non-alphabetic tokens removed,
# then row-normalized.
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)


class PuncTokenizer:
    """Callable tokenizer for CountVectorizer keeping only alphabetic NLTK tokens."""

    def __call__(self, doc):
        """Tokenize `doc` with NLTK and drop every token that is not purely alphabetic."""
        return [t for t in word_tokenize(doc) if t.isalpha()]


vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=PuncTokenizer())
vectors_train_stop = vectorizer.fit_transform(X_train)
vectors_test_stop = vectorizer.transform(X_test)

# Fresh stateless row normalizer; the remaining feature-set cells reuse it.
normalizer_train = Normalizer()
vectors_train_stop = normalizer_train.transform(vectors_train_stop)
vectors_test_stop = normalizer_train.transform(vectors_test_stop)
print(vectors_train_stop.shape)
print(vectors_test_stop.shape)

(1999, 12402)
(1378, 12402)


2. features: vectors_train_stop_tfidf, vectors_test_stop_tfidf

In [None]:
# Feature set 2: counts with stop words and non-alphabetic tokens removed,
# TF-IDF weighted, then row-normalized.
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)

tf_idf_transformer = TfidfTransformer()


class PuncTokenizer:
    """Callable tokenizer for CountVectorizer keeping only alphabetic NLTK tokens."""

    def __call__(self, doc):
        """Tokenize `doc` with NLTK and drop every token that is not purely alphabetic."""
        return [t for t in word_tokenize(doc) if t.isalpha()]


vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=PuncTokenizer())
# IDF weights are learned from the training counts and reused for the test counts.
vectors_train_stop_tfidf = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf)
vectors_test_stop_tfidf = vectorizer.transform(X_test)
vectors_test_stop_tfidf = tf_idf_transformer.transform(vectors_test_stop_tfidf)

vectors_train_stop_tfidf = normalizer_train.transform(vectors_train_stop_tfidf)
vectors_test_stop_tfidf = normalizer_train.transform(vectors_test_stop_tfidf)
print(vectors_train_stop_tfidf.shape)
print(vectors_test_stop_tfidf.shape)

(1999, 12402)
(1378, 12402)


3. features: vectors_train_stop_Lemma, vectors_test_stop_Lemma

In [None]:
# Feature set 3: stop-word removal + lemmatized alphabetic tokens, raw counts,
# row-normalized.
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing `word`.

    The word is tagged on its own (as a one-element list) and the first letter
    of the resulting tag selects adjective, noun, verb or adverb; any other tag
    falls back to noun. Because the result depends only on the word itself, it
    is memoized so the tagger runs once per distinct token instead of once per
    occurrence.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


class New_LemmaTokenizer:
    """Callable tokenizer for CountVectorizer: WordNet lemmas of alphabetic tokens."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens, lemmatize the rest."""
        return [
            self.wnl.lemmatize(t, pos=get_wordnet_pos(t))
            for t in word_tokenize(doc)
            if t.isalpha()
        ]


# NOTE(review): stop words are matched against the tokenizer's output (lemmas),
# which is what scikit-learn's stop-word consistency warning refers to -
# confirm this is acceptable.
vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=New_LemmaTokenizer())
vectors_train_stop_Lemma = vectorizer.fit_transform(X_train)
vectors_test_stop_Lemma = vectorizer.transform(X_test)
vectors_train_stop_Lemma = normalizer_train.transform(vectors_train_stop_Lemma)
vectors_test_stop_Lemma = normalizer_train.transform(vectors_test_stop_Lemma)

print(vectors_train_stop_Lemma.shape)
print(vectors_test_stop_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 9779)
(1378, 9779)


4. features: `vectors_train_stop_tfidf_Lemma`, `vectors_test_stop_tfidf_Lemma`

In [None]:
# Feature set 4: stop-word removal + lemmatized alphabetic tokens, TF-IDF
# weighted, row-normalized.
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing `word`.

    The word is tagged on its own (as a one-element list) and the first letter
    of the resulting tag selects adjective, noun, verb or adverb; any other tag
    falls back to noun. Because the result depends only on the word itself, it
    is memoized so the tagger runs once per distinct token instead of once per
    occurrence.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


class New_LemmaTokenizer:
    """Callable tokenizer for CountVectorizer: WordNet lemmas of alphabetic tokens."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens, lemmatize the rest."""
        return [
            self.wnl.lemmatize(t, pos=get_wordnet_pos(t))
            for t in word_tokenize(doc)
            if t.isalpha()
        ]


tf_idf_transformer = TfidfTransformer()
# NOTE(review): stop words are matched against the tokenizer's output (lemmas),
# which is what scikit-learn's stop-word consistency warning refers to -
# confirm this is acceptable.
vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=New_LemmaTokenizer())
# IDF weights are learned from the training counts and reused for the test counts.
vectors_train_stop_tfidf_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_tfidf_Lemma)
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)

print(vectors_train_stop_tfidf_Lemma.shape)
print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 9779)
(1378, 9779)


5. features: vectors_train_stop_stem, vectors_test_stop_stem

In [None]:
# Feature set 5: stop-word removal + Porter-stemmed alphabetic tokens, raw
# counts, row-normalized.
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)


class StemTokenizer:
    """Callable tokenizer for CountVectorizer: Porter stems of alphabetic tokens."""

    def __init__(self):
        # Despite its name, `wnl` holds a PorterStemmer.
        self.wnl = PorterStemmer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens and stem the rest."""
        return [self.wnl.stem(t) for t in word_tokenize(doc) if t.isalpha()]


# NOTE(review): stop words are matched against the tokenizer's output (stems),
# which is what scikit-learn's stop-word consistency warning refers to -
# confirm this is acceptable.
vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=StemTokenizer())
vectors_train_stop_stem = vectorizer.fit_transform(X_train)
vectors_test_stop_stem = vectorizer.transform(X_test)
vectors_train_stop_stem = normalizer_train.transform(vectors_train_stop_stem)
vectors_test_stop_stem = normalizer_train.transform(vectors_test_stop_stem)
print(vectors_train_stop_stem.shape)
print(vectors_test_stop_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 8522)
(1378, 8522)


6. features: vectors_train_stop_tfidf_stem, vectors_test_stop_tfidf_stem

In [None]:
# Feature set 6: stop-word removal + Porter-stemmed alphabetic tokens, TF-IDF
# weighted, row-normalized.
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)


class StemTokenizer:
    """Callable tokenizer for CountVectorizer: Porter stems of alphabetic tokens."""

    def __init__(self):
        # Despite its name, `wnl` holds a PorterStemmer.
        self.wnl = PorterStemmer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens and stem the rest."""
        return [self.wnl.stem(t) for t in word_tokenize(doc) if t.isalpha()]


tf_idf_transformer = TfidfTransformer()
# NOTE(review): stop words are matched against the tokenizer's output (stems),
# which is what scikit-learn's stop-word consistency warning refers to -
# confirm this is acceptable.
vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=StemTokenizer())
# IDF weights are learned from the training counts and reused for the test counts.
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 8522)
(1378, 8522)


#### binary (for use with Bernoulli)

In [None]:
# Binary variant: stop-word removal + lemmatized alphabetic tokens, stored as
# presence/absence indicators instead of counts (no normalization).
# scikit-learn validates `stop_words` as 'english', a list or None; recent
# releases reject the frozenset ENGLISH_STOP_WORDS, so it is passed as a list.
stop_words = list(text.ENGLISH_STOP_WORDS)


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing `word`.

    The word is tagged on its own (as a one-element list) and the first letter
    of the resulting tag selects adjective, noun, verb or adverb; any other tag
    falls back to noun. Because the result depends only on the word itself, it
    is memoized so the tagger runs once per distinct token instead of once per
    occurrence.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


class New_LemmaTokenizer:
    """Callable tokenizer for CountVectorizer: WordNet lemmas of alphabetic tokens."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize `doc` with NLTK, drop non-alphabetic tokens, lemmatize the rest."""
        return [
            self.wnl.lemmatize(t, pos=get_wordnet_pos(t))
            for t in word_tokenize(doc)
            if t.isalpha()
        ]


# NOTE(review): stop words are matched against the tokenizer's output (lemmas),
# which is what scikit-learn's stop-word consistency warning refers to -
# confirm this is acceptable.
vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=New_LemmaTokenizer(), binary=True)
vectors_train_stop_Lemma_binary = vectorizer.fit_transform(X_train)
vectors_test_stop_Lemma_binary = vectorizer.transform(X_test)

print(vectors_test_stop_Lemma_binary.shape)

  'stop_words.' % sorted(inconsistent))


(1378, 9779)


## Experiments with models in sk-learn

In [None]:
# from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import KFold, cross_val_score

#### Find the best set of features

We have 6 different sets of features
* vectors_train_stop, vectors_test_stop
* vectors_train_stop_tfidf, vectors_test_stop_tfidf
* vectors_train_stop_Lemma, vectors_test_stop_Lemma
* vectors_train_stop_tfidf_Lemma, vectors_test_stop_tfidf_Lemma
* vectors_train_stop_stem, vectors_test_stop_stem
* vectors_train_stop_tfidf_stem, vectors_test_stop_tfidf_stem




In [None]:
# Mean 10-fold cross-validation accuracy of a default LinearSVC on each of the
# six feature sets, printed one per line in the order of the tuple below.
feature_sets = (
    vectors_train_stop,
    vectors_train_stop_tfidf,
    vectors_train_stop_Lemma,
    vectors_train_stop_tfidf_Lemma,
    vectors_train_stop_stem,
    vectors_train_stop_tfidf_stem,
)
for features in feature_sets:
    model = LinearSVC()  # fresh estimator for every feature set
    scores = cross_val_score(model, features, y_train_num, cv=10)
    print(scores.mean())

0.9169673366834171
0.925469849246231
0.9239698492462312
0.933964824120603
0.921467336683417
0.9289648241206031


In [None]:
# Mean 10-fold cross-validation accuracy of a default LogisticRegression on each
# of the six feature sets, printed one per line in the order of the tuple below.
feature_sets = (
    vectors_train_stop,
    vectors_train_stop_tfidf,
    vectors_train_stop_Lemma,
    vectors_train_stop_tfidf_Lemma,
    vectors_train_stop_stem,
    vectors_train_stop_tfidf_stem,
)
for features in feature_sets:
    model = LogisticRegression()  # fresh estimator for every feature set
    scores = cross_val_score(model, features, y_train_num, cv=10)
    print(scores.mean())

0.9014497487437186
0.9254648241206029
0.9069597989949749
0.928969849246231
0.9004572864321607
0.927969849246231


#### Find the best off-the-shelf model

In [None]:
# Feature matrix and integer labels used by the model-comparison cells:
# the stop-word + TF-IDF + lemmatization set chosen in the previous step.
X, y = vectors_train_stop_tfidf_Lemma, y_train_num

In [None]:
# 10-fold cross-validation of a default LinearSVC, recording train and held-out accuracy.
model = LinearSVC()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------Linear SVC---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------Linear SVC---------------
train accu:  1.0
test accu:  0.9319723618090452


In [None]:
# 10-fold cross-validation of a default SVC, recording train and held-out accuracy.
model = SVC()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------RBF SVC---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------RBF SVC---------------
train accu:  1.0
test accu:  0.9234698492462311


In [None]:
# 10-fold cross-validation of a default LogisticRegression, recording train and
# held-out accuracy.
model = LogisticRegression()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------Logistic Regression---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------Linear SVC---------------
train accu:  0.9909955530850473
test accu:  0.929467336683417


In [None]:
# 10-fold cross-validation of a default KNeighborsClassifier, recording train and
# held-out accuracy.
model = KNeighborsClassifier()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------K-nearest Neighbor---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------Linear SVC---------------
train accu:  0.9056750972762645
test accu:  0.8489095477386934


In [None]:
# 10-fold cross-validation of a default DecisionTreeClassifier, recording train
# and held-out accuracy.
model = DecisionTreeClassifier()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------Decision Tree---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------Linear SVC---------------
train accu:  1.0
test accu:  0.7989145728643218


In [None]:
# 10-fold cross-validation of a default MultinomialNB, recording train and
# held-out accuracy.
model = MultinomialNB()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------Multinomial NB---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------Multinomial NB---------------
train accu:  0.9808238218763512
test accu:  0.9174748743718592


In [None]:
# 10-fold cross-validation of a default BernoulliNB, recording train and
# held-out accuracy.
model = BernoulliNB()
# Seeded shuffle so the folds (and therefore the reported means) are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
train_accus = []
test_accus = []
for train_index, test_index in kf.split(X):
    # Fold-local names: rebinding X_train / X_test / y_train here would overwrite
    # the raw texts and labels that the feature-extraction cells read.
    X_fold_train, X_fold_val = X[train_index], X[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    train_accus.append(model.score(X_fold_train, y_fold_train))
    test_accus.append(model.score(X_fold_val, y_fold_val))
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)
print("-------------Bernoulli NB---------------")
print("train accu: ", train_accus.mean())
print("test accu: ", test_accus.mean())

-------------Bernoulli NB---------------
train accu:  0.9438612809585573
test accu:  0.8469170854271357


#### Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the regularization strength C of LinearSVC with 10-fold CV,
# using all available cores.
parameters = dict(C=(0.01, 0.1, 1, 10, 100, 1000))
model = LinearSVC()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.933964824120603
C: 1


In [None]:
# Grid-search C and gamma of SVC with 10-fold CV, using all available cores.
parameters = dict(
    C=(0.01, 0.1, 1, 10, 100, 1000),
    gamma=(1e-3, 1e-4),
)
model = SVC()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.9224698492462311
C: 1000
gamma: 0.001


In [None]:
# Grid-search C and the iteration cap of LogisticRegression with 10-fold CV,
# using all available cores.
parameters = dict(
    C=(0.01, 0.1, 1, 10, 100, 1000),
    max_iter=(100, 1000, 5000, 10000),
)
model = LogisticRegression()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.9344673366834171
C: 100
max_iter: 1000


In [None]:
# Grid-search neighbor count, Minkowski power p and leaf size of
# KNeighborsClassifier with 10-fold CV, using all available cores.
parameters = dict(
    n_neighbors=(3, 5, 10, 20, 40),
    p=(1, 2, 3),
    leaf_size=(10, 20, 30, 50),
)
model = KNeighborsClassifier()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.8654422110552764
leaf_size: 10
n_neighbors: 20
p: 2


In [None]:
# Grid-search tree depth and minimum leaf size of DecisionTreeClassifier with
# 10-fold CV, using all available cores.
parameters = dict(
    max_depth=(10, 100, 1000, 10000),
    min_samples_leaf=(1, 5, 10),
)
model = DecisionTreeClassifier()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.7719045226130653
max_depth: 1000
min_samples_leaf: 5


In [None]:
# Grid-search the smoothing parameter alpha of MultinomialNB with 10-fold CV,
# using all available cores.
parameters = dict(alpha=(0, 0.1, 0.5, 1, 2))
model = MultinomialNB()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.9254648241206029
alpha: 0.1


In [None]:
# Grid-search the smoothing parameter alpha of BernoulliNB with 10-fold CV,
# using all available cores.
parameters = dict(alpha=(0, 0.1, 0.5, 1, 2))
model = BernoulliNB()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(
    vectors_train_stop_tfidf_Lemma, y_train_num
)
# Best mean CV score, then the winning value of every searched parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_model.best_params_[param_name]))

0.9034572864321608
alpha: 0.1


In [None]:
# Display the per-candidate cross-validation results of whichever grid search
# was fitted last (`gs_model` is rebound by every grid-search cell).
gs_model.cv_results_

## Make predictions on test data

In [None]:
# Final logistic-regression model, trained on the full training set with the
# stop-word + TF-IDF + lemmatization features.
model = LogisticRegression(C=100, max_iter=1000)
model.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
# Predict on the test matrix produced by the SAME pipeline as the training
# matrix. The count-only `vectors_test_stop_Lemma` has the same width, so it
# would run without error, but its values are not TF-IDF weighted.
y_pred = model.predict(vectors_test_stop_tfidf_Lemma)
# Map the integer class ids back to subreddit names.
y_pred = le.inverse_transform(y_pred)

In [None]:
# Final LinearSVC model, trained on the full training set with the
# stop-word + TF-IDF + lemmatization features.
model = LinearSVC(C=1)
model.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
# Predict on the test matrix produced by the SAME pipeline as the training
# matrix. The count-only `vectors_test_stop_Lemma` has the same width, so it
# would run without error, but its values are not TF-IDF weighted.
y_pred = model.predict(vectors_test_stop_tfidf_Lemma)
# Map the integer class ids back to subreddit names.
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
# Assemble the submission table (test ids next to the predicted subreddit
# names) and save it without the DataFrame index.
submission_columns = {'id': test.id, 'subreddit': y_pred}
result = pd.DataFrame(submission_columns)
result.to_csv("result.csv", index=False)

In [None]:
# Read the written submission back and display its first rows as a sanity check.
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,anime
3,3,science
4,4,science
