<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/runtime_accu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Work from the Drive root; the relative CSV paths used below resolve against it.
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


## Load the data and get basic features

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the train and test CSVs from the working directory with pandas' Python parser engine.
train = pd.read_csv('./train.csv',engine='python')
test = pd.read_csv('./test.csv',engine='python')

In [4]:
# Pull the text column and the label column out of the loaded frames.
X_train = train.body  # train texts
y_train = train.subreddit # train subreddits (classification target)
X_test = test.body  # test texts (no labels are read for the test frame)

In [5]:
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [6]:
# Encode the subreddit names as integer class ids; `le` is reused later to map predictions back to names.
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values) # string labels -> integers (TODO: use y_train_num in the k-fold code as well)

# Baseline features: raw word counts. The vocabulary is learned from the training texts only
# and then applied unchanged to the test texts.
vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(X_train)
vectors_test = vectorizer.transform(X_test)

# Rescale the count vectors with a Normalizer; transform is called without a prior fit.
# `normalizer_train` is reused by the lemma and stem pipelines further down.
normalizer_train = Normalizer()
vectors_train= normalizer_train.transform(vectors_train)
vectors_test= normalizer_train.transform(vectors_test)

# print(vectorizer.get_feature_names())
# Sanity check: train and test matrices must have the same number of columns.
print(vectors_train.shape)
print(vectors_test.shape)

(1999, 15365)
(1378, 15365)


In [7]:
import nltk
# Fetch the NLTK resources used below: 'punkt' (tokenizer data), 'wordnet' (lemmatizer corpus)
# and 'averaged_perceptron_tagger' (POS tagger model).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [8]:
# Put it all together: remove stop words and punctuation, TF-IDF, lemmatization, normalization.
# Stop-word collection: scikit-learn's built-in English set.
stop_words = text.ENGLISH_STOP_WORDS

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of that tag picks adjective (J), noun (N), verb (V) or
    adverb (R). Every other tag falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to noun.
    return wordnet.NOUN

class New_LemmaTokenizer:
    """Callable tokenizer: split a document into words, keep the purely
    alphabetic ones and lemmatize each using its own POS tag."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        lemmas = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                # Skips punctuation, numbers and any token with non-letters.
                continue
            lemmas.append(self.wnl.lemmatize(token, pos=get_wordnet_pos(token)))
        return lemmas

# Lemma pipeline: unigram+bigram counts -> TF-IDF weighting -> row normalization.
tf_idf_transformer = TfidfTransformer()
# Pass the stop words as a list: recent scikit-learn releases validate `stop_words`
# as 'english', a list or None and reject the frozenset held in ENGLISH_STOP_WORDS.
# The set of words removed is unchanged.
vectorizer = CountVectorizer(stop_words = sorted(stop_words), tokenizer = New_LemmaTokenizer(), ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
# Vocabulary and IDF weights are learned from the training texts only, then reused for test.
vectors_train_stop_tfidf_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_tfidf_Lemma)
# Same Normalizer instance as the baseline count features.
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)

#print(vectorizer.get_feature_names())
# Sanity check: train and test must share the same feature dimension.
print(vectors_train_stop_tfidf_Lemma.shape)
print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 70414)
(1378, 70414)


In [9]:
# Remove stop words and punctuation, TF-IDF, stemming, normalization.
# Stop-word collection: scikit-learn's built-in English set (same binding as the lemma cell).
stop_words = text.ENGLISH_STOP_WORDS

class StemTokenizer:
    """Callable tokenizer: split a document into words, keep the purely
    alphabetic ones and reduce each to its Porter stem."""

    def __init__(self):
        # Despite the attribute name, this holds a PorterStemmer.
        self.wnl = PorterStemmer()

    def __call__(self, doc):
        stems = []
        for token in word_tokenize(doc):
            if token.isalpha():
                stems.append(self.wnl.stem(token))
        return stems

# Stem pipeline: unigram+bigram counts -> TF-IDF weighting -> row normalization.
tf_idf_transformer = TfidfTransformer()
# Pass the stop words as a list: recent scikit-learn releases validate `stop_words`
# as 'english', a list or None and reject the frozenset held in ENGLISH_STOP_WORDS.
# The set of words removed is unchanged.
vectorizer = CountVectorizer(stop_words = sorted(stop_words), tokenizer=StemTokenizer(),ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
# Vocabulary and IDF weights are learned from the training texts only, then reused for test.
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)
# Same Normalizer instance as the baseline count features.
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
# Sanity check: train and test must share the same feature dimension.
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 73597)
(1378, 73597)


## Measure runtime and accuracy for different feature numbers and different feature selections

In [39]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, mutual_info_classif, f_classif, SelectFpr, SelectFwe, SelectFdr, RFE, RFECV, SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import time

In [66]:
def runtime_accu(selector, model, per):
  """Print CV accuracy and CV wall-clock time for ``model`` on a feature subset.

  Keeps the top ``per`` percent of the columns of the notebook-level matrix
  ``vectors_train_stop_tfidf_Lemma`` (scored against ``y_train_num``), prints
  the reduced shape, then prints the mean 10-fold cross-validation accuracy
  and the seconds spent inside cross-validation.

  Args:
    selector: name of the scoring function: 'chi2', 'mutual_info' or 'f'.
    model: estimator handed to ``cross_val_score``.
    per: percentile of features to keep, passed to ``SelectPercentile``.

  Raises:
    ValueError: if ``selector`` is not one of the supported names.
  """
  if selector=='chi2':
    score_func = chi2
  elif selector=='mutual_info':
    score_func = mutual_info_classif
  elif selector=='f':
    score_func = f_classif
  else:
    # An unknown name used to fall through and fail later on an unbound `select`.
    raise ValueError("unknown selector %r; expected 'chi2', 'mutual_info' or 'f'" % (selector,))
  select = SelectPercentile(score_func, percentile=per)

  # NOTE(review): the selector is fit on all training rows (with labels) before
  # cross-validation, so held-out folds influence which features are kept; the
  # accuracies printed here are likely optimistic.
  vectors_train_Lemma = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
  print(vectors_train_Lemma.shape)
  # The timer starts after selection, so only cross-validation is timed.
  t1 = time.time()
  accuracy = cross_val_score(model, vectors_train_Lemma, y_train_num, cv=10).mean()
  elapsed = time.time()-t1  # taken before printing so console I/O is not timed
  print("accuracy: ", accuracy)
  print("runtime: ", elapsed, '\n')

In [80]:
# Percentiles of features to keep in each run; each step is roughly 4x the previous one.
per_range = [0.015, 0.06, 0.24, 1, 4, 16, 64]

### Chi2

In [76]:
# Scoring function name passed to runtime_accu by the cells that follow.
selector = 'chi2'

In [81]:
# LinearSVC (C=1): accuracy and CV runtime at every percentile, using the current `selector`.
model = LinearSVC(C=1)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.689354271356784
runtime:  0.1898493766784668 

(1999, 42)
accuracy:  0.7594045226130653
runtime:  0.20742559432983398 

(1999, 169)
accuracy:  0.8579472361809044
runtime:  0.23154997825622559 

(1999, 705)
accuracy:  0.9119572864321608
runtime:  0.26328563690185547 

(1999, 2817)
accuracy:  0.9404773869346735
runtime:  0.342679500579834 

(1999, 11266)
accuracy:  0.9469723618090452
runtime:  0.45365047454833984 

(1999, 45064)
accuracy:  0.9419748743718592
runtime:  0.7767648696899414 



In [83]:
# SVC (C=1, other settings left at library defaults): same sweep with the current `selector`.
model = SVC(C=1)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.6848492462311558
runtime:  0.8431212902069092 

(1999, 42)
accuracy:  0.7578969849246231
runtime:  1.0111474990844727 

(1999, 169)
accuracy:  0.8509346733668341
runtime:  1.3295824527740479 

(1999, 705)
accuracy:  0.8959447236180905
runtime:  3.338622808456421 

(1999, 2817)
accuracy:  0.929467336683417
runtime:  10.090218782424927 

(1999, 11266)
accuracy:  0.8959472361809044
runtime:  15.05076265335083 

(1999, 45064)
accuracy:  0.9204748743718593
runtime:  21.84426212310791 



In [85]:
# MultinomialNB with a very small smoothing value (alpha=1e-10): same sweep with the current `selector`.
model = MultinomialNB(alpha=1e-10)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.5967914572864321
runtime:  0.03517866134643555 

(1999, 42)
accuracy:  0.7013643216080403
runtime:  0.031725168228149414 

(1999, 169)
accuracy:  0.8084170854271356
runtime:  0.03154349327087402 

(1999, 705)
accuracy:  0.8954597989949749
runtime:  0.04047989845275879 

(1999, 2817)
accuracy:  0.9479824120603016
runtime:  0.04791426658630371 

(1999, 11266)
accuracy:  0.9589849246231156
runtime:  0.07815885543823242 

(1999, 45064)
accuracy:  0.9279748743718592
runtime:  0.1644604206085205 



In [86]:
# LogisticRegression with library defaults: same sweep with the current `selector`.
model = LogisticRegression()
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.6848517587939698
runtime:  0.519320011138916 

(1999, 42)
accuracy:  0.7508944723618091
runtime:  0.7451884746551514 

(1999, 169)
accuracy:  0.8499422110552762
runtime:  0.9701001644134521 

(1999, 705)
accuracy:  0.9049547738693468
runtime:  1.2688312530517578 

(1999, 2817)
accuracy:  0.9304698492462311
runtime:  4.636819124221802 

(1999, 11266)
accuracy:  0.9369723618090452
runtime:  10.185412168502808 

(1999, 45064)
accuracy:  0.9334748743718595
runtime:  34.91974925994873 



### Mutual info

In [87]:
# Switch the scoring function used by the following runtime_accu calls to mutual information.
selector = 'mutual_info'

In [88]:
# LinearSVC (C=1): accuracy and CV runtime at every percentile, using the current `selector`.
model = LinearSVC(C=1)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.48426381909547744
runtime:  0.20358991622924805 

(1999, 43)
accuracy:  0.7133768844221106
runtime:  0.2039651870727539 

(1999, 169)
accuracy:  0.8244221105527638
runtime:  0.2308509349822998 

(1999, 705)
accuracy:  0.8824447236180906
runtime:  0.29204273223876953 

(1999, 2816)
accuracy:  0.9289648241206031
runtime:  0.38042211532592773 

(1999, 11266)
accuracy:  0.9354648241206028
runtime:  0.48803162574768066 

(1999, 45064)
accuracy:  0.937467336683417
runtime:  0.7465944290161133 



In [89]:
# SVC (C=1, other settings left at library defaults): same sweep with the current `selector`.
model = SVC(C=1)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.48827638190954775
runtime:  1.937324047088623 

(1999, 43)
accuracy:  0.6988668341708544
runtime:  2.406881093978882 

(1999, 169)
accuracy:  0.8084120603015075
runtime:  3.5704739093780518 

(1999, 705)
accuracy:  0.8674346733668342
runtime:  6.797235727310181 

(1999, 2816)
accuracy:  0.922964824120603
runtime:  14.542876720428467 

(1999, 11266)
accuracy:  0.9214597989949749
runtime:  18.81100559234619 

(1999, 45064)
accuracy:  0.9484798994974876
runtime:  22.218812942504883 



In [90]:
# MultinomialNB with a very small smoothing value (alpha=1e-10): same sweep with the current `selector`.
model = MultinomialNB(alpha=1e-10)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.4632462311557789
runtime:  0.022405385971069336 

(1999, 43)
accuracy:  0.669856783919598
runtime:  0.022440195083618164 

(1999, 169)
accuracy:  0.7443869346733669
runtime:  0.02251148223876953 

(1999, 705)
accuracy:  0.7949120603015075
runtime:  0.028333187103271484 

(1999, 2816)
accuracy:  0.8734371859296483
runtime:  0.035927534103393555 

(1999, 11266)
accuracy:  0.8999547738693467
runtime:  0.05977010726928711 

(1999, 45064)
accuracy:  0.8904497487437186
runtime:  0.13895964622497559 



In [91]:
# LogisticRegression with library defaults: same sweep with the current `selector`.
model = LogisticRegression()
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.4882688442211055
runtime:  0.645073413848877 

(1999, 43)
accuracy:  0.7068869346733667
runtime:  0.6883525848388672 

(1999, 169)
accuracy:  0.814427135678392
runtime:  1.0057330131530762 

(1999, 705)
accuracy:  0.8859497487437185
runtime:  1.4701075553894043 

(1999, 2816)
accuracy:  0.9234723618090452
runtime:  5.246405124664307 

(1999, 11266)
accuracy:  0.9314773869346734
runtime:  13.437782526016235 

(1999, 45064)
accuracy:  0.9284773869346734
runtime:  34.78925347328186 



### F score

In [92]:
# Switch the scoring function used by the following runtime_accu calls to the F score.
selector = 'f'

In [93]:
# LinearSVC (C=1): accuracy and CV runtime at every percentile, using the current `selector`.
model = LinearSVC(C=1)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.699356783919598
runtime:  0.16190314292907715 

(1999, 20)
accuracy:  0.7348844221105527
runtime:  0.16633152961730957 

(1999, 168)
accuracy:  0.834429648241206
runtime:  0.2083582878112793 

(1999, 705)
accuracy:  0.9049422110552765
runtime:  0.24715232849121094 

(1999, 2817)
accuracy:  0.9409698492462312
runtime:  0.35201287269592285 

(1999, 11266)
accuracy:  0.9454748743718593
runtime:  0.46314024925231934 

(1999, 45064)
accuracy:  0.9459798994974875
runtime:  0.7204024791717529 



In [94]:
# SVC (C=1, other settings left at library defaults): same sweep with the current `selector`.
model = SVC(C=1)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.6963492462311558
runtime:  0.9451847076416016 

(1999, 20)
accuracy:  0.7353793969849247
runtime:  0.9378843307495117 

(1999, 168)
accuracy:  0.8264221105527639
runtime:  1.3068928718566895 

(1999, 705)
accuracy:  0.8864296482412062
runtime:  3.703448534011841 

(1999, 2817)
accuracy:  0.9344748743718594
runtime:  11.83224630355835 

(1999, 11266)
accuracy:  0.9249623115577889
runtime:  16.691577911376953 

(1999, 45064)
accuracy:  0.7908869346733669
runtime:  20.65315270423889 



In [95]:
# MultinomialNB with a very small smoothing value (alpha=1e-10): same sweep with the current `selector`.
model = MultinomialNB(alpha=1e-10)
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.646321608040201
runtime:  0.02029895782470703 

(1999, 20)
accuracy:  0.683354271356784
runtime:  0.020882368087768555 

(1999, 168)
accuracy:  0.7789020100502513
runtime:  0.02650904655456543 

(1999, 705)
accuracy:  0.8764497487437186
runtime:  0.025598764419555664 

(1999, 2817)
accuracy:  0.9404798994974876
runtime:  0.03756141662597656 

(1999, 11266)
accuracy:  0.9579849246231156
runtime:  0.05775046348571777 

(1999, 45064)
accuracy:  0.9554849246231157
runtime:  0.12949609756469727 



In [96]:
# LogisticRegression with library defaults: same sweep with the current `selector`.
model = LogisticRegression()
for per in per_range:
  runtime_accu(selector, model, per)

(1999, 11)
accuracy:  0.6958492462311557
runtime:  0.4725379943847656 

(1999, 20)
accuracy:  0.7308793969849245
runtime:  0.5780160427093506 

(1999, 168)
accuracy:  0.8294246231155779
runtime:  0.8769228458404541 

(1999, 705)
accuracy:  0.8959472361809044
runtime:  1.2939579486846924 

(1999, 2817)
accuracy:  0.9319723618090452
runtime:  4.657679557800293 

(1999, 11266)
accuracy:  0.9374773869346734
runtime:  11.054420232772827 

(1999, 45064)
accuracy:  0.9359773869346736
runtime:  36.070772886276245 



## Ignore for now

### 13.4.1 selectfrommodel L1 norm

In [None]:
# Model-based selection on the lemma features: an L1-penalized LinearSVC drives
# SelectFromModel, with the number of kept features capped at 5000.
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator,max_features=5000)
# The selector is fit on the training matrix and then applied unchanged to the test matrix.
vectors_train_Lemma_SFML1 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_SFML1 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_SFML1.shape)

(1999, 2091)




In [None]:
# Model-based selection on the stem features: an L1-penalized LinearSVC drives
# SelectFromModel, with the number of kept features capped at 5000.
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator,max_features=5000)
# The selector is fit on the training matrix and then applied unchanged to the test matrix.
vectors_train_stem_SFML1 = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_SFML1 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_SFML1.shape)

(1999, 2064)




### 13.4.2 selectfrommodel tree

In [None]:
# Tree-based selection on the lemma features: fit an ExtraTreesClassifier first, then wrap
# the already-fitted model in SelectFromModel (prefit=True).
# NOTE(review): no random_state is passed, so the selected feature set may differ between runs.
clf = ExtraTreesClassifier()
clf = clf.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
# Here `model` is the feature selector, not a classifier; it rebinds the notebook-wide `model` name.
model = SelectFromModel(clf, prefit=True)
# Only the training matrix is transformed in this cell; no test counterpart is produced.
vectors_train_Lemma_SFMtree = model.transform(vectors_train_stop_tfidf_Lemma)
vectors_train_Lemma_SFMtree.shape

(1999, 9466)

In [None]:
# Tree-based selection on the stem features: fit an ExtraTreesClassifier first, then wrap
# the already-fitted model in SelectFromModel (prefit=True).
# NOTE(review): no random_state is passed, so the selected feature set may differ between runs.
clf = ExtraTreesClassifier()
clf = clf.fit(vectors_train_stop_tfidf_stem, y_train_num)
# Here `model` is the feature selector, not a classifier; it rebinds the notebook-wide `model` name.
model = SelectFromModel(clf, prefit=True)
# Only the training matrix is transformed in this cell; no test counterpart is produced.
vectors_train_stem_SFMtree = model.transform(vectors_train_stop_tfidf_stem)
vectors_train_stem_SFMtree.shape

(1999, 9776)

### Recursive feature elimination

In [53]:
# Recursive feature elimination with cross-validation on the lemma features, driven by a
# default LinearSVC; step=700 controls how many features are dropped per round, candidates
# are scored by accuracy and all available cores are used (n_jobs=-1).
estimator = LinearSVC()
select = RFECV(estimator, step=700,scoring='accuracy', n_jobs=-1)
vectors_train_Lemma_RFESVC = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
# Apply the feature mask learned on the training matrix to the test matrix.
vectors_test_Lemma_RFESVC = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFESVC.shape)

(1999, 18614)


In [55]:
# Mean cross-validated accuracy of a default LinearSVC on the RFECV-selected lemma features.
# No `cv` argument is given, so the library's default fold count applies.
model = LinearSVC()
# model.fit(vectors_train_Lemma_RFESVC, y_num)
cross_val_score(model, vectors_train_Lemma_RFESVC, y_train_num).mean()

0.9384661654135338

In [None]:
# RFECV on the stem features, driven by a default LinearSVC (step=85, scored by accuracy).
# No output is recorded for this cell in the notebook.
estimator = LinearSVC()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFESVC = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
# Apply the feature mask learned on the training matrix to the test matrix.
vectors_test_stem_RFESVC = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFESVC.shape)

In [None]:
# RFECV on the lemma features, driven by a default LogisticRegression (step=98, scored by accuracy).
# No output is recorded for this cell in the notebook.
estimator = LogisticRegression()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFELR = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
# Apply the feature mask learned on the training matrix to the test matrix.
vectors_test_Lemma_RFELR = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFELR.shape)

In [None]:
# RFECV on the stem features, driven by a default LogisticRegression (step=85, scored by accuracy).
# No output is recorded for this cell in the notebook.
estimator = LogisticRegression()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFELR = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
# Apply the feature mask learned on the training matrix to the test matrix.
vectors_test_stem_RFELR = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFELR.shape)

In [None]:
# RFECV on the lemma features, driven by a default MultinomialNB (step=98, scored by accuracy).
# No output is recorded for this cell in the notebook.
# NOTE(review): RFECV needs per-feature importances from the estimator; confirm that
# MultinomialNB exposes them in the installed scikit-learn version.
estimator = MultinomialNB()
select = RFECV(estimator, step=98,scoring='accuracy')
vectors_train_Lemma_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
# Apply the feature mask learned on the training matrix to the test matrix.
vectors_test_Lemma_RFEMNB = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFEMNB.shape)

In [None]:
# RFECV on the stem features, driven by a default MultinomialNB (step=85, scored by accuracy).
# No output is recorded for this cell in the notebook.
# NOTE(review): RFECV needs per-feature importances from the estimator; confirm that
# MultinomialNB exposes them in the installed scikit-learn version.
estimator = MultinomialNB()
select = RFECV(estimator, step=85,scoring='accuracy')
vectors_train_stem_RFEMNB = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
# Apply the feature mask learned on the training matrix to the test matrix.
vectors_test_stem_RFEMNB = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_RFEMNB.shape)

### Experiment on sklearn models (with NGRAM)

### Find the best set of features
We have 16 sets in total


The best are 

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_Lemma_SFML1

In [None]:
# Mean 10-fold CV accuracy of a default LinearSVC on each of the 16 candidate feature sets.
# One number is printed per set, in the order listed below.
# NOTE(review): apart from the *_SFML1 and *_SFMtree matrices, these feature-set variables are
# not created anywhere in the visible notebook; confirm the cells that build them ran first.
feature_sets = [
    vectors_train_Lemma_X2, vectors_train_stem_X2,
    vectors_train_Lemma_mutual, vectors_train_stem_mutual,
    vectors_train_Lemma_F, vectors_train_stem_F,
    vectors_train_Lemma_FPR, vectors_train_stem_FPR,
    vectors_train_Lemma_FDR, vectors_train_stem_FDR,
    vectors_train_Lemma_FWE, vectors_train_stem_FWE,
    vectors_train_Lemma_SFML1, vectors_train_stem_SFML1,
    vectors_train_Lemma_SFMtree, vectors_train_stem_SFMtree,
]
for features in feature_sets:
    model = LinearSVC()  # fresh estimator for every feature set
    scores = cross_val_score(model, features, y_train_num, cv=10)
    print(scores.mean())

0.9459798994974875
0.9449748743718593
0.934464824120603
0.930462311557789
0.9469748743718593
0.9449773869346734
0.9384698492462311
0.9424723618090451
0.9214572864321608
0.927467336683417
0.8869422110552764
0.8869422110552764
0.9484748743718594
0.9464723618090451
0.938464824120603
0.9339623115577889


In [None]:
# Mean 10-fold CV accuracy of a default LogisticRegression on each of the 16 candidate feature sets.
# One number is printed per set, in the order listed below.
# NOTE(review): apart from the *_SFML1 and *_SFMtree matrices, these feature-set variables are
# not created anywhere in the visible notebook; confirm the cells that build them ran first.
feature_sets = [
    vectors_train_Lemma_X2, vectors_train_stem_X2,
    vectors_train_Lemma_mutual, vectors_train_stem_mutual,
    vectors_train_Lemma_F, vectors_train_stem_F,
    vectors_train_Lemma_FPR, vectors_train_stem_FPR,
    vectors_train_Lemma_FDR, vectors_train_stem_FDR,
    vectors_train_Lemma_FWE, vectors_train_stem_FWE,
    vectors_train_Lemma_SFML1, vectors_train_stem_SFML1,
    vectors_train_Lemma_SFMtree, vectors_train_stem_SFMtree,
]
for features in feature_sets:
    model = LogisticRegression()  # fresh estimator for every feature set
    scores = cross_val_score(model, features, y_train_num, cv=10)
    print(scores.mean())

0.9364798994974877
0.9354798994974877
0.9289748743718593
0.9289723618090452
0.9374773869346734
0.9309748743718593
0.9329723618090451
0.927467336683417
0.9079522613065327
0.9109572864321608
0.8764422110552765
0.8764422110552765
0.9274698492462312
0.9264698492462312
0.9304723618090451
0.926469849246231


In [None]:
# Mean 10-fold CV accuracy of a default MultinomialNB on each of the 16 candidate feature sets.
# One number is printed per set, in the order listed below.
# NOTE(review): apart from the *_SFML1 and *_SFMtree matrices, these feature-set variables are
# not created anywhere in the visible notebook; confirm the cells that build them ran first.
feature_sets = [
    vectors_train_Lemma_X2, vectors_train_stem_X2,
    vectors_train_Lemma_mutual, vectors_train_stem_mutual,
    vectors_train_Lemma_F, vectors_train_stem_F,
    vectors_train_Lemma_FPR, vectors_train_stem_FPR,
    vectors_train_Lemma_FDR, vectors_train_stem_FDR,
    vectors_train_Lemma_FWE, vectors_train_stem_FWE,
    vectors_train_Lemma_SFML1, vectors_train_stem_SFML1,
    vectors_train_Lemma_SFMtree, vectors_train_stem_SFMtree,
]
for features in feature_sets:
    model = MultinomialNB()  # fresh estimator for every feature set
    scores = cross_val_score(model, features, y_train_num, cv=10)
    print(scores.mean())

0.9179673366834171
0.9209648241206029
0.9164673366834171
0.9144648241206029
0.9329723618090451
0.928967336683417
0.9159698492462311
0.9214623115577888
0.8879497487437185
0.8814522613065326
0.839929648241206
0.839929648241206
0.926469849246231
0.9324673366834171
0.9229698492462312
0.9234648241206029


### Grid search
The best features:

vectors_train_Lemma_X2

vectors_train_Lemma_F

vectors_train_Lemma_SFML1

#### vectors_train_Lemma_X2
Best models: LinearSVM, MultiNB, BernoulliNB

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 10-fold grid search over LinearSVC's C on vectors_train_Lemma_X2, using all cores.
parameters = {'C': (0.01, 0.1, 1, 10, 100, 1000)}
model = LinearSVC()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(vectors_train_Lemma_X2, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9459798994974875
C: 1


In [None]:
# 10-fold grid search over MultinomialNB's alpha on vectors_train_Lemma_X2, using all cores.
parameters = {'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)}
model = MultinomialNB()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(vectors_train_Lemma_X2, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9554849246231155
alpha: 1e-10


In [None]:
# BernoulliNB is not imported by any other cell of this notebook; import it here so the
# cell also runs on a fresh kernel.
from sklearn.naive_bayes import BernoulliNB

# 10-fold grid search over BernoulliNB's alpha on vectors_train_Lemma_X2, using all cores.
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_X2, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9469773869346734
alpha: 1e-10


#### vectors_train_Lemma_F
Best model: MultiNB, LinearSVM, BernoulliNB

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 10-fold grid search over LinearSVC's C on vectors_train_Lemma_F, using all cores.
parameters = {'C': (0.01, 0.1, 1, 10, 100, 1000)}
model = LinearSVC()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(vectors_train_Lemma_F, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9469748743718593
C: 1


In [None]:
# 10-fold grid search over MultinomialNB's alpha on vectors_train_Lemma_F, using all cores.
parameters = {'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)}
model = MultinomialNB()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(vectors_train_Lemma_F, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9639874371859296
alpha: 1e-05


In [None]:
# BernoulliNB is not imported by any other cell of this notebook; import it here so the
# cell also runs on a fresh kernel.
from sklearn.naive_bayes import BernoulliNB

# 10-fold grid search over BernoulliNB's alpha on vectors_train_Lemma_F, using all cores.
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_F, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9594824120603015
alpha: 1e-10


#### vectors_train_Lemma_SFML1
Best model: MultiNB, LinearSVM, BernoulliNB

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 10-fold grid search over LinearSVC's C on vectors_train_Lemma_SFML1, using all cores.
parameters = {'C': (0.01, 0.1, 1, 10, 100, 1000)}
model = LinearSVC()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(vectors_train_Lemma_SFML1, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9594874371859297
C: 10


In [None]:
# 10-fold grid search over MultinomialNB's alpha on vectors_train_Lemma_SFML1, using all cores.
parameters = {'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)}
model = MultinomialNB()
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1).fit(vectors_train_Lemma_SFML1, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9439773869346734
alpha: 0.1


In [None]:
# BernoulliNB is not imported by any other cell of this notebook; import it here so the
# cell also runs on a fresh kernel.
from sklearn.naive_bayes import BernoulliNB

# 10-fold grid search over BernoulliNB's alpha on vectors_train_Lemma_SFML1, using all cores.
model = BernoulliNB()
parameters = {
    'alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(model, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_Lemma_SFML1, y_train_num)
# Report the best mean CV score, then the winning value of each tuned parameter.
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9519874371859297
alpha: 0.1


### Best model: LinearSVM on vectors_train_Lemma_SFML1

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Per-fold 10-fold CV accuracy of LinearSVC(C=10) on the L1-selected lemma features.
model = LinearSVC(C=10)
# NOTE(review): this explicit fit looks unnecessary for the CV scores displayed below; confirm.
model.fit(vectors_train_Lemma_SFML1, y_train_num)
cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10)

array([0.94      , 0.97      , 0.96      , 0.955     , 0.945     ,
       0.96      , 0.985     , 0.955     , 0.95      , 0.97487437])

In [None]:
# Train the final LinearSVC(C=10) on all training rows and predict the test set.
model = LinearSVC(C=10)
model.fit(vectors_train_Lemma_SFML1, y_train_num)
y_pred = model.predict(vectors_test_Lemma_SFML1)
# Map the integer class ids back to subreddit names with the encoder fitted on the training labels.
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
# Build the prediction table (test ids plus predicted subreddit) and save it without the index column.
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [None]:
# Read the written file back and show its first rows as a quick check.
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,anime
3,3,science
4,4,science


### Second Best model: MultiNB on vectors_train_Lemma_F

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Per-fold 10-fold CV accuracy of MultinomialNB(alpha=1e-5) on vectors_train_Lemma_F.
model = MultinomialNB(alpha=1e-5)
# model.fit(vectors_train_Lemma_F, y_train_num)
cross_val_score(model, vectors_train_Lemma_F, y_train_num, cv=10)

array([0.96      , 0.985     , 0.95      , 0.975     , 0.94      ,
       0.965     , 0.98      , 0.955     , 0.955     , 0.97487437])

In [None]:
# Train MultinomialNB(alpha=1e-5) on all training rows and predict the test set.
# NOTE(review): vectors_train_Lemma_F / vectors_test_Lemma_F are not created anywhere in the
# visible notebook; confirm the cell that builds them was run first.
model = MultinomialNB(alpha=1e-5)
model.fit(vectors_train_Lemma_F, y_train_num)
y_pred = model.predict(vectors_test_Lemma_F)
# Map the integer class ids back to subreddit names with the encoder fitted on the training labels.
y_pred = le.inverse_transform(y_pred)

#### Write results to CSV

In [None]:
# Build the prediction table (test ids plus predicted subreddit) and save it without the index column.
# NOTE(review): this writes to the same "result.csv" path as the LinearSVC export earlier in the
# notebook, so that file is replaced; confirm this is intended.
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [None]:
# Read the written file back and show its first rows as a quick check.
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,laptop
3,3,science
4,4,science
