<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory so the relative paths used below resolve from here.
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the train and test CSV files from the working directory with pandas'
# Python parsing engine.
train, test = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in ('./train.csv', './test.csv')
)

In [5]:
# Pull the text column (model input) and the subreddit column (target) out of
# the loaded frames.
X_train = train["body"]  # train texts
y_train = train["subreddit"]  # train subreddits
X_test = test["body"]  # test texts

In [8]:
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [39]:
# transform target labels to values
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values) # convert category from string to numerical (!!!!! update the variables in kcross fold)

# vectorize word count
vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(X_train)
vectors_test = vectorizer.transform(X_test)

normalizer_train = Normalizer()
vectors_train= normalizer_train.transform(vectors_train)
vectors_test= normalizer_train.transform(vectors_test)

# print(vectorizer.get_feature_names())
print(vectors_train.shape)
print(vectors_test.shape)

(1999, 15365)
(1378, 15365)


In [46]:
import nltk

# Download the NLTK resources used by word_tokenize, WordNetLemmatizer and pos_tag.
# The first three are the names this notebook originally fetched; the last two
# are the names newer NLTK releases look up for the tokenizer and the tagger.
# Requesting a resource that is already present is a no-op, and an unknown one
# only prints a message.
for _nltk_resource in (
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [47]:
# put it all together: remove stop words and punctuation, tfidf, lemmatization, normalization
# Use scikit-learn's built-in English stop-word collection.
stop_words = text.ENGLISH_STOP_WORDS

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word`` based on ``nltk.pos_tag``.

    The word is tagged on its own (as a one-element token list). The first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

class New_LemmaTokenizer:
    """Callable tokenizer that lemmatizes the alphabetic tokens of a document.

    Calling an instance with a document string word-tokenizes it, drops every
    token that is not purely alphabetic, and returns the list of lemmas, each
    lemmatized with the POS reported by ``get_wordnet_pos``.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        lemmas = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                continue
            lemmas.append(self.wnl.lemmatize(token, pos=get_wordnet_pos(token)))
        return lemmas

# Lemma pipeline: counts (stop words removed, lemmatized tokens) -> TF-IDF -> normalization.
tf_idf_transformer = TfidfTransformer()
# Pass the stop words as a list: recent scikit-learn releases validate
# `stop_words` as 'english', a list or None and reject a frozenset. Older
# releases accept the list as well. sorted() just gives a deterministic order.
# NOTE(review): the recorded output shows scikit-learn's warning that the stop
# words may be inconsistent with the custom tokenizer's output.
vectorizer = CountVectorizer(stop_words=sorted(stop_words), tokenizer=New_LemmaTokenizer())

# Fit the vocabulary and the IDF weights on the training texts only, then apply
# the fitted transformers to the test texts.
vectors_train_stop_tfidf_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_tfidf_Lemma)

# `normalizer_train` is the Normalizer created in the earlier count-vectorizer cell.
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)

# print(vectorizer.get_feature_names())
print(vectors_train_stop_tfidf_Lemma.shape)
print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 9779)
(1378, 9779)


In [48]:
# remove stopwords and punctuation, tfidf, stemming, normalization
# Use scikit-learn's built-in English stop-word collection.
stop_words = text.ENGLISH_STOP_WORDS

class StemTokenizer:
    """Callable tokenizer that Porter-stems the alphabetic tokens of a document.

    Calling an instance with a document string word-tokenizes it, drops every
    token that is not purely alphabetic, and returns the list of stems.
    """

    def __init__(self):
        # The attribute is named `wnl` but it holds a PorterStemmer.
        self.wnl = PorterStemmer()

    def __call__(self, doc):
        stems = []
        for token in word_tokenize(doc):
            if token.isalpha():
                stems.append(self.wnl.stem(token))
        return stems

# Stem pipeline: counts (stop words removed, stemmed tokens) -> TF-IDF -> normalization.
tf_idf_transformer = TfidfTransformer()
# Pass the stop words as a list: recent scikit-learn releases validate
# `stop_words` as 'english', a list or None and reject a frozenset. Older
# releases accept the list as well. sorted() just gives a deterministic order.
# NOTE(review): the recorded output shows scikit-learn's warning that the stop
# words may be inconsistent with the custom tokenizer's output.
vectorizer = CountVectorizer(stop_words=sorted(stop_words), tokenizer=StemTokenizer())

# Fit the vocabulary and the IDF weights on the training texts only, then apply
# the fitted transformers to the test texts.
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)

# `normalizer_train` is the Normalizer created in the earlier count-vectorizer cell.
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 8522)
(1378, 8522)


In [86]:
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, f_classif, mutual_info_classif, RFE, SelectFromModel, SequentialFeatureSelector

13.2.1 Chi-squared (chi2)

In [52]:
# Keep the 5000 lemma features that score highest on the chi-squared test
# against the encoded labels; the selector is fitted on the training matrix only.
select = SelectKBest(score_func=chi2, k=5000)
select.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_X2 = select.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_Lemma_X2 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2.shape)

(1999, 5000)


In [54]:
# Keep the 4000 stem features that score highest on the chi-squared test
# against the encoded labels; the selector is fitted on the training matrix only.
select = SelectKBest(score_func=chi2, k=4000).fit(
    vectors_train_stop_tfidf_stem, y_train_num
)
vectors_train_stem_X2 = select.transform(vectors_train_stop_tfidf_stem)
vectors_test_stem_X2 = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_X2.shape)

(1999, 4000)


13.2.2 Mutual information

In [55]:
# Rank the lemma features with mutual_info_classif against the encoded labels
# and keep the top 5000; the selector is fitted on the training matrix only.
select = SelectKBest(score_func=mutual_info_classif, k=5000)
select.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_mutual = select.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_Lemma_mutual = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_mutual.shape)

(1999, 5000)


In [57]:
# Rank the stem features with mutual_info_classif against the encoded labels
# and keep the top 4000; the selector is fitted on the training matrix only.
select = SelectKBest(score_func=mutual_info_classif, k=4000).fit(
    vectors_train_stop_tfidf_stem, y_train_num
)
vectors_train_stem_mutual = select.transform(vectors_train_stop_tfidf_stem)
vectors_test_stem_mutual = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_mutual.shape)

(1999, 4000)


## 13.2.3 F score

In [58]:
# Keep the 6000 lemma features with the highest f_classif score against the
# encoded labels; the selector is fitted on the training matrix only.
select = SelectKBest(score_func=f_classif, k=6000)
select.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_F = select.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_Lemma_F = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_F.shape)

(1999, 6000)


In [59]:
# Keep the 5000 stem features with the highest f_classif score against the
# encoded labels; the selector is fitted on the training matrix only.
# Fix: the *_stem_F matrices were previously built from the Lemma features
# (copy-paste slip); they now come from the stemmed TF-IDF matrices.
select = SelectKBest(f_classif, k=5000)
vectors_train_stem_F = select.fit_transform(vectors_train_stop_tfidf_stem, y_train_num)
vectors_test_stem_F = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_F.shape)

(1999, 5000)


13.3 RFE

In [66]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import KFold, cross_val_score

In [73]:
# Recursive feature elimination on the lemma features: a logistic regression is
# refitted repeatedly and features are dropped (step=0.1) until 5000 remain.
estimator = LogisticRegression()
select = RFE(estimator=estimator, n_features_to_select=5000, step=0.1)
select.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_RFE = select.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_Lemma_RFE = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFE.shape)

(1999, 5000)


In [None]:
# Recursive feature elimination on the stem features: a logistic regression is
# refitted repeatedly and features are dropped (step=0.1) until 5000 remain.
estimator = LogisticRegression()
select = RFE(estimator=estimator, n_features_to_select=5000, step=0.1).fit(
    vectors_train_stop_tfidf_stem, y_train_num
)
vectors_train_stem_RFE = select.transform(vectors_train_stop_tfidf_stem)
vectors_test_stem_RFE = select.transform(vectors_test_stop_tfidf_stem)

13.4 SelectFromModel

In [76]:
# Model-based selection on the lemma features using an L1-penalized linear SVM;
# max_features caps the number of kept features at 5000.
# NOTE(review): the recorded output of this cell is (1999, 1), i.e. only one
# feature survived. Presumably C=0.01 regularizes so strongly that almost every
# coefficient is zero; confirm and consider a larger C.
estimator = LinearSVC(penalty="l1", dual=False, C=0.01)
select = SelectFromModel(estimator, max_features=5000)
select.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_SFM = select.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_Lemma_SFM = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_SFM.shape)

(1999, 1)


In [None]:
# Model-based selection on the stem features using an L1-penalized linear SVM;
# max_features caps the number of kept features at 5000.
# NOTE(review): no output is recorded for this cell. With C=0.01 the L1 penalty
# may zero out nearly all coefficients, so check the printed shape.
estimator = LinearSVC(penalty="l1", dual=False, C=0.01)
select = SelectFromModel(estimator, max_features=5000)
select.fit(vectors_train_stop_tfidf_stem, y_train_num)
vectors_train_stem_SFM = select.transform(vectors_train_stop_tfidf_stem)
vectors_test_stem_SFM = select.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stem_SFM.shape)

13.5 SequentialFeatureSelector

In [84]:
# Sequential feature selection of 3000 lemma features with a logistic regression.
# NOTE(review): the recorded run of this cell ended in a NameError. Selecting
# 3000 features one at a time, with the estimator re-evaluated for each
# candidate, is presumably very slow at this feature count; confirm that it is
# feasible before running.
estimator = LogisticRegression()
select = SequentialFeatureSelector(estimator, n_features_to_select=3000)
select.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_SFS = select.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_Lemma_SFS = select.transform(vectors_test_stop_tfidf_Lemma)

NameError: ignored

In [77]:
# Sequential feature selection of 3000 stem features with a logistic regression.
# NOTE(review): the recorded run of this cell ended in a NameError. Selecting
# 3000 features one at a time, with the estimator re-evaluated for each
# candidate, is presumably very slow at this feature count; confirm that it is
# feasible before running.
estimator = LogisticRegression()
select = SequentialFeatureSelector(estimator, n_features_to_select=3000)
select.fit(vectors_train_stop_tfidf_stem, y_train_num)
vectors_train_stem_SFS = select.transform(vectors_train_stop_tfidf_stem)
vectors_test_stem_SFS = select.transform(vectors_test_stop_tfidf_stem)

NameError: ignored