<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/pipelinenew.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


## Load the data and get basic features

In [None]:
import pandas as pd
import numpy as np

In [None]:
train = pd.read_csv('./train.csv',engine='python')
test = pd.read_csv('./test.csv',engine='python')

In [None]:
X_train = train.body  # train texts
y_train = train.subreddit # train subreddits
X_test = test.body  # test texts

In [None]:
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [None]:
# transform target labels to values
le = LabelEncoder()
y_train_num = le.fit_transform(y_train.values) # convert category from string to numerical (!!!!! update the variables in kcross fold)

# vectorize word count
vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(X_train)
vectors_test = vectorizer.transform(X_test)

normalizer_train = Normalizer()
vectors_train= normalizer_train.transform(vectors_train)
vectors_test= normalizer_train.transform(vectors_test)

# print(vectorizer.get_feature_names())
print(vectors_train.shape)
print(vectors_test.shape)

(1999, 15365)
(1378, 15365)


In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [169]:
# put it all together: remove stop words and punctuation, tfidf, lemmatization, normalization
stop_words = text.ENGLISH_STOP_WORDS

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

class New_LemmaTokenizer:
     def __init__(self):
       self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
       return [self.wnl.lemmatize(t,pos =get_wordnet_pos(t)) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer = New_LemmaTokenizer(), ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop_Lemma = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_Lemma = tf_idf_transformer.fit_transform(vectors_train_stop_Lemma)
vectors_test_stop_Lemma = vectorizer.transform(X_test)
vectors_test_stop_tfidf_Lemma = tf_idf_transformer.transform(vectors_test_stop_Lemma)
vectors_train_stop_tfidf_Lemma = normalizer_train.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_Lemma = normalizer_train.transform(vectors_test_stop_tfidf_Lemma)
normalizer_l1 = Normalizer(norm='l1')
vectors_train_stop_tfidf_l1_Lemma = normalizer_l1.transform(vectors_train_stop_tfidf_Lemma)
vectors_test_stop_tfidf_l1_Lemma = normalizer_l1.transform(vectors_test_stop_tfidf_Lemma)

#print(vectorizer.get_feature_names())
print(vectors_train_stop_tfidf_Lemma.shape)
#print(vectors_test_stop_tfidf_Lemma.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 70414)


In [None]:
# remove stopwords and punctuation, tfidf, stemming, normalization
stop_words = text.ENGLISH_STOP_WORDS

class StemTokenizer:
     def __init__(self):
       self.wnl =PorterStemmer()
     def __call__(self, doc):
       return [self.wnl.stem(t) for t in word_tokenize(doc) if t.isalpha()]

tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, tokenizer=StemTokenizer(),ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop_tfidf_stem = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf_stem = tf_idf_transformer.fit_transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = vectorizer.transform(X_test)
vectors_test_stop_tfidf_stem = tf_idf_transformer.transform(vectors_test_stop_tfidf_stem)
vectors_train_stop_tfidf_stem = normalizer_train.transform(vectors_train_stop_tfidf_stem)
vectors_test_stop_tfidf_stem = normalizer_train.transform(vectors_test_stop_tfidf_stem)
print(vectors_train_stop_tfidf_stem.shape)
print(vectors_test_stop_tfidf_stem.shape)

  'stop_words.' % sorted(inconsistent))


(1999, 73597)
(1378, 73597)


### ngram

In [None]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words = stop_words)),
    ('tfidf', TfidfTransformer()),
    ('normalize',Normalizer()),
    ('select', RFECV(estimator=LinearSVC(),step=2800))
])

parameters = {  
    'vect__ngram_range': ((1,1),(1, 3), (1, 2),(2,2),(3,3)),
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(X_train, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9474773869346734
vect__ngram_range: (1, 2)


## 13.2

In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, mutual_info_classif, f_classif, SelectFpr, SelectFwe, SelectFdr, RFE, RFECV, SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import KFold, cross_val_score

### linearSVC without lemma

In [None]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('normalize',Normalizer()),
    ('select', SelectPercentile()),
    ('clf', LinearSVC()),
])

parameters = {  
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__stop_words':(None, text.ENGLISH_STOP_WORDS),
    #'vect__max_features':
    'tfidf__use_idf': (True, False),
    'normalize__norm': ('l1','l2'),
    'select__percentile': (20, 40, 60, 80, 100),
    'select__score_func': (chi2, f_classif), #mutual_info_classif,
    'clf__C': (0.01, 0.1, 1, 10, 100)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(X_train, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9514824120603015
clf__C: 10
normalize__norm: 'l2'
select__percentile: 100
select__score_func: <function chi2 at 0x7f54485fc320>
tfidf__use_idf: True
vect__ngram_range: (1, 2)
vect__stop_words: frozenset({'yourself', 'ltd', 'afterwards', 'only', 'already', 'everyone', 'move', 'by', 'themselves', 'someone', 'detail', 'hereafter', 'becomes', 'down', 'then', 'rather', 'thin', 'though', 'ie', 'over', 'latter', 'sometime', 'on', 'un', 'with', 'ten', 'through', 'meanwhile', 'no', 'whom', 'becoming', 'most', 'sixty', 'very', 'beforehand', 'whence', 'will', 'because', 'thereby', 'go', 'below', 'were', 'an', 'whereby', 'much', 're', 'whoever', 'always', 'sometimes', 'found', 'has', 'some', 'she', 'see', 'during', 'without', 'but', 'de', 'last', 'top', 'these', 'everything', 'others', 'mostly', 'there', 'may', 'could', 'put', 'how', 'those', 'onto', 'except', 'four', 'anyway', 'among', 'be', 'its', 'cry', 'enough', 'whatever', 'thence', 'else', 'which', 'none', 'himself', 'never', 'moreover', '

In [None]:
tf_idf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words = stop_words, ngram_range=(1, 2)) #unigram+bigram:ngram_range=(1, 2), only bigram:ngram_range=(2, 2)
vectors_train_stop = vectorizer.fit_transform(X_train)
vectors_train_stop_tfidf = tf_idf_transformer.fit_transform(vectors_train_stop)
vectors_test_stop = vectorizer.transform(X_test)
vectors_test_stop_tfidf = tf_idf_transformer.transform(vectors_test_stop_Lemma)
vectors_train_stop_tfidf = normalizer_train.transform(vectors_train_stop_tfidf)
vectors_test_stop_tfidf = normalizer_train.transform(vectors_test_stop_tfidf)

select = SelectPercentile(chi2, percentile=100)
vectors_train_X2_SVC = select.fit_transform(vectors_train_stop_tfidf, y_train_num)
vectors_test_X2_SVC = select.transform(vectors_test_stop_tfidf)
print(vectors_train_X2_SVC.shape)

(1999, 89095)


### linearSVC with lemma

In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('normalize',Normalizer()),
    ('select', SelectPercentile()),
    ('clf', LinearSVC()),
])

parameters = {
    'tfidf__use_idf': (True, False),
    'normalize__norm': ('l1','l2'),
    'select__percentile': (20, 40, 60, 80, 100),
    'select__score_func': (chi2, f_classif),# ，mutual_info_classif
    'clf__C': (0.01, 0.1, 1, 10, 100)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))



0.940969849246231
clf__C: 1
normalize__norm: 'l2'
select__percentile: 40
select__score_func: <function chi2 at 0x7fa8ddef2290>
tfidf__use_idf: True


In [172]:
pipeline = Pipeline([
    ('select', SelectPercentile(chi2)),
    ('clf', LinearSVC()),
])

parameters = {
    'select__percentile': (46, 50,54),
    #'select__score_func': (chi2, f_classif),# ，mutual_info_classif
    #'clf__C': (0.01, 0.1, 1, 10, 100)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))



0.9399723618090452
select__percentile: 50


In [None]:
pipeline = Pipeline([
    ('select', SelectPercentile()),
    ('clf', LinearSVC()),
])

parameters = {
    'select__percentile': (5 ,10, 20, 40, 80),
    'select__score_func': (chi2, f_classif),# ，mutual_info_classif
    'clf__C': (0.01, 0.1, 1, 10, 100)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

In [185]:
select = SelectPercentile(chi2, percentile=40)
vectors_train_Lemma_X2_SVC = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2_SVC = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2_SVC.shape)

(1999, 28165)


### MultinomialNB with Lemma

In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('normalize',Normalizer()),
    ('select', SelectPercentile()),
    ('clf', MultinomialNB())
])

parameters = {   
    'tfidf__use_idf': (True, False),
    'normalize__norm': ('l1','l2'),
    'select__percentile': (20, 40, 60, 80, 100),
    'select__score_func': (chi2, f_classif),# , mutual_info_classif
    'clf__alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9309723618090452
clf__alpha: 0.1
normalize__norm: 'l2'
select__percentile: 60
select__score_func: <function chi2 at 0x7fa8ddef2290>
tfidf__use_idf: False


In [None]:
select = SelectPercentile(chi2, percentile=60)
vectors_train_Lemma_X2_MNB = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2_MNB = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2_MNB.shape)

(1999, 42248)


### BernoulliNB with lemma

In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('normalize',Normalizer()),
    ('select', SelectPercentile()),
    ('clf', BernoulliNB())
])

parameters = {   
    'tfidf__use_idf': (True, False),
    'normalize__norm': ('l1','l2'),
    'select__percentile': (20, 40, 60, 80, 100),
    'select__score_func': (chi2, f_classif),# , mutual_info_classif
    'clf__alpha': (1e-10, 1e-5, 0.1, 0.5, 1, 2)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9094623115577889
clf__alpha: 1e-05
normalize__norm: 'l2'
select__percentile: 60
select__score_func: <function chi2 at 0x7fa8ddef2290>
tfidf__use_idf: True


In [None]:
select = SelectPercentile(chi2, percentile=60)
vectors_train_Lemma_X2_BNB = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_X2_BNB = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_X2_BNB.shape)

(1999, 42248)


## 13.3

In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('normalize',Normalizer()),
    ('select', RFECV(estimator=LinearSVC())),
])

parameters = {   
    #'tfidf__use_idf': (True, False),
    #'normalize__norm': ('l1','l2'),
    #'select__estimator':('BernoulliNB()','LinearSVC()','MultinomialNB()'), 
    'select__step': (700, 1400, 2800 )
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

0.9394673366834171
select__step: 2800


In [154]:
estimator = LinearSVC(C=1)
select = RFECV(estimator, step=2800, scoring='accuracy')
vectors_train_Lemma_RFESVC = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_RFESVC = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_RFESVC.shape)
model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_RFESVC, y_train_num, cv=10)
print(scores.mean())

(1999, 14414)
0.9449748743718593


## 13.4 L1

In [None]:
estimator = LinearSVC(C=10, penalty="l1",dual=False)
select = SelectFromModel(estimator)
vectors_train_Lemma_SFML1 = select.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_test_Lemma_SFML1 = select.transform(vectors_test_stop_tfidf_Lemma)
print(vectors_train_Lemma_SFML1.shape)

(1999, 2105)




## TruncatedSVD

In [None]:
pipeline = Pipeline([
    ('svd', TruncatedSVD()),
    ('clf', LinearSVC())
])

parameters = {   
    'svd__n_components': (150, 600, 1200, 1800),
    'clf__C': (0.1, 1, 10)
}
gs_model = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1)
gs_model = gs_model.fit(vectors_train_stop_tfidf_Lemma, y_train_num)
print(gs_model.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_model.best_params_[param_name]))

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
#pca = PCA(n_components='mle',svd_solver='full')
svd = TruncatedSVD(n_components=500)
vectors_train_Lemma_svd = svd.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
#vectors_train_Lemma_pca = pca.fit_transform(vectors_train_stop_tfidf_Lemma, y_train_num)
vectors_train_Lemma_svd.shape

(1999, 500)

## models

In [None]:
model = LinearSVC(C=10)
scores = cross_val_score(model, vectors_train_X2_SVC, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC(C=1)
scores = cross_val_score(model, vectors_train_Lemma_X2_SVC, y_train_num, cv=10)
print(scores.mean())

model = MultinomialNB(alpha = 0.1) 
scores = cross_val_score(model, vectors_train_Lemma_X2_MNB, y_train_num, cv=10)
print(scores.mean())

model = BernoulliNB(alpha = 0.1)
scores = cross_val_score(model, vectors_train_Lemma_X2_BNB, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_RFESVC, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10)
print(scores.mean())

model = LinearSVC()
scores = cross_val_score(model, vectors_train_Lemma_svd, y_train_num, cv=10)
print(scores.mean())

0.9429748743718595
0.9479723618090452
0.9409773869346735
0.9249723618090453
0.9449748743718593
0.9489773869346735
0.912464824120603


In [155]:
model = LinearSVC()
model.fit(vectors_train_Lemma_RFESVC, y_train_num)
cross_val_score(model, vectors_train_Lemma_RFESVC, y_train_num, cv=10).mean()

0.9449748743718593

In [159]:
model = LinearSVC()
model.fit(vectors_train_Lemma_RFESVC, y_train_num)
y_pred = model.predict(vectors_test_Lemma_RFESVC)
y_pred = le.inverse_transform(y_pred)

In [186]:
model = LinearSVC()
model.fit(vectors_train_Lemma_X2_SVC, y_train_num)
cross_val_score(model, vectors_train_Lemma_X2_SVC, y_train_num, cv=10).mean()

0.9479723618090452

In [187]:
model = LinearSVC()
model.fit(vectors_train_Lemma_X2_SVC, y_train_num)
y_pred = model.predict(vectors_test_Lemma_X2_SVC)
y_pred = le.inverse_transform(y_pred)

In [182]:
model = LinearSVC(C=10)
model.fit(vectors_train_Lemma_SFML1, y_train_num)
cross_val_score(model, vectors_train_Lemma_SFML1, y_train_num, cv=10).mean()

0.9574874371859297

In [188]:
result = pd.DataFrame({'id': test.id, 'subreddit': y_pred})
result.to_csv("result.csv", index=False)

In [189]:
pred_csv = pd.read_csv('result.csv',engine='python')
pred_csv.head()

Unnamed: 0,id,subreddit
0,0,science
1,1,science
2,2,anime
3,3,science
4,4,science
