In [144]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.cluster import MiniBatchKMeans

In [4]:
def load_dataset(filename):
    X, y = [], []
    with open(filename, 'r') as dataset:
        next(dataset)
        for line in dataset:
            sample = line.split(',')
            y.append(sample[0])
            X.append(','.join(sample[2:]))
    return X, y

In [36]:
X, y = load_dataset("train.csv")

In [37]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
#pipeline

In [124]:
pipe1 = Pipeline([("tranformer", CountVectorizer())])#, ("model", MultinomialNB())])
pipe1.fit(X_train, y_train)
#pipe1.score(X_holdout, y_holdout)

Pipeline(steps=[('tranformer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None))])

In [114]:
#customized transform

In [125]:
pipe2 = Pipeline([("tranformer", TfidfVectorizer())])#, ("model", MultinomialNB())])
pipe2.fit(X_train, y_train)
#pipe2.score(X_holdout, y_holdout)

Pipeline(steps=[('tranformer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [116]:
#customized transform

In [126]:
pipe3 = Pipeline([("tranformer", TfidfVectorizer(stop_words = "english", max_df = 0.5, min_df = 3, 
                                                ngram_range=(1, 2)))])# ("model", MultinomialNB())])
pipe3.fit(X_train, y_train)
#pipe3.score(X_holdout, y_holdout)

Pipeline(steps=[('tranformer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [127]:
#union transform

In [145]:
pipe4 = Pipeline([("union", FeatureUnion(transformer_list = [("pipe1", pipe1), ("pipe2", pipe2), ("pipe3", pipe3)])),
                  ("model", LogisticRegression())])
#pipe4 = Pipeline
pipe4.fit(X_train, y_train)

pipe4.score(X_holdout, y_holdout)

0.83375527426160334

In [38]:
ct_vec = CountVectorizer()
tr_X_train = ct_vec.fit_transform(X_train)
tr_X_train_dense = pd.DataFrame(tr_X_train.todense(), columns = [ct_vec.get_feature_names()])

In [40]:
tr_X_holdout = ct_vec.transform(X_holdout)
tr_X_holdout_dense = pd.DataFrame(tr_X_holdout.todense(), columns = [ct_vec.get_feature_names()])

In [42]:
nb = MultinomialNB()

In [43]:
nb.fit(tr_X_train_dense, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
nb.score(tr_X_holdout_dense, y_holdout)

0.81603375527426159

In [45]:
# def pipeline(X_train, y_train, X_holdout, y_holdout, ct_vec, model):
#     tr_X_train = ct_vec.transform(X_train)
#     tr_X_train_dense = pd.DataFrame(tr_X_train.todense(), columns = [ct_vec.get_feature_names()])
    
#     tr_X_holdout = ct_vec.transform(X_holdout)
#     tr_X_holdout_dense = pd.DataFrame(tr_X_holdout.todense(), columns = [ct_vec.get_feature_names()])

#     model.fit(tr_X_train_dense, y_train)
#     return model.score(tr_X_holdout_dense, y_holdout)

In [93]:
pipeline(X_train, y_train, X_holdout, y_holdout, ct_vec, nb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
tr_X_train_dense.head(5)

Unnamed: 0,00,000,01,01k4wu4w,02,05,06,0612,07,075,...,zoanthropes,zobiana,zombies,zone,zoo,zooey,zooming,zuccotti,zuckenberg,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
class MyTransform(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        ct_vec = CountVectorizer(stop_words = "english", max_df = 0.5, min_df = 3, ngram_range=(1, 2))
        ct_vec.fit(X)
        return ct_vec.transform(X)

In [85]:
mytr = MyTransform()

In [86]:
mytr.fit(X_train, y_train)

MyTransform()

In [87]:
new_X = mytr.transform(X_train)

In [96]:
new_X_dense = pd.DataFrame(new_X.todense())

In [97]:
new_X_dense.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3249,3250,3251,3252,3253,3254,3255,3256,3257,3258
2757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2759,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2760,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2761,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
new_X_holdout = mytr.transform(X_holdout)

In [94]:
new_X_holdout_dense = pd.DataFrame(new_X_holdout.todense(), columns = new_X_dense)

AttributeError: 'MyTransform' object has no attribute 'get_feature_names'

In [95]:
nb.score(new_X_holdout_dense, y_holdout)

ValueError: shapes (1185,1498) and (12937,2) not aligned: 1498 (dim 1) != 12937 (dim 0)