In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
import pandas as pd
import time

In [2]:
# Read the sample dataset (path is relative to the notebook's working directory)
# and preview its first rows.
df = pd.read_csv('data/sample.csv')
df.head()

Unnamed: 0,label,text
0,0,"@vivmondo Haha, it did not feel like a win at ..."
1,0,again stuck with javascript #web2.0 designer ...
2,0,Exhausted...terrible week
3,0,@dstarr I want my bed x
4,0,Hayley is awake &amp; screaming (sick ). I'll...


In [4]:
def data_cleaning(text_list, stopwords_rem=False):
    """Lowercase, tokenize, POS-tag and lemmatize every text in ``text_list``.

    Parameters
    ----------
    text_list : iterable of str
        Texts to clean; each element is lowercased via ``.lower()`` before
        being tokenized with NLTK's ``TweetTokenizer``.
    stopwords_rem : bool, default False
        When True, lemmas found in NLTK's English stopword list are dropped.
        The default keeps every token, which is what this function always did
        before the flag became a parameter. It can be enabled through
        ``FunctionTransformer(data_cleaning, kw_args={'stopwords_rem': True})``.

    Returns
    -------
    list of str
        One string per input text, in input order, made of the lemmatized
        tokens joined by single spaces.
    """
    # Load the stopword corpus only when it is needed, and keep it in a set so
    # each membership test is O(1) instead of a scan over a list.
    stopwords_en = set(stopwords.words('english')) if stopwords_rem else frozenset()
    lemmatizer = WordNetLemmatizer()
    tokenizer = TweetTokenizer()
    tokens_list = []
    for each_text in text_list:
        tokens = tokenizer.tokenize(each_text.lower())
        lemmatized_tokens = []
        for each_token, tag in pos_tag(tokens):
            # Map the tagger's tag onto the POS codes passed to the lemmatizer:
            # noun tags -> 'n', verb tags -> 'v', everything else -> 'a'.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            lemmatized_token = lemmatizer.lemmatize(each_token, pos)
            # The stopword filter is applied to the lemma, not the raw token.
            if stopwords_rem and lemmatized_token in stopwords_en:
                continue
            lemmatized_tokens.append(lemmatized_token)
        tokens_list.append(' '.join(lemmatized_tokens))
    return tokens_list

In [6]:
# Tiny corpus used to illustrate what the cleaner + TF-IDF steps produce.
demo_text = ['Today is a good day', 'Today is a bad day', 'Today is ok']
estimators = [
    ('cleaner', FunctionTransformer(data_cleaning)),
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
]
demo_pipeline = Pipeline(estimators)
demo_ary = demo_pipeline.fit_transform(demo_text).toarray()

# vocabulary_ maps each term to its column index; sorting the terms by that
# index yields column labels in the same order as the columns of demo_ary.
term_to_column = demo_pipeline['vectorizer'].vocabulary_
demo_vocab = sorted(term_to_column, key=term_to_column.get)

# One row per demo sentence, one column per uni-/bi-gram, plus the raw text.
demo_df = pd.DataFrame(demo_ary, columns=demo_vocab)
demo_df['text'] = demo_text
demo_df

Unnamed: 0,bad,bad day,be,be bad,be good,be ok,day,good,good day,ok,today,today be,text
0,0.0,0.0,0.274634,0.0,0.464997,0.0,0.353642,0.464997,0.464997,0.0,0.274634,0.274634,Today is a good day
1,0.464997,0.464997,0.274634,0.464997,0.0,0.0,0.353642,0.0,0.0,0.0,0.274634,0.274634,Today is a bad day
2,0.0,0.0,0.338381,0.0,0.0,0.572929,0.0,0.0,0.0,0.572929,0.338381,0.338381,Today is ok


In [44]:
# Features are the 'text' column, targets the 'label' column of the loaded frame.
X = df['text']
y = df['label']
# Hold out 10% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every score computed from this split, reproducible across
# kernel restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.9, random_state=42
)

In [8]:

# train_set=int(len(X)*0.9)
# X_train=X[:train_set]
# y_train=y[:train_set]
# X_test=X[train_set:]
# y_test=y[train_set:]

In [45]:
# Start the wall-clock timer that a later cell reads to report elapsed time.
start_time = time.time()

# Text cleaning followed by TF-IDF over uni- and bi-grams, capped at 100,000 features.
cleaner_step = ('cleaner', FunctionTransformer(data_cleaning))
vectorizer_step = ('vectorizer', TfidfVectorizer(max_features=100000, ngram_range=(1, 2)))
estimators = [cleaner_step, vectorizer_step]
preprocessing_pipeline = Pipeline(estimators)

# Fit the pipeline on the training texts and keep the resulting feature matrix.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
# For a quick check, fit on a small slice (e.g. X_train[0:5]) or try:
#   preprocessing_pipeline.transform([':)', 'Today is a bad day'])
#   preprocessing_pipeline.transform(df['text'])

In [46]:
# Number of distinct terms kept by the fitted TF-IDF step.
len(preprocessing_pipeline.named_steps['vectorizer'].vocabulary_)

100000

In [47]:
# If the features in 𝑃(𝑋|𝐶) are binary variables, the model is a Bernoulli Naive Bayes;
# if they follow a multinomial distribution, it is a multinomial Naive Bayes; and if they
# are continuous and Gaussian-distributed, it is a Gaussian Naive Bayes (whose only
# parameters are the mean and variance).

In [48]:
# Train the classifier on the TF-IDF training matrix; fit() returns the estimator.
nb = MultinomialNB().fit(X_train_transformed, y_train)

# Seconds elapsed since start_time was set in the preprocessing cell.
elapsed = time.time() - start_time
print(f'Time: {elapsed}')

# Transform the held-out texts with the already-fitted pipeline (no refit).
X_test_transformed = preprocessing_pipeline.transform(X_test)

train_score = nb.score(X_train_transformed, y_train)
test_score = nb.score(X_test_transformed, y_test)
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Time: 0.15418505668640137
Train Score: 0.8276666666666667
Test Score: 0.7886


In [57]:
# Start the wall-clock timer that a later cell reads to report elapsed time.
start_time = time.time()

# Same cleaning + TF-IDF pipeline, this time capped at 150,000 features.
cleaner_step = ('cleaner', FunctionTransformer(data_cleaning))
vectorizer_step = ('vectorizer', TfidfVectorizer(max_features=150000, ngram_range=(1, 2)))
estimators = [cleaner_step, vectorizer_step]
preprocessing_pipeline = Pipeline(estimators)

# Fit the pipeline on the training texts and keep the resulting feature matrix.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
# For a quick check, fit on a small slice (e.g. X_train[0:5]) or try:
#   preprocessing_pipeline.transform([':)', 'Today is a bad day'])
#   preprocessing_pipeline.transform(df['text'])

In [58]:
# Train the classifier on the TF-IDF training matrix; fit() returns the estimator.
nb = MultinomialNB().fit(X_train_transformed, y_train)

# Seconds elapsed since start_time was set in the preprocessing cell.
elapsed = time.time() - start_time
print(f'Time: {elapsed}')

# Transform the held-out texts with the already-fitted pipeline (no refit).
X_test_transformed = preprocessing_pipeline.transform(X_test)

train_score = nb.score(X_train_transformed, y_train)
test_score = nb.score(X_test_transformed, y_test)
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Time: 356.5071060657501
Train Score: 0.8380861111111111
Test Score: 0.790325


In [53]:
# Start the wall-clock timer that a later cell reads to report elapsed time.
start_time = time.time()

# Same cleaning + TF-IDF pipeline, this time capped at 300,000 features.
cleaner_step = ('cleaner', FunctionTransformer(data_cleaning))
vectorizer_step = ('vectorizer', TfidfVectorizer(max_features=300000, ngram_range=(1, 2)))
estimators = [cleaner_step, vectorizer_step]
preprocessing_pipeline = Pipeline(estimators)

# Fit the pipeline on the training texts and keep the resulting feature matrix.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
# For a quick check, fit on a small slice (e.g. X_train[0:5]) or try:
#   preprocessing_pipeline.transform([':)', 'Today is a bad day'])
#   preprocessing_pipeline.transform(df['text'])

In [54]:
# Train the classifier on the TF-IDF training matrix; fit() returns the estimator.
nb = MultinomialNB().fit(X_train_transformed, y_train)

# Seconds elapsed since start_time was set in the preprocessing cell.
elapsed = time.time() - start_time
print(f'Time: {elapsed}')

# Transform the held-out texts with the already-fitted pipeline (no refit).
X_test_transformed = preprocessing_pipeline.transform(X_test)

train_score = nb.score(X_train_transformed, y_train)
test_score = nb.score(X_test_transformed, y_test)
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Time: 399.9984641075134
Train Score: 0.8587277777777778
Test Score: 0.7915
