In [1]:
import pandas as pd

In [10]:
df_amazon=pd.read_table('amazon_cells_labelled.txt', names=['message', 'label'] )
df_imdb_labelled=pd.read_table('imdb_labelled.txt', names=['message', 'label'])
df_yelp_labelled=pd.read_table('yelp_labelled.txt', names=['message', 'label'])


In [11]:
frames=pd.concat([df_amazon, df_imdb_labelled, df_yelp_labelled], keys=['amazon', 'imdb', 'yelp'])
frames.head()

Unnamed: 0,Unnamed: 1,message,label
amazon,0,So there is no way for me to plug it in here i...,0
amazon,1,"Good case, Excellent value.",1
amazon,2,Great for the jawbone.,1
amazon,3,Tied to charger for conversations lasting more...,0
amazon,4,The mic is great.,1


In [12]:
frames.shape

(2748, 2)

In [13]:
frames.isnull().sum()

message    0
label      0
dtype: int64

## Working with spacy

In [17]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [16]:
print(spacy.__version__)

2.3.2


In [18]:
doc=nlp("Let's go to N.Y.!")

In [19]:
for token in doc:
    print(token.text)

Let
's
go
to
N.Y.
!


In [23]:
from spacy.lang.en import STOP_WORDS

In [24]:
stopwords=list(STOP_WORDS)

### Lemmatization

In [31]:
docx = nlp("This is how John Walker was walking. He was also running beside the lawn.")

In [27]:
for word in docx:
    print(word.text,"Lemma >>", word.lemma_)

This Lemma >> this
is Lemma >> be
how Lemma >> how
John Lemma >> John
Walker Lemma >> Walker
was Lemma >> be
walking Lemma >> walk
. Lemma >> .
He Lemma >> -PRON-
was Lemma >> be
also Lemma >> also
running Lemma >> run
beside Lemma >> beside
the Lemma >> the
lawn Lemma >> lawn
. Lemma >> .


In [36]:
# Lemma that are not pronouns

docx_new=[word.lemma_.lower().strip() if word.lemma_!="-PRON-" else word.lower_ for word in docx]
docx_new

['this',
 'be',
 'how',
 'john',
 'walker',
 'be',
 'walk',
 '.',
 'he',
 'be',
 'also',
 'run',
 'beside',
 'the',
 'lawn',
 '.']

In [56]:
new=[ word for word in docx if word.is_stop == False and not word.is_punct ]
new

[John, Walker, walking, running, lawn]

## Performing all operations done above using function

In [68]:
import string

punctuations=string.punctuation

In [69]:
# Creating a spacy parser
from spacy.lang.en import English

parser=English()

In [81]:
def spacy_tokenizer(sentence):
    mytokens=parser(sentence)
    mytokens=[word.lemma_.lower().strip() if word.lemma_!="-PRON-" else word.lower_ for word in mytokens]
    mytokens=[word for word in mytokens if word not in stopwords and word not in punctuations]
    
    return mytokens
    

## Machine learning with SkLearn

In [71]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [72]:

#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

### Count Vectorizer

In [82]:
cntvectorizer=CountVectorizer(tokenizer=spacy_tokenizer)

### TFID Vectorizer

In [83]:
tfvectorizer=TfidfVectorizer(tokenizer=spacy_tokenizer)

### Classifier

In [75]:
classifier=LinearSVC()

### Split the data into train test set

In [87]:
X=frames['message']
y=frames['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

### Create the pipeline with Transformer(cleaner), Count Vectorizer and Classifier

In [90]:
pipe_countvectorizer=Pipeline([('cleaner', predictors()), 
                              ('vectorizer', cntvectorizer),
                              ('classifier', classifier)])

In [91]:
pipe_countvectorizer.fit(X_train, y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x000001BEFA65E2B0>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x000001BEFC754430>)),
                ('classifier', LinearSVC())])

In [93]:
print("Accuracy with Count vectorizer", pipe_countvectorizer.score(X_test, y_test))

Accuracy with Count vectorizer 0.7763636363636364


### Create the pipeline with Transformer(cleaner), TFID vectorizer, Classifier

In [95]:
pipe_tfidvectorizer=Pipeline([('cleaner', predictors()), 
                              ('vectorizer', tfvectorizer), 
                              ('classifier', classifier)])

In [96]:
pipe_tfidvectorizer.fit(X_train, y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x000001BEF517EB50>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x000001BEFC754430>)),
                ('classifier', LinearSVC())])

In [97]:
print("Accuracy with TFID Vectorizer", pipe_tfidvectorizer.score(X_test, y_test))

Accuracy with TFID Vectorizer 0.82


### Let us check the most common words in the amazon dataset

In [98]:
from collections import Counter

In [103]:
word_freq=frames['message'].to_list()

In [104]:
common_words=Counter(word_freq)

In [105]:
common_words.most_common(10)

[('Works great!.', 2),
 ('If you like a loud buzzing to override all your conversations, then this phone is for you!',
  2),
 ("Don't buy this product.", 2),
 ('Great phone!.', 2),
 ('Works great.', 2),
 ('Great Phone.', 2),
 ('This is a great deal.', 2),
 ('Excellent product for the price.', 2),
 ('Does not fit.', 2),
 ('Great phone.', 2)]