In [0]:
#import necessary models
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline



In [9]:
#load the file from local space
# from google.colab import files
# uploaded = files.upload()

# Loading TSV file
df_amazon = pd.read_csv ("amazon_alexa.tsv", sep="\t")
# Top 5 records
df_amazon.head()
# shape of dataframe
# df_amazon.shape
# View data information
# df_amazon.info()

# Feedback Value count
df_amazon.feedback.value_counts()


1    2893
0     257
Name: feedback, dtype: int64

In [0]:
# strip information like stopwords and punctuation, from each review.

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens
    

In [0]:
#  custom transformer for removing initial and end spaces and converting text into lower case.
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [0]:
# When we classify text, we end up with text snippets matched with their respective labels. 
# But we can’t simply use text strings in our machine learning model; we need a way to convert our text into something that can be represented numerically just like the labels (1 for positive and 0 for negative) are. 
# Classifying text in positive and negative labels is called sentiment analysis. So we need a way to represent our text numerically.

# using spacy_tokenizer
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

# The higher the TF-IDF, the more important that term is to that document.
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [0]:
# Splitting The Data into Training and Test Sets
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels = df_amazon['feedback'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [14]:
# Creating a Pipeline and Generating the Model
# pipeline with three components: a cleaner, a vectorizer, and a classifier. 
# The cleaner uses our predictors class object to clean and preprocess the text. 
# The vectorizer uses countvector objects to create the bag of words matrix for our text. 
# The classifier is an object that performs the logistic regression to classify the sentiments.

# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f1ba1066080>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7f1ba255c840>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
             

In [15]:
# Evaluating the model
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))


Logistic Regression Accuracy: 0.9312169312169312
Logistic Regression Precision: 0.9400871459694989
Logistic Regression Recall: 0.9885452462772051


This model correctly identified a comment’s sentiment 94.1% of the time. 
When it predicted a review was positive, that review was actually positive 95% of the time. 
When handed a positive review, our model identified it as positive 98.6% of the time