### Import Libraries

In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix


In [2]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [3]:
from mlxtend.plotting import plot_confusion_matrix

### Read in Dataset

In [4]:
corpus = pd.read_csv('Spotify_reviews.csv')

In [5]:
# Preview
corpus.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,


## Tagging Reviews
Creating a fucntion to tag the reivew as "Positive", "Negative", or "Neutral"

I will assume a positive reviewer has also left an app rating that is postive. The critia is as follows:


Positive = rating: 4-5

Neutral = rating: 3

Negative = rating: 1-2

### Creating a function to make a new column (Sentiment) to catagorize the reviews

In [6]:
#create a function to compute the negative, neutral and positive analysis
def getAnalysis(Rating):
    if Rating<3:
        return 'negative'
    elif Rating==3:
        return 'neutral'
    else:
        return 'positive'
    
corpus['Sentiment']=corpus['Rating'].apply(getAnalysis)

#show preview
corpus.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply,Sentiment
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,,positive
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,,positive
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,,positive
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,,negative
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,,negative


Now every review is tagged with a sentmient

## Building the Model
Now that the data is tagged with sentiment, we need to build a model to test for accuracy. In other words, is the sentiment tag accurate based on the review...
Again, we will be using the Multinomial Naive Bayes model

### Split the data
similar to what we did in the last model, we will split the data into X and Y. The difference is that this time Y will be the sentiment tag

In [7]:
X = corpus.Review
y = corpus.Sentiment

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.25)

### Cleaning Data

stop words

In [9]:
sw = stopwords.words('english')

def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemimatize words, make everything lowercase, remove number, and punctuation

In [10]:
def doc_preparer(doc, stop_words=sw):
    '''
    
    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
    # print(doc)
    doc = pos_tag(doc)
    doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
    lemmatizer = WordNetLemmatizer() 
    doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
    return ' '.join(doc)

Tokenize

In [11]:
import nltk
nltk.download('averaged_perceptron_tagger')

token_docs = [doc_preparer(doc, sw) for doc in X_train]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


The data is now cleaned, standardized, and ready to start feeding the model!

### Secondary Split

In [12]:
# Secondary train-test split to build our best model
X_test, X_val, y_test, y_val = train_test_split(token_docs, y_train,
                                                test_size=0.25, random_state=42)

### Count Vectorization

CountVectorizer is breaking down a sentence or text into words by performing preprocessing tasks like converting all words to lowercase, thus removing special characters. The result will be the data in form of vectors. 

In [13]:
cv = CountVectorizer()

X_test_vec = cv.fit_transform(X_test)
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vec)
X_test_vec.columns = sorted(cv.vocabulary_)
X_test_vec.set_index(y_test.index, inplace=True)

In [14]:
#previewing results
X_test_vec

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaannndd,aaah,aac,aada,aads,aah,...,zomato,zombie,zombify,zone,zoner,zong,zoom,zpotify,zumo,zuri
24466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21450,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#using the newly vectorized data
X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

### Running the Model

In [16]:
# Multinomial Naive Bayes
mnb = MultinomialNB()

#fitting the model
mnb.fit(X_test_vec, y_test)

MultinomialNB()

### Evaluating the Model

In [17]:
y_pred = mnb.predict(X_test_vec)
from sklearn.metrics import classification_report
classification = classification_report(y_test,y_pred)
print(classification)

              precision    recall  f1-score   support

    negative       0.75      0.89      0.81     14008
     neutral       0.62      0.16      0.26      3845
    positive       0.86      0.87      0.87     16793

    accuracy                           0.80     34646
   macro avg       0.75      0.64      0.65     34646
weighted avg       0.79      0.80      0.78     34646



This model is accurate, with an f-1 accuracy score of an 0.80. It is more accurate than the model classifying reivews based on ratings alone.