### Import Libraries

In [1]:
import pandas as pd
import nltk

### Read Dataset

In [2]:
df = pd.read_csv('/Users/hopmiller/Desktop/Capstone/DataSets/Spotify_reviews.csv')

### Preview Data

In [3]:
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,


In [4]:
# Shape of data
df.shape

(61594, 5)

## Cleaning Data

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Making word lowercase

In [6]:
df['Review']=df['Review'].str.lower()
# Previewing data
df['Review'].head()

0    great music service, the audio is high quality...
1    please ignore previous negative rating. this a...
2    this pop-up "get the best spotify experience o...
3      really buggy and terrible to use as of recently
4    dear spotify why do i get songs that i didn't ...
Name: Review, dtype: object

#### Removing stopwords

In [7]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['Review'] = df['Review'].apply(lambda text: cleaning_stopwords(text))
# Previewing data
df['Review'].head()

0    great music service, audio high quality app ea...
1    please ignore previous negative rating. app su...
2    pop-up "get best spotify experience android 12...
3                   really buggy terrible use recently
4    dear spotify get songs put playlist??? shuffle...
Name: Review, dtype: object

### Standardizing strings

#### Removing puncutation 

In [8]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
df['Review']= df['Review'].apply(lambda x: cleaning_punctuations(x))
# Preview data
df['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android 12 a...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

#### Removing URLs

In [9]:
#importing Regular Expression
import re

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
df['Review'] = df['Review'].apply(lambda x: cleaning_URLs(x))
# Preview
df['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android 12 a...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

#### Removing numbers

In [10]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', ' ', data)
df['Review'] = df['Review'].apply(lambda x: cleaning_numbers(x))
df['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android   an...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

#### Removing short words 
I am opting to take out words with 2 or less characters

In [11]:
def transform_text(text):
    return ' '.join([word for word in text.split() if len(word) > 2])
df['Review'] = df['Review'].apply(lambda x: transform_text(x))
# Preview data
df['Review'].head() 

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android anno...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

### Tokenization 
Using the Natural Language Toolkit, I will split the strings (reviews) into lists of words

In [12]:
# import word_tokenize from Natural Language Toolkit
from nltk.tokenize import TweetTokenizer

# creating a reference variable
tt = TweetTokenizer()
df['Review']=df['Review'].apply(tt.tokenize)
df['Review'].head()

0    [great, music, service, audio, high, quality, ...
1    [please, ignore, previous, negative, rating, a...
2    [popup, get, best, spotify, experience, androi...
3             [really, buggy, terrible, use, recently]
4    [dear, spotify, get, songs, put, playlist, shu...
Name: Review, dtype: object

### Stemming


In [13]:
import nltk
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
df['Review']= df['Review'].apply(lambda x: stemming_on_text(x))
df['Review'].head()

0    [great, music, service, audio, high, quality, ...
1    [please, ignore, previous, negative, rating, a...
2    [popup, get, best, spotify, experience, androi...
3             [really, buggy, terrible, use, recently]
4    [dear, spotify, get, songs, put, playlist, shu...
Name: Review, dtype: object

### Lemmatizer 

In [14]:
import nltk
nltk.download('wordnet')

lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
df['Review'] = df['Review'].apply(lambda x: lemmatizer_on_text(x))
df['Review'].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [great, music, service, audio, high, quality, ...
1    [please, ignore, previous, negative, rating, a...
2    [popup, get, best, spotify, experience, androi...
3             [really, buggy, terrible, use, recently]
4    [dear, spotify, get, songs, put, playlist, shu...
Name: Review, dtype: object

## Tagging Reviews
Creating a fucntion to tag the reivew as "Positive", "Negative", or "Neutral"

I will assume a positive reviewer has also left an app rating that is postive. The critia is as follows:


Positive = rating: 4-5

Neutral = rating: 3

Negative = rating: 1-2

### Creating a function to make a new column (Sentiment) to catagorize the reviews

In [15]:
#create a function to compute the negative, neutral and positive analysis
def getAnalysis(Rating):
    if Rating<3:
        return 'negative'
    elif Rating==3:
        return 'neutral'
    else:
        return 'positive'
    
df['Sentiment']=df['Rating'].apply(getAnalysis)

#show preview
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply,Sentiment
0,2022-07-09 15:00:00,"[great, music, service, audio, high, quality, ...",5,2,,positive
1,2022-07-09 14:21:22,"[please, ignore, previous, negative, rating, a...",5,1,,positive
2,2022-07-09 13:27:32,"[popup, get, best, spotify, experience, androi...",4,0,,positive
3,2022-07-09 13:26:45,"[really, buggy, terrible, use, recently]",1,1,,negative
4,2022-07-09 13:20:49,"[dear, spotify, get, songs, put, playlist, shu...",1,1,,negative


In [16]:
# create two new dataframe all of the positive text
df_positive = df[df['Sentiment'] == 'positive']


# create two new dataframe all of the negative text
df_negative = df[df['Sentiment'] == 'negative']


# create two new dataframe all of the neutral text
df_neutral=df[df['Sentiment'] == 'neutral']

### Counting the number of each type of sentiment in the data set

In [17]:
review_counts = df.Sentiment.value_counts()
review_counts

positive    29937
negative    24771
neutral      6886
Name: Sentiment, dtype: int64

Most reviews are either positive or negative

# Training the Data
Now that I have tagged the tweets as positive, negative, or neutral, it is time to train and test my model

In [18]:
training_data = df

In [19]:
training_data.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply,Sentiment
0,2022-07-09 15:00:00,"[great, music, service, audio, high, quality, ...",5,2,,positive
1,2022-07-09 14:21:22,"[please, ignore, previous, negative, rating, a...",5,1,,positive
2,2022-07-09 13:27:32,"[popup, get, best, spotify, experience, androi...",4,0,,positive
3,2022-07-09 13:26:45,"[really, buggy, terrible, use, recently]",1,1,,negative
4,2022-07-09 13:20:49,"[dear, spotify, get, songs, put, playlist, shu...",1,1,,negative


### Defining Sentiment
Assigning a numerical value to each sentiment

Postive = 2

Negative = 0

Neutral = 1

In [20]:
# Splitting data into X and Y
X = training_data['Review']
Y = training_data['Sentiment']

In [21]:
sentiments = ['negative' , 'neutral', 'positive']
Y = Y.apply(lambda x: sentiments.index(x))

In [22]:
# checking data
Y.head()

0    2
1    2
2    2
3    0
4    0
Name: Sentiment, dtype: int64

### Vectorizing Data

In [23]:
# Import library 
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
X_fit = count_vectorizer.fit_transform(df).toarray()

# Check the shape
X_fit.shape

(6, 6)

## Making a Model
I will be using MultinomialNB- type of Naive Bayes Classifier, which Calculates the probability of sentiment based on the probability of words

In [30]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.21.0
Note: you may need to restart the kernel to use updated packages.


In [31]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix


from mlxtend.plotting import plot_confusion_matrix

In [32]:
corpus = pd.read_csv('Spotify_reviews.csv')

In [33]:
X = corpus.Review
y = corpus.Rating

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.25)

In [35]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
sw = stopwords.words('english')

In [37]:
def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [39]:
def doc_preparer(doc, stop_words=sw):
    '''
    
    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
    # print(doc)
    doc = pos_tag(doc)
    doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
    lemmatizer = WordNetLemmatizer() 
    doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
    return ' '.join(doc)

In [42]:
import nltk
nltk.download('averaged_perceptron_tagger')

token_docs = [doc_preparer(doc, sw) for doc in X_train]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [43]:
# Secondary train-test split to build our best model
X_test, X_val, y_test, y_val = train_test_split(token_docs, y_train,
                                                test_size=0.25, random_state=42)

In [44]:
cv = CountVectorizer()

X_test_vec = cv.fit_transform(X_test)
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vec)
X_test_vec.columns = sorted(cv.vocabulary_)
X_test_vec.set_index(y_test.index, inplace=True)

In [45]:
X_test_vec

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaannndd,aaah,aac,aada,aads,aah,...,zomato,zombie,zombify,zone,zoner,zong,zoom,zpotify,zumo,zuri
24466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21450,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

In [58]:
mnb = MultinomialNB()

mnb.fit(X_test_vec, y_test)

MultinomialNB()

In [62]:
y_pred = mnb.predict(X_test_vec)
from sklearn.metrics import classification_report
classification = classification_report(y_test,y_pred)
print(classification)

              precision    recall  f1-score   support

           1       0.60      0.89      0.72      9987
           2       0.62      0.25      0.35      4021
           3       0.63      0.24      0.35      3845
           4       0.53      0.35      0.43      4423
           5       0.79      0.88      0.83     12370

    accuracy                           0.67     34646
   macro avg       0.64      0.52      0.54     34646
weighted avg       0.67      0.67      0.64     34646



In [73]:
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
plot_confusion_matrix(classification, X_test_vec, y_test)  
plt.show()



ValueError: plot_confusion_matrix only supports classifiers

### Splitting the data