### Import Libraries

In [1]:
import pandas as pd
import nltk

### Read Dataset

In [7]:
df = pd.read_csv('/Users/hopmiller/Desktop/Capstone/DataSets/Spotify_reviews.csv')

### Preview Data

In [8]:
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,


In [9]:
# Shape of data
df.shape

(61594, 5)

## Cleaning Data

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Making word lowercase

In [12]:
df['Review']=df['Review'].str.lower()
# Previewing data
df['Review'].head()

0    great music service, the audio is high quality...
1    please ignore previous negative rating. this a...
2    this pop-up "get the best spotify experience o...
3      really buggy and terrible to use as of recently
4    dear spotify why do i get songs that i didn't ...
Name: Review, dtype: object

#### Removing stopwords

In [13]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['Review'] = df['Review'].apply(lambda text: cleaning_stopwords(text))
# Previewing data
df['Review'].head()

0    great music service, audio high quality app ea...
1    please ignore previous negative rating. app su...
2    pop-up "get best spotify experience android 12...
3                   really buggy terrible use recently
4    dear spotify get songs put playlist??? shuffle...
Name: Review, dtype: object

### Standardizing strings

#### Removing puncutation 

In [18]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
df['Review']= df['Review'].apply(lambda x: cleaning_punctuations(x))
# Preview data
df['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android 12 a...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

#### Removing URLs

In [22]:
#importing Regular Expression
import re

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
df['Review'] = df['Review'].apply(lambda x: cleaning_URLs(x))
# Preview
df['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android   an...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

#### Removing numbers

In [23]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', ' ', data)
df['Review'] = df['Review'].apply(lambda x: cleaning_numbers(x))
df['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android   an...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

#### Removing short words 
I am opting to take out words with 2 or less characters

In [24]:
def transform_text(text):
    return ' '.join([word for word in text.split() if len(word) > 2])
df['Review'] = df['Review'].apply(lambda x: transform_text(x))
# Preview data
df['Review'].head() 

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    popup get best spotify experience android anno...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

### Tokenization 
Using the Natural Language Toolkit, I will split the strings (reviews) into lists of words

In [29]:
# import word_tokenize from Natural Language Toolkit
from nltk.tokenize import TweetTokenizer

# creating a reference variable
tt = TweetTokenizer()
df['Review']=df['Review'].apply(tt.tokenize)
df['Review'].head()

0    [great, music, service, audio, high, quality, ...
1    [please, ignore, previous, negative, rating, a...
2    [popup, get, best, spotify, experience, androi...
3             [really, buggy, terrible, use, recently]
4    [dear, spotify, get, songs, put, playlist, shu...
Name: Review, dtype: object

### Stemming


In [30]:
import nltk
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
df['Review']= df['Review'].apply(lambda x: stemming_on_text(x))
df['Review'].head()

0    [great, music, service, audio, high, quality, ...
1    [please, ignore, previous, negative, rating, a...
2    [popup, get, best, spotify, experience, androi...
3             [really, buggy, terrible, use, recently]
4    [dear, spotify, get, songs, put, playlist, shu...
Name: Review, dtype: object

### Lemmatizer 

In [31]:
import nltk
nltk.download('wordnet')

lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
df['Review'] = df['Review'].apply(lambda x: lemmatizer_on_text(x))
df['Review'].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hopmiller\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [great, music, service, audio, high, quality, ...
1    [please, ignore, previous, negative, rating, a...
2    [popup, get, best, spotify, experience, androi...
3             [really, buggy, terrible, use, recently]
4    [dear, spotify, get, songs, put, playlist, shu...
Name: Review, dtype: object

## Tagging Reviews
Creating a fucntion to tag the reivew as "Positive", "Negative", or "Neutral"

I will assume a positive reviewer has also left an app rating that is postive. The critia is as follows:


Positive = rating: 4-5

Neutral = rating: 3

Negative = rating: 1-2

### Creating a function to make a new column (Sentiment) to catagorize the reviews

In [34]:
#create a function to compute the negative, neutral and positive analysis
def getAnalysis(Rating):
    if Rating<3:
        return 'negative'
    elif Rating==3:
        return 'neutral'
    else:
        return 'positive'
    
df['Sentiment']=df['Rating'].apply(getAnalysis)

#show preview
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply,Sentiment
0,2022-07-09 15:00:00,"[great, music, service, audio, high, quality, ...",5,2,,positive
1,2022-07-09 14:21:22,"[please, ignore, previous, negative, rating, a...",5,1,,positive
2,2022-07-09 13:27:32,"[popup, get, best, spotify, experience, androi...",4,0,,positive
3,2022-07-09 13:26:45,"[really, buggy, terrible, use, recently]",1,1,,negative
4,2022-07-09 13:20:49,"[dear, spotify, get, songs, put, playlist, shu...",1,1,,negative


In [36]:
# create two new dataframe all of the positive text
df_positive = df[df['Sentiment'] == 'positive']


# create two new dataframe all of the negative text
df_negative = df[df['Sentiment'] == 'negative']


# create two new dataframe all of the neutral text
df_neutral=df[df['Sentiment'] == 'neutral']

### Counting the number of each type of sentiment in the data set

In [37]:
review_counts = df.Sentiment.value_counts()
review_counts

positive    29937
negative    24771
neutral      6886
Name: Sentiment, dtype: int64

Most reviews are either positive or negative

# Training the Data
Now that I have tagged the tweets as positive, negative, or neutral, it is time to train and test my model

In [38]:
training_data = df

In [39]:
training_data.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply,Sentiment
0,2022-07-09 15:00:00,"[great, music, service, audio, high, quality, ...",5,2,,positive
1,2022-07-09 14:21:22,"[please, ignore, previous, negative, rating, a...",5,1,,positive
2,2022-07-09 13:27:32,"[popup, get, best, spotify, experience, androi...",4,0,,positive
3,2022-07-09 13:26:45,"[really, buggy, terrible, use, recently]",1,1,,negative
4,2022-07-09 13:20:49,"[dear, spotify, get, songs, put, playlist, shu...",1,1,,negative


### Defining Sentiment
Assigning a numerical value to each sentiment

Postive = 2

Negative = 0

Neutral = 1

In [44]:
# Splitting data into X and Y
X = training_data['Review']
Y = training_data['Sentiment']

In [45]:
sentiments = ['negative' , 'neutral', 'positive']
Y = Y.apply(lambda x: sentiments.index(x))

In [47]:
# checking data
Y.head()

0    2
1    2
2    2
3    0
4    0
Name: Sentiment, dtype: int64

### Vectorizing Data

In [50]:
# Import library 
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
X_fit = count_vectorizer.fit_transform(training_data).toarray()

# Check the shape
X_fit.shape

(6, 6)

## Making a Model
I will be using MultinomialNB- type of Naive Bayes Classifier, which Calculates the probability of sentiment based on the probability of words

In [51]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
model = MultinomialNB()

### Splitting the data

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

everytime I try to train the model I get this error. I have tried multiple different models

In [69]:
model.fit(X_train,y_train)

ValueError: setting an array element with a sequence.

In [67]:
#from sklearn.ensemble import RandomForestClassifier
#classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
#classifier.fit(X_train, y_train) 

ValueError: setting an array element with a sequence.