# NLP - Starbucks Survey Sentiment Analysis in Python

# Data Facts and Import

In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Location of the survey export on the author's machine. A raw string keeps
# the Windows backslash literal: "\S" is not a valid string escape and a
# non-raw literal triggers a SyntaxWarning on recent Python versions.
# NOTE(review): this is an absolute local path - point it at your own copy.
SURVEY_CSV = r'E:\Starbucks satisfactory survey.csv'
df = pd.read_csv(SURVEY_CSV)
# Data credit: https://www.kaggle.com

In [106]:
# Frame dimensions as a (rows, columns) tuple.
df.shape

(122, 21)

In [107]:
# Transposed preview: the first five responses become columns, so each
# survey field reads as a row.
df.head(5).T

Unnamed: 0,0,1,2,3,4
Satisfaction,not happy,not happy,not happy,happy,not happy
Avi,Female,Female,Male,Female,Male
Avraham,From 20 to 29,From 20 to 29,From 20 to 29,From 20 to 29,From 20 to 29
Avram,Student,Student,Employed,Student,Student
Avrom,"Less than RM25,000","Less than RM25,000","Less than RM25,000","Less than RM25,000","Less than RM25,000"
Axel,Rarely,Rarely,Monthly,Rarely,Monthly
Aylmer,Dine in,Take away,Dine in,Take away,Take away
Aziz,Between 30 minutes to 1 hour,Below 30 minutes,Between 30 minutes to 1 hour,Below 30 minutes,Between 30 minutes to 1 hour
Bailey,within 1km,1km - 3km,more than 3km,more than 3km,1km - 3km
Bailie,Yes,Yes,Yes,No,No


In [108]:
# Summary statistics from describe(), transposed so each described column
# occupies one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Baird,122.0,3.663934,0.941343,1.0,3.0,4.0,4.0,5.0
Baldwin,122.0,2.893443,1.081836,1.0,2.0,3.0,4.0,5.0
Bancroft,122.0,3.795082,1.090443,1.0,3.0,4.0,5.0,5.0
Barbabas,122.0,3.754098,0.929867,1.0,3.0,4.0,4.0,5.0
Barclay,122.0,3.254098,0.958317,1.0,3.0,3.0,4.0,5.0
Bard,122.0,3.745902,0.828834,1.0,3.0,4.0,4.0,5.0
Barde,122.0,3.516393,1.030394,1.0,3.0,4.0,4.0,5.0


# Data Cleaning and EDA

In [109]:
# Missing-value audit: per-column null count and its percentage of all rows,
# each ordered from most to fewest missing entries.
null_tally = df.isna().sum()  # computed once, reused for both views
count = null_tally.sort_values(ascending=False)
percentage = (null_tally / len(df) * 100).sort_values(ascending=False)
missing_data = pd.concat(
    [count, percentage],
    axis=1,
    keys=['Count', 'Percentage'],
)

print('Count and percentage of missing values for the columns:')

missing_data
    

Count and percentage of missing values for the columns:


Unnamed: 0,Count,Percentage
Aylmer,1,0.819672
Barn,1,0.819672
Barnabas,0,0.0
Bailie,0,0.0
Avi,0,0.0
Avraham,0,0.0
Avram,0,0.0
Avrom,0,0.0
Axel,0,0.0
Aziz,0,0.0


In [110]:
# Column labels of the frame.
df.columns

Index(['Satisfaction', 'Avi', 'Avraham', 'Avram', 'Avrom', 'Axel', 'Aylmer',
       'Aziz', 'Bailey', 'Bailie', 'Baillie', 'Baily', 'Baird', 'Baldwin',
       'Bancroft', 'Barbabas', 'Barclay', 'Bard', 'Barde', 'Barn', 'Barnabas'],
      dtype='object')

In [111]:
# Apply first level cleaning
import re
import string

def text_clean_1(text):
    """Apply the first cleaning pass to one piece of text.

    Steps, in order:
      1. lower-case the text;
      2. drop anything enclosed in square brackets (brackets included);
      3. strip every ASCII punctuation character (``string.punctuation``);
      4. drop every word that contains at least one digit.

    Whitespace is left untouched, so removed tokens may leave double or
    trailing spaces behind.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Raw-string patterns: "\[", "\w" and "\d" are regex escapes, not Python
    # string escapes; non-raw literals raise a SyntaxWarning for them.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def cleaned1(x):
    """Thin wrapper around text_clean_1, kept for use with ``Series.apply``."""
    return text_clean_1(x)

In [112]:
# First-pass cleaning of the Satisfaction text, stored next to the raw column.
df['cleaned_description'] = df['Satisfaction'].apply(cleaned1)
# Preview the first two rows to confirm the new column.
df.head(n=2)

Unnamed: 0,Satisfaction,Avi,Avraham,Avram,Avrom,Axel,Aylmer,Aziz,Bailey,Bailie,...,Baird,Baldwin,Bancroft,Barbabas,Barclay,Bard,Barde,Barn,Barnabas,cleaned_description
0,not happy,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,within 1km,Yes,...,4,3,5,5,4,4,3,Starbucks Website/Apps;Social Media;Emails;Dea...,Yes,not happy
1,not happy,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,1km - 3km,Yes,...,4,3,4,4,4,5,2,Social Media;In Store displays,Yes,not happy


In [113]:
def text_clean_2(text):
    """Second cleaning pass: drop curly quotes, ellipsis characters and newlines.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with every curly single/double quote, ellipsis character
        and newline removed.
    """
    without_quotes = re.sub('[‘’“”…]', '', text)
    return re.sub('\n', '', without_quotes)


def cleaned2(x):
    """Thin wrapper around text_clean_2, kept for use with ``Series.apply``."""
    return text_clean_2(x)

In [114]:
# Second-pass cleaning applied on top of the first-pass column.
df['cleaned_description_new'] = df['cleaned_description'].apply(cleaned2)
# Preview the first two rows to confirm the new column.
df.head(n=2)

Unnamed: 0,Satisfaction,Avi,Avraham,Avram,Avrom,Axel,Aylmer,Aziz,Bailey,Bailie,...,Baldwin,Bancroft,Barbabas,Barclay,Bard,Barde,Barn,Barnabas,cleaned_description,cleaned_description_new
0,not happy,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,within 1km,Yes,...,3,5,5,4,4,3,Starbucks Website/Apps;Social Media;Emails;Dea...,Yes,not happy,not happy
1,not happy,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,1km - 3km,Yes,...,3,4,4,4,5,2,Social Media;In Store displays,Yes,not happy,not happy


# Model Training


In [116]:
from sklearn.model_selection import train_test_split

# NOTE(review): the feature text is derived from df.Satisfaction (via
# cleaned_description -> cleaned_description_new), which is also the target,
# so the classifier is shown its own label. Downstream scores are therefore
# not evidence of generalisation - confirm which column was meant as input.
Independent_var = df['cleaned_description_new']
Dependent_var = df['Satisfaction']

# Hold out 10% of the rows; the fixed seed keeps the split repeatable.
IV_train, IV_test, DV_train, DV_test = train_test_split(
    Independent_var,
    Dependent_var,
    test_size=0.1,
    random_state=225,
)

# Report the size of each partition.
for split_label, split_values in (
    ('IV_train :', IV_train),
    ('IV_test  :', IV_test),
    ('DV_train :', DV_train),
    ('DV_test  :', DV_test),
):
    print(split_label, len(split_values))

IV_train : 109
IV_test  : 13
DV_train : 109
DV_test  : 13


In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF text vectorizer, constructed with the library defaults.
tvec = TfidfVectorizer()
# Logistic-regression classifier using the "lbfgs" solver.
clf2 = LogisticRegression(solver = "lbfgs")


from sklearn.pipeline import Pipeline

In [118]:
# Chain the vectorizer and the classifier so that text goes in and labels
# come out of a single estimator.
model = Pipeline([('vectorizer', tvec), ('classifier', clf2)])

model.fit(IV_train, DV_train)


from sklearn.metrics import confusion_matrix

predictions = model.predict(IV_test)

# scikit-learn expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them reversed transposes the matrix.
confusion_matrix(DV_test, predictions)

array([[9, 0],
       [0, 4]], dtype=int64)

# Model Prediction

In [119]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Metric functions take (y_true, y_pred). Accuracy is symmetric, but precision
# and recall are not: with the arguments reversed, each is computed per class
# as the other and then weighted by the wrong class supports.
print("Accuracy : ", accuracy_score(DV_test, predictions))
print("Precision : ", precision_score(DV_test, predictions, average = 'weighted'))
print("Recall : ", recall_score(DV_test, predictions, average = 'weighted'))

Accuracy :  1.0
Precision :  1.0
Recall :  1.0


## Trying on new reviews 

In [124]:
example = ["I am not happy"]
# Run new text through the same two cleaning passes the training text
# received, so the vectorizer sees inputs prepared the same way.
example_cleaned = [cleaned2(cleaned1(text)) for text in example]
result = model.predict(example_cleaned)

print(result)

['not happy']
