In [7]:
# Supervised machine learning problem - classifying text using your own custom labels to create a custom classifier

In [8]:
# Using classification algorithms such as Logic regression, Naive Bayes and Linear SVM

In [9]:
# Logistic regression is a good baseline model to start with to classify our text

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# sklearn package is quite large and we wont be importing all of it. We will be only importing the functions that we need.

In [None]:
# Creating an algorithm that can classify the sentence by our sentiment score. We want to build our own custom model 

In [12]:
import pandas as pd

In [13]:
# This is a sample data of texts and their associated sentiments
data = pd.DataFrame([("i love spending time with my friends and family", "positive"),
                     ("that was the best meal i've ever had in my life", "positive"),
                     ("i feel so grateful for everything i have in my life", "positive"),
                     ("i received a promotion at work and i couldn't be happier", "positive"),
                     ("watching a beautiful sunset always fills me with joy", "positive"),
                     ("my partner surprised me with a thoughtful gift and it made my day", "positive"),
                     ("i am so proud of my daughter for graduating with honors", "positive"),
                     ("listening to my favorite music always puts me in a good mood", "positive"),
                     ("i love the feeling of accomplishment after completing a challenging task", "positive"),
                     ("i am excited to go on vacation next week", "positive"),
                     ("i feel so overwhelmed with work and responsibilities", "negative"),
                     ("the traffic during my commute is always so frustrating", "negative"),
                     ("i received a parking ticket and it ruined my day", "negative"),
                     ("i got into an argument with my partner and we're not speaking", "negative"),
                     ("i have a headache and i feel terrible", "negative"),
                     ("i received a rejection letter for the job i really wanted", "negative"),
                     ("my car broke down and it's going to be expensive to fix", "negative"),
                     ("i'm feeling sad because i miss my friends who live far away", "negative"),
                     ("i'm frustrated because i can't seem to make progress on my project", "negative"),
                     ("i'm disappointed because my team lost the game", "negative")
                    ],
                    columns=['text', 'sentiment'])

In [None]:
# Shuffle the data so that we have a nice mix of positive and negative sentiment in the text
# sample function is to obtain a random sample of rows from the dataframe. frac=1 means sampling 100% of data effectively shuffling the data
#reset index resets the index of the dataframe and drop=True doesnt retain the old index

In [31]:
data = data.sample(frac=1).reset_index(drop=True)

In [33]:
# Create our x and y to feed into our alogrithm

In [35]:
x = data['text']
y= data['sentiment']

In [37]:
# Implement text vectorization using bag of words

In [39]:
# CountVectorizer converts text documents to matrix of tokens that ML alogrithms can work with. fit learns the vocabulary from corpus 
# identifying all unique words and mapping them to feature indices. tranforms generates a matrix where each row is a document and each column
# is a token from the vocabulary. The values of matrix are counts of each token in each document.

In [41]:
countvec = CountVectorizer()

In [43]:
countvec_fit = countvec.fit_transform(x)

In [None]:
# Convert into pandas dataframe. Our column names c

In [47]:
bag_of_words = pd.DataFrame(countvec_fit.toarray(), columns = countvec.get_feature_names_out())

In [49]:
bag_of_words

Unnamed: 0,accomplishment,after,always,am,an,and,argument,at,away,be,...,vacation,ve,wanted,was,watching,we,week,who,with,work
0,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
# Create train and test data for our algorithm. test size indicates that 30% of our data is set aside for the test data

In [55]:
x_train, x_test, y_train, y_test = train_test_split(bag_of_words, y, test_size=0.3, random_state=7)

In [None]:
# Create the logistic regression model

In [57]:
lr = LogisticRegression(random_state=1).fit(x_train, y_train)

In [None]:
# Now predict this against our test data set

In [59]:
y_pred_lr = lr.predict(x_test)

In [None]:
# Compute the accuracy score

In [61]:
accuracy_score(y_pred_lr, y_test)

0.5

In [None]:
# Classification report with our test data and predicted data to see how our algortihm works with different types of tag

In [65]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
    positive       0.50      0.67      0.57         3

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.50      0.50      0.49         6



In [None]:
# Lets see if we can get a better accuracy score with a Naive Bayes model

In [67]:
from sklearn.naive_bayes import MultinomialNB

In [71]:
nb = MultinomialNB().fit(x_train,y_train)

In [73]:
y_pred_nb = nb.predict(x_test)

In [75]:
accuracy_score(y_pred_nb, y_test)

0.3333333333333333

In [77]:
print( classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
    positive       0.40      0.67      0.50         3

    accuracy                           0.33         6
   macro avg       0.20      0.33      0.25         6
weighted avg       0.20      0.33      0.25         6



In [None]:
# Check the accuracy with Linear Support Vector machine that usually works well with text data

In [79]:
from sklearn.linear_model import SGDClassifier # Many other packages can also run this algorithm under the hood

In [83]:
svm = SGDClassifier().fit(x_train, y_train)

In [85]:
y_pred_sv = svm.predict(x_test)

In [87]:
accuracy_score(y_pred_sv, y_test)

0.3333333333333333

In [89]:
print(classification_report(y_test, y_pred_sv))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
    positive       0.40      0.67      0.50         3

    accuracy                           0.33         6
   macro avg       0.20      0.33      0.25         6
weighted avg       0.20      0.33      0.25         6

