In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Load the data

In [2]:
# Read the preprocessed tweets from a CSV file in the current working directory.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, presumably a row
# index written out when the CSV was saved; only the "text" and "sentiment"
# columns are used further down — confirm before dropping it.
data = pd.read_csv("Tweets_preprocessed.csv")
# A bare expression on the last line makes the notebook display the frame.
data

Unnamed: 0,text,sentiment
0,- It is supposed to be an unrelated (story-wi...,negative
1,homework on a friday night...lame,negative
2,gutted - the handbag I wanted has been sold!,negative
3,i miss 'mr.',negative
4,me too i hate revision,negative
...,...,...
23338,My gut says to replace $$$ appliances instead ...,positive
23339,aaawww no worries fresh start to work on gro...,positive
23340,"weird as usual, but ok... that`s why we like it",positive
23341,"????? ,my latest obession.",positive


In [3]:
# Split the data into the model inputs (tweet text) and the targets (sentiment).
inputs = data[["text"]]

# The classifier needs numeric targets, so encode each sentiment label as an integer.
label_to_id = {'neutral': 0, 'positive': 1, 'negative': 2}
encoded_sentiment = data["sentiment"].map(label_to_id)

# Series.map yields NaN for every value that is not a key of the dict (typos,
# different casing, missing labels). Fail here with a clear message instead of
# letting NaN targets reach the stratified split or the model.
unmapped = encoded_sentiment.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, "sentiment"].astype(str).unique())
    raise ValueError(
        f"Unexpected sentiment labels {unexpected}; expected one of {sorted(label_to_id)}"
    )

# Same result as before for valid data: a 1-D integer array aligned with the rows of `data`.
targets = np.array(encoded_sentiment, dtype=np.int64)

## Use NLP techniques to convert the text into a numeric representation that the model can understand

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Create the vectorizer: a scikit-learn CountVectorizer built with its default
# settings (no arguments are passed).
vectorizer = CountVectorizer()

In [6]:
# Fit the vectorizer on the tweet text so it learns its vocabulary.
# NOTE(review): the fit uses every row of the dataset, including rows that the
# later train/test split assigns to the test set.
vectorizer.fit(inputs['text'])

CountVectorizer()

In [7]:
# Transform every tweet with the fitted vectorizer; X is the resulting
# count matrix with one row per tweet.
X = vectorizer.transform(inputs['text'])

In [8]:
# Store the vectorized tweets under the name `inputs`, keeping them as the
# sparse matrix returned by the vectorizer.
#
# Calling X.toarray() (and wrapping it in a DataFrame) would allocate a dense
# n_tweets x vocabulary_size integer matrix, which runs to several GB for
# ~23k tweets, although almost every entry is zero. train_test_split and
# MultinomialNB both accept sparse matrices, so nothing downstream needs the
# dense form. Training on the plain matrix also means the model is fitted on
# the same kind of object that vectorizer.transform() produces at prediction
# time, rather than on a DataFrame with feature names.
#
# Column j of `inputs` corresponds to vectorizer.get_feature_names_out()[j].
# NOTE: this rebinds `inputs`; re-run the cell that defines inputs["text"]
# before re-running the vectorizer cells.
inputs = X

## Split the data into train and test sets, shuffling while preserving the class balance

In [9]:
# Hold out 20% of the rows for testing. stratify=targets keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

## Naive Bayes model with SkLearn

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [11]:
# Create the model: a multinomial Naive Bayes classifier with default settings
# (no arguments are passed).
nb = MultinomialNB()

In [12]:
# Train the model on the training split only (X_train / y_train).
nb.fit(X_train, y_train)

MultinomialNB()

In [13]:
# Make predictions for the held-out test features.
y_pred = nb.predict(X_test)

In [14]:
# Accuracy: the share of test tweets whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6577425572927822


## Conclusion

At first glance this may not look like a good model, but keep in mind that the dataset is not very large and that the model only uses the raw text of each tweet. Even so, for any given tweet it correctly predicts whether the sentiment is positive, negative or neutral about 66% of the time (accuracy 0.658), roughly twice the 33% expected from uniformly random guessing among three classes.

## Saving the model and the vectorizer

In [15]:
import pickle

In [16]:
# Serialize the trained classifier to a file named "model" in the working
# directory. Only unpickle this file from a trusted location: loading a pickle
# can execute arbitrary code.
with open("model", mode="wb") as model_file:
    pickle.Pickler(model_file).dump(nb)

In [17]:
# Serialize the fitted vectorizer to a file named "vectorizer" in the working
# directory, so new text can be encoded with the same vocabulary later on.
with open("vectorizer", mode="wb") as vectorizer_file:
    pickle.Pickler(vectorizer_file).dump(vectorizer)