In [1]:
import pandas as pd #libraries here
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # Or CountVectorizer
from sklearn.linear_model import LogisticRegression # Or another classifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string
# Download necessary NLTK resources (run this once)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv("dataset.csv") #loading my dataset. I renamed it into dataset.csv

## Downloaded from UVI Machine Learning repository: Recipe Reviews and User Feedback.csv
## link: https://archive.ics.uci.edu/dataset/911/recipe+reviews+and+user+feedback+dataset

#### Description:
#### the dataset provides recipes with reviews and
#### user feedback, such as thumbs_up, thumbs_down,
#### and a user comment that we can analyze the
#### words used maybe.


In [3]:
df.head(5) #testing only, to show first 5 cols, adjust the number as I look more into them

Unnamed: 0.1,Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text
0,0,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM,u_9iFLIhMa8QaG,Jeri326,1,1665619889,0,0,0,5,527,"I tweaked it a little, removed onions because ..."
1,1,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY,u_Lu6p25tmE77j,Mark467,50,1665277687,0,7,0,5,724,Bush used to have a white chili bean and it ma...
2,2,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP,u_s0LwgpZ8Jsqq,Barbara566,10,1664404557,0,3,0,5,710,I have a very complicated white chicken chili ...
3,3,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DzdSIgV9qNiuBaLoZ7JQaartoC,u_fqrybAdYjgjG,jeansch123,1,1661787808,2,2,0,0,581,"In your introduction, you mentioned cream chee..."
4,4,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DtZJuRQYeTFwXBoZRfRhBPEXjI,u_XXWKwVhKZD69,camper77,10,1664913823,1,7,0,0,820,Wonderful! I made this for a &#34;Chili/Stew&#...


In [4]:
df.shape #ok, it has 18k rows, and 15 columns

(18182, 15)

In [5]:
df.info() #here I can see all the columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18182 entries, 0 to 18181
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       18182 non-null  int64 
 1   recipe_number    18182 non-null  int64 
 2   recipe_code      18182 non-null  int64 
 3   recipe_name      18182 non-null  object
 4   comment_id       18182 non-null  object
 5   user_id          18182 non-null  object
 6   user_name        18182 non-null  object
 7   user_reputation  18182 non-null  int64 
 8   created_at       18182 non-null  int64 
 9   reply_count      18182 non-null  int64 
 10  thumbs_up        18182 non-null  int64 
 11  thumbs_down      18182 non-null  int64 
 12  stars            18182 non-null  int64 
 13  best_score       18182 non-null  int64 
 14  text             18180 non-null  object
dtypes: int64(10), object(5)
memory usage: 2.1+ MB


### I should be needing to use Columns:
### user_reputation, reply_count, thumbs_up, thumbs_down, stars, best_score, and text

#### for 'text', lets see if we can get the most common words used regardless of capitalization, and the amount of exclamation points. 

## Data Pre-processing


In [6]:
#Converts this column to lowercase
df["text"] = df["text"].str.lower()

In [7]:
import re   # Removes all punctuations
df["text"] = df["text"].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [8]:
stop_words = set(stopwords.words("english"))  #initialize stop words

#function for stop words
def remove_stopwords(text):
    words = text.split()  # Split sentence into words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)


df["text"] = df["text"].astype(str).apply(remove_stopwords)   #removes stop words, using above function


# Feature Extraction

## Below of this cell is my weight factors and Thresholds that we can adjust.

## Also we created the targets for model training. As these targets are human made, the results will depend highly on the settings of these cells below

In [18]:
def assign_sentiment(row):
    # Define weight factors
    thumbs_ratio = row["thumbs_up"] - row["thumbs_down"]  # Net thumbs score
    stars_score = row["stars"]
    reply_boost = row["reply_count"]  # Reply count impact

    # Sentiment Logic (Adjust Thresholds as Needed)
    if stars_score >= 4 or thumbs_ratio > 5 or reply_boost > 10:
        return "positive"
    elif stars_score == 3 or (thumbs_ratio >= -2 and thumbs_ratio <= 2):
        return "neutral"
    else:
        return "negative"

# Apply the function to create the sentiment label
df["sentiment_label"] = df.apply(assign_sentiment, axis=1)

In [10]:
print(df["sentiment_label"].value_counts())  # Check how balanced my labels are

sentiment_label
positive    15599
neutral      2336
negative      247
Name: count, dtype: int64


## Let's verify the labels we have created.
#### as of writing, im googling how to do this


In [11]:
df.head(3)  # Ensure labels are correctly assigned

Unnamed: 0.1,Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text,sentiment_label
0,0,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM,u_9iFLIhMa8QaG,Jeri326,1,1665619889,0,0,0,5,527,tweaked little removed onions onion haters hou...,positive
1,1,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY,u_Lu6p25tmE77j,Mark467,50,1665277687,0,7,0,5,724,bush used white chili bean made recipe super s...,positive
2,2,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP,u_s0LwgpZ8Jsqq,Barbara566,10,1664404557,0,3,0,5,710,complicated white chicken chili recipe made ye...,positive


In [12]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df["text"])  # Convert text to vectors

# Bag of Words Feature Extraction
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df["text"])  # Convert text to word counts


## Splitting the data for Training 
### test_size=0.2 ensures 20% of the dataset is used for testing.
### The remaining 80% is for training.
### while stratify=df["sentiment_label"] guarantees balanced sentiment representation.


In [13]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, df["sentiment_label"], test_size=0.2, stratify=df["sentiment_label"], random_state=42
)

X_train_bow, X_test_bow, _, _ = train_test_split(
    X_bow, df["sentiment_label"], test_size=0.2, stratify=df["sentiment_label"], random_state=42
)

print("TF-IDF Train Shape:", X_train_tfidf.shape)
print("Bag of Words Train Shape:", X_train_bow.shape)


TF-IDF Train Shape: (14545, 16511)
Bag of Words Train Shape: (14545, 16511)


## Choose and Train a model, 
## lets go with Logistic regression


In [14]:
# Instantiate model
log_reg_model = LogisticRegression()

# Train the model using TF-IDF features
log_reg_model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred_tfidf = log_reg_model.predict(X_test_tfidf)

### Make Predictions: 
#### Use the trained model to make predictions on the testing feature vectors using the predict() method.



In [15]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_tfidf))

Logistic Regression Accuracy: 0.8682980478416277


In [16]:
y_pred = log_reg_model.predict(X_test_tfidf)

In [17]:
print("Sample Predictions:")
for text, pred in zip(df["text"].iloc[:10], y_pred[:10]):
    print(f"Text: {text} --> Predicted Sentiment: {pred}")


Sample Predictions:
Text: tweaked little removed onions onion haters house used italian seasoning instead oregano use paprika cayenne mix little recipe called like everything bit hot chili amazing easy make everyone absolutely loved staple meal house --> Predicted Sentiment: positive
Text: bush used white chili bean made recipe super simple ive written asked please bring back --> Predicted Sentiment: positive
Text: complicated white chicken chili recipe made years everyone raves saw recipe thought id try easy alternative weeknights husband likes recipe better super easy delicious cut back slightly crushed oregano cayenne pepper otherwise made exactly written --> Predicted Sentiment: positive
Text: introduction mentioned cream cheese none listed ingredients --> Predicted Sentiment: positive
Text: wonderful made 34chilistew34 night dinner church everyone wanted recipe make quite often home it39s always hit also use freshly made rotisserie chicken adds flavor --> Predicted Sentiment: posi

## Please verify the results of my sample prediction in the above cell, as they are
## final for Part 2. Thank you.

### by: Flogio, Steven Iefel D. 