https://www.kaggle.com/code/anastasiyaigonina/reviews-sentiment-topic-modeling-clustering

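The experiment that merges the negative and neutral classes into a single class can be found at the following link (a local Jupyter server URL, so it only resolves on the author's machine):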

http://localhost:8889/notebooks/Documents/GitHub/StrategicThinking/Testing%20ML%20no%20neutral.ipynb

Or in my GitHub Repository: https://github.com/izazaka/StrategicThinking

The machine-learning approach below is adapted from this notebook:

https://www.kaggle.com/code/anastasiyaigonina/reviews-sentiment-topic-modeling-clustering#Sentiment-Analysis

## Sentiment Intensity Analyzer

In [None]:
sia = SentimentIntensityAnalyzer()

def get_sentiment(review):
    """Turn the analyzer's compound score for `review` into a class label.

    Uses the module-level `sia` analyzer. Returns "positive" when the
    compound score is above 0.1, "negative" when it is below -0.1, and
    "neutral" for everything in between.
    """
    compound = sia.polarity_scores(review)["compound"]
    if compound > 0.1:
        return "positive"
    if compound < -0.1:
        return "negative"
    return "neutral"
    

# Work on a copy so the prediction column is not added to `ml` itself.
ml2 = ml.copy()
ml2["Predicted_Sentiment"] = ml2["cleaned_text"].apply(get_sentiment)


# Report how many reviews were assigned to each predicted class.
predicted = ml2["Predicted_Sentiment"]
for header, label in (
    ("Number of positive reviews:", "positive"),
    ("Number of negative reviews:", "negative"),
    ("Number of neutral reviews:", "neutral"),
):
    print(header, int((predicted == label).sum()))

In [None]:
# Ground-truth label from the star rating: 1-2 negative, 3 neutral, 4-5 positive.
ml2["True_Sentiment"] = ml2["starRating"].map({
    **dict.fromkeys((1, 2), "negative"),
    3: "neutral",
    **dict.fromkeys((4, 5), "positive"),
})

In [None]:
# Display names for the three classes, in the order used on both axes.
labels = ["Negative", "Neutral", "Positive"]

# Pin the row/column order to the lower-cased display names instead of relying
# on the order inferred from the data: if a class were missing from both
# columns the inferred matrix would shrink and the tick labels below would no
# longer line up with its rows and columns.
cm = confusion_matrix(ml2["True_Sentiment"], ml2["Predicted_Sentiment"],
                      labels = [name.lower() for name in labels])

sns.heatmap(cm, annot = True, cmap = "Reds", fmt = "g", xticklabels = labels, yticklabels = labels)
plt.xlabel("Predicted sentiment")
plt.ylabel("True sentiment")
plt.title("Confusion matrix for sentiment analysis")
plt.show()

In [None]:
# Text report comparing the rating-derived labels with the analyzer's predictions.
print(
    "\nClassification report:\n",
    classification_report(y_true = ml2["True_Sentiment"],
                          y_pred = ml2["Predicted_Sentiment"]),
)

## Random Forest with Star Rating

In [None]:
# Star ratings grouped by the sentiment class they stand for.
positive = [4, 5]
neutral = [3]
negative = [1, 2]

def map_sentiment(rating):
    """Encode a star rating as an integer sentiment class.

    Returns 2 for ratings listed in `positive`, 1 for ratings listed in
    `negative`, and 0 for anything else (which includes the neutral rating).
    """
    for code, stars in ((2, positive), (1, negative)):
        if rating in stars:
            return code
    return 0

# Integer sentiment target derived element-wise from the star rating.
ml["sentiment"] = ml["starRating"].map(map_sentiment)

In [None]:
# Split the raw documents first, then fit the vectorizer on the training
# documents only, so held-out reviews never influence the vocabulary or the
# IDF weights used to evaluate the model.
y = ml["sentiment"]
text_train, text_test, y_train, y_test = train_test_split(ml["cleaned_text"], y,
                                                          test_size = 0.40, random_state = 12)

# token_pattern = None: a custom tokenizer is supplied, so the default pattern
# is unused and leaving it set only triggers a warning.
tfidf = TfidfVectorizer(ngram_range = (1, 3), max_features = 1000,
                        tokenizer = word_tokenize, token_pattern = None)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole-corpus matrix kept under its original name for cells that still read
# `X`; it is projected with the train-fitted vectorizer.
X = tfidf.transform(ml["cleaned_text"])

In [None]:
# Seed the forest (same seed as the train/test split) so the reported metrics
# are repeatable from one run to the next.
rf = RandomForestClassifier(random_state = 12)
rf.fit(X_train, y_train)

predicted_rf = rf.predict(X_test)

# Overall accuracy plus the per-class breakdown on the held-out split.
accuracy_rf = accuracy_score(y_test, predicted_rf)
print('Accuracy:', accuracy_rf)
print('Classification Report:')
print(classification_report(y_test, predicted_rf))

In [None]:
# Score the fitted forest on both partitions to compare train and test accuracy.
y_train_pred, y_test_pred = rf.predict(X_train), rf.predict(X_test)

accuracy_train, accuracy_test = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print("Accuracy on Training Set:", accuracy_train)
print("Accuracy on Test Set:", accuracy_test)

### Best Parameters

In [None]:
# Text -> TF-IDF -> random forest, tuned as a single estimator.
# token_pattern = None: a custom tokenizer is supplied, so the default pattern
# is unused and would otherwise raise a warning on every fit.
# random_state = 12: without a seed the forest differs on every fit, so the
# selected "best" parameters could change between runs.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer = word_tokenize, token_pattern = None)),
    ('clf', RandomForestClassifier(random_state = 12))
])

# Search space: vectorizer n-gram range / vocabulary size, and forest size / depth.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [500, 1000, 2000],
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20]
}

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv = 5, n_jobs = -1, verbose = 1)

# NOTE(review): the search is fitted on every row of `ml`, including rows that
# other cells hold out as a test split - confirm this is intended.
grid_search.fit(ml["cleaned_text"], ml["sentiment"])

print("Best Parameters:", grid_search.best_params_)

In [None]:
# Show the winning hyperparameter combination again without re-running the search.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Split the raw documents first, then fit the vectorizer on the training
# documents only, so held-out reviews never influence the vocabulary or the
# IDF weights used to evaluate the model.
y = ml["sentiment"]
text_train, text_test, y_train, y_test = train_test_split(ml["cleaned_text"], y,
                                                          test_size = 0.40, random_state = 12)

# token_pattern = None: a custom tokenizer is supplied, so the default pattern
# is unused and leaving it set only triggers a warning.
tfidf = TfidfVectorizer(ngram_range = (1, 1), max_features = 500,
                        tokenizer = word_tokenize, token_pattern = None)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole-corpus matrix kept under its original name for cells that still read
# `X` (for example cross-validation); it is projected with the train-fitted
# vectorizer.
X = tfidf.transform(ml["cleaned_text"])

In [None]:
# Seed the forest (same seed as the train/test split) so the reported metrics
# are repeatable from one run to the next.
rf = RandomForestClassifier(random_state = 12)
rf.fit(X_train, y_train)

predicted_rf = rf.predict(X_test)

# Overall accuracy plus the per-class breakdown on the held-out split.
accuracy_rf = accuracy_score(y_test, predicted_rf)
print('Accuracy:', accuracy_rf)
print('Classification Report:')
print(classification_report(y_test, predicted_rf))

In [None]:
# Score the fitted forest on both partitions to compare train and test accuracy.
y_train_pred, y_test_pred = rf.predict(X_train), rf.predict(X_test)

accuracy_train, accuracy_test = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print("Accuracy on Training Set:", accuracy_train)
print("Accuracy on Test Set:", accuracy_test)

In [None]:
# The fold assignment below is seeded; seed the model as well, otherwise the
# cross-validated scores still change from run to run.
model = RandomForestClassifier(random_state = 12)

kfold = KFold(n_splits = 5, shuffle = True, random_state = 12)

scores = cross_val_score(model, X, y, cv = kfold, scoring = "accuracy")

# Mean accuracy across folds, with two standard deviations as the spread.
print("Cross-validated Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Random Forest with the Sentiment Score

In [None]:
# Split the raw documents first, then fit the vectorizer on the training
# documents only, so held-out reviews never influence the vocabulary or the
# IDF weights used to evaluate the model.
y = ml["sentiment_ml"]
text_train, text_test, y_train, y_test = train_test_split(ml["cleaned_text"], y,
                                                          test_size = 0.30, random_state = 12)

# token_pattern = None: a custom tokenizer is supplied, so the default pattern
# is unused and leaving it set only triggers a warning.
tfidf = TfidfVectorizer(ngram_range = (1, 3), max_features = 2000,
                        tokenizer = word_tokenize, token_pattern = None)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole-corpus matrix kept under its original name for cells that still read
# `X` (for example cross-validation); it is projected with the train-fitted
# vectorizer.
X = tfidf.transform(ml["cleaned_text"])

In [None]:
# Seed the forest (same seed as the train/test split) so the reported metrics
# are repeatable from one run to the next.
rf = RandomForestClassifier(random_state = 12)
rf.fit(X_train, y_train)

predicted_rf = rf.predict(X_test)

# Overall accuracy plus the per-class breakdown on the held-out split.
accuracy_rf = accuracy_score(y_test, predicted_rf)
print('Accuracy:', accuracy_rf)
print('Classification Report:')
print(classification_report(y_test, predicted_rf))

In [None]:
# Score the fitted forest on both partitions to compare train and test accuracy.
y_train_pred, y_test_pred = rf.predict(X_train), rf.predict(X_test)

accuracy_train, accuracy_test = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print("Accuracy on Training Set:", accuracy_train)
print("Accuracy on Test Set:", accuracy_test)

In [None]:
# The fold assignment below is seeded; seed the model as well, otherwise the
# cross-validated scores still change from run to run.
model = RandomForestClassifier(random_state = 12)

kfold = KFold(n_splits = 5, shuffle = True, random_state = 12)

scores = cross_val_score(model, X, y, cv = kfold, scoring = "accuracy")

# Mean accuracy across folds, with two standard deviations as the spread.
print("Cross-validated Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))