In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
df = pd.read_csv('datasets/sentiment-analysis-dataset-google-play-app-reviews.csv')

# Map the `score` column to sentiment
def map_score_to_sentiment(score):
    if score >= 4:
        return "positive"
    elif score == 3:
        return "neutral"
    else:
        return "negative"

df['sentiment'] = df['score'].apply(map_score_to_sentiment)

# Use the `content` column for text data and the mapped `sentiment` column as labels
X = df['content']
y = df['sentiment']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Function to evaluate model with different n-grams
def evaluate_model(ngram_range):
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train Random Forest Classifier
    classifier = RandomForestClassifier(random_state=41)
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)

    print(f"\nEvaluating model with ngram_range={ngram_range}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

# Evaluate unigrams
evaluate_model((1, 1))

# Evaluate unigrams + bigrams
evaluate_model((1, 2))

# Evaluate unigrams + bigrams
evaluate_model((1, 3))

# Evaluate bigrams
evaluate_model((2, 2))



Evaluating model with ngram_range=(1, 1):
Accuracy: 0.7746478873239436
Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.79      0.79      1557
     neutral       0.75      0.67      0.71      1545
    positive       0.79      0.85      0.82      1726

    accuracy                           0.77      4828
   macro avg       0.77      0.77      0.77      4828
weighted avg       0.77      0.77      0.77      4828

Evaluating model with ngram_range=(1, 2):
Accuracy: 0.7806545153272577
Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.79      0.79      1557
     neutral       0.78      0.66      0.71      1545
    positive       0.78      0.88      0.83      1726

    accuracy                           0.78      4828
   macro avg       0.78      0.78      0.78      4828
weighted avg       0.78      0.78      0.78      4828

Evaluating model with ngram_range=(1, 3):
Accur

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# ייבוא ה-DataFrame
df = pd.read_csv('datasets/sentiment-analysis-dataset-google-play-app-reviews.csv')
df = df[['content', 'score']]

# הגדרת מטרות
df['sentiment'] = df['score'].apply(lambda x: 'positive' if x >= 4 else ('neutral' if x == 3 else 'negative'))
X = df['content']
y = df['sentiment']

# פיצול לנתוני אימון ובדיקה
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Stemming Function
stemmer = PorterStemmer()

def apply_stemming(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# החלת Stemming על נתוני האימון והבדיקה
X_train = X_train.apply(apply_stemming)
X_test = X_test.apply(apply_stemming)

# Vectorization and Evaluation Function
def evaluate_model(ngram_range):
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Random Forest Classifier
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)

    print(f"\nEvaluating model with ngram_range={ngram_range}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

# Unigrams
evaluate_model((1, 1))

# Bigrams
evaluate_model((2, 2))

# Unigrams + Bigrams
evaluate_model((1, 2))



Evaluating model with ngram_range=(1, 1):
Accuracy: 0.7804473902236951
Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.81      0.80      1569
     neutral       0.78      0.65      0.71      1542
    positive       0.78      0.87      0.82      1717

    accuracy                           0.78      4828
   macro avg       0.78      0.78      0.78      4828
weighted avg       0.78      0.78      0.78      4828

Evaluating model with ngram_range=(2, 2):
Accuracy: 0.7359154929577465
Classification Report:
               precision    recall  f1-score   support

    negative       0.80      0.68      0.73      1569
     neutral       0.75      0.64      0.69      1542
    positive       0.69      0.87      0.77      1717

    accuracy                           0.74      4828
   macro avg       0.75      0.73      0.73      4828
weighted avg       0.74      0.74      0.73      4828

Evaluating model with ngram_range=(1, 2):
Accur