In [20]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk

In [21]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean tweet text
def clean_tweet(tweet):
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)
    tweet = tweet.lower()
    tokens = tweet.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    cleaned_tweet = ' '.join(tokens)
    return cleaned_tweet

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Load dataset
data = pd.read_csv("mobile_phone_tweets_balanced.csv")  # Replace with your training data path

# Clean the tweet content
data['cleaned_content'] = data['content'].apply(clean_tweet)

In [23]:
# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(data['cleaned_content'])
y = data['sentiment']  # Assuming the training data has a 'sentiment' column with labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [24]:
# Define models with hyperparameter tuning
models = {
    "Random Forest": GridSearchCV(RandomForestClassifier(random_state=42), {'n_estimators': [100, 200, 300]}, cv=5),
    "Logistic Regression": GridSearchCV(LogisticRegression(random_state=42), {'C': [0.01, 0.1, 1, 10]}, cv=5),
    "SVM": GridSearchCV(SVC(random_state=42), {'C': [0.01, 0.1, 1, 10]}, cv=5),
    "Naive Bayes": MultinomialNB()
}

In [25]:
# Train and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"\n{model_name} Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")



Random Forest Accuracy: 0.3275, Precision: 0.3310, Recall: 0.3276, F1-Score: 0.3262


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression Accuracy: 0.3325, Precision: 0.2186, Recall: 0.3333, F1-Score: 0.2601

SVM Accuracy: 0.3450, Precision: 0.1150, Recall: 0.3333, F1-Score: 0.1710

Naive Bayes Accuracy: 0.3600, Precision: 0.3628, Recall: 0.3598, F1-Score: 0.3593


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Select the best model (for demonstration, using Naive Bayes)
selected_model = models["Naive Bayes"]
selected_model.fit(X_train, y_train)

# Function to predict the sentiment of a tweet
def get_tweet_sentiment(tweet, vectorizer, model):
    cleaned_tweet = clean_tweet(tweet)
    tweet_vector = vectorizer.transform([cleaned_tweet])
    sentiment = model.predict(tweet_vector)[0]
    return sentiment

In [27]:
# Predict sentiments for the dataset
data['predicted_sentiment'] = data['content'].apply(lambda x: get_tweet_sentiment(x, vectorizer, selected_model))

# Map numerical sentiment labels to string labels (if applicable)
sentiment_mapping = {1: 'positive', 0: 'neutral', -1: 'negative'}
data['predicted_sentiment'] = data['predicted_sentiment'].map(sentiment_mapping)

# Aggregate sentiment scores for each product
product_sentiment = data.groupby(['product', 'predicted_sentiment']).size().unstack(fill_value=0)
product_sentiment = product_sentiment.div(product_sentiment.sum(axis=1), axis=0)  # Normalize to get proportions

# Display the comparison of the most used phones
print(product_sentiment)


Empty DataFrame
Columns: []
Index: []


In [16]:
# Plot a stacked bar chart for sentiment distribution
ax = product_sentiment.plot(kind='bar', stacked=True, figsize=(12, 8), color=['red', 'blue', 'green'])
plt.title('Sentiment Distribution for Different Products')
plt.xlabel('Product')
plt.ylabel('Proportion')
plt.legend(title='Sentiment')
plt.show()

NameError: name 'product_sentiment' is not defined