In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

# Pattern for the parts of a tweet that are stripped before vectorizing.
# Alternatives are tried left to right at each position:
#   1. @mentions made of ASCII letters/digits,
#   2. any single character that is not an ASCII letter, digit, space or tab
#      (this is what removes punctuation and emoji),
#   3. scheme://... URLs up to the next whitespace.
# Written as a raw string so \w, \/ and \S reach the regex engine as-is instead
# of relying on Python passing unknown string escapes through.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


# Function to clean tweet text
def clean_tweet(tweet):
    """Strip mentions, URLs and non-alphanumeric characters from a tweet.

    Every match of the noise pattern is replaced by a space, then runs of
    whitespace are collapsed to single spaces and the ends are trimmed.

    Args:
        tweet: Raw tweet text. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for an empty CSV cell, or ``None``) is treated
            as an empty tweet.

    Returns:
        The cleaned text as a single-space-separated string; ``""`` when the
        input is empty, contains only noise, or is not a string.
    """
    if not isinstance(tweet, str):
        # Missing values would otherwise make re.sub raise TypeError and abort
        # a whole DataFrame.apply / CSV prediction run.
        return ""
    return ' '.join(_TWEET_NOISE_RE.sub(" ", tweet).split())

# --- Load the labelled tweets ---
data = pd.read_csv("test_train.csv")  # Replace with your training data path

# Store a cleaned copy of every tweet next to the raw text
data['cleaned_content'] = data['content'].apply(clean_tweet)

# Report how many rows carry each sentiment label before any resampling
print("Sentiment Distribution before balancing:")
print(data['sentiment'].value_counts())

# Model input is the cleaned text; the target is the sentiment label
X, y = data['cleaned_content'], data['sentiment']

# Hold out 20% of the rows for evaluation, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, limited to 5000 features. The vectorizer is
# fitted on the training split only and then reused, unchanged, on the test split.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Oversample with SMOTE; only the training split is resampled.
# NOTE(review): the recorded run of this cell ended with an ImportError for
# `_MissingValues` from sklearn.utils._param_validation - presumably an
# imbalanced-learn / scikit-learn version mismatch; confirm the installed versions.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_vect, y_train)

print("Sentiment Distribution after balancing:")
print(Counter(y_train_balanced))

# Candidate classifiers, keyed by the display name used in the reports below.
# Insertion order is the order in which they are trained and reported.
models = {}
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Support Vector Machine"] = SVC(kernel='linear', random_state=42)

# Fit each classifier on the balanced training data, then score it on the
# untouched (non-resampled) test split.
for model_name, model in models.items():
    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test_vect)

    print(f"\n{model_name} Accuracy: {accuracy_score(y_test, y_pred)}")
    report = classification_report(y_test, y_pred)
    print(report)

# Function to predict the sentiment of a tweet
def get_tweet_sentiment(tweet, vectorizer, model):
    """Predict the sentiment label of a single raw tweet.

    Args:
        tweet: Raw tweet text; it is passed through ``clean_tweet`` first.
        vectorizer: Object whose ``transform`` turns a list of strings into the
            feature representation ``model`` accepts.
        model: Object whose ``predict`` returns one label per input row.

    Returns:
        The first (and only) element of ``model.predict`` for this tweet.
    """
    features = vectorizer.transform([clean_tweet(tweet)])
    predictions = model.predict(features)
    return predictions[0]

# Load new tweets from a CSV file for prediction
def get_tweets_from_csv(file_path, vectorizer, model):
    """Read tweets from a CSV file and attach a predicted sentiment to each row.

    The file must have a ``content`` column. All rows are cleaned, vectorized
    and classified in one batch rather than one ``predict`` call per row.

    This function is best-effort: on failure it prints a message that names
    the stage that failed and returns an empty list instead of raising.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        vectorizer: Fitted object whose ``transform`` maps a list of strings to
            the feature representation ``model`` accepts.
        model: Trained object whose ``predict`` returns one label per row.

    Returns:
        A list with one dict per CSV row (column name -> value) where the
        ``sentiment`` key holds the predicted label (overwriting any
        ``sentiment`` column already in the file). ``[]`` when the file cannot
        be read, has no ``content`` column, has no rows, or prediction fails.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading CSV: {str(e)}")
        return []

    if 'content' not in frame.columns:
        print(f"Error reading CSV: no 'content' column in {file_path}")
        return []

    tweets = frame.to_dict(orient='records')
    if not tweets:
        # Nothing to classify; also avoids calling predict on zero samples.
        return []

    try:
        # Empty cells arrive as NaN (not str); treat them as empty tweets so a
        # single missing value does not discard the whole file.
        cleaned = [
            clean_tweet(text) if isinstance(text, str) else ""
            for text in frame['content']
        ]
        labels = model.predict(vectorizer.transform(cleaned))
    except Exception as e:
        # Kept best-effort like the file-reading stage, but reported as a
        # prediction failure rather than mislabelled as a CSV error.
        print(f"Error predicting sentiments: {str(e)}")
        return []

    for tweet, label in zip(tweets, labels):
        tweet['sentiment'] = label
    return tweets

# Location of the CSV whose tweets should be scored
new_csv_path = "path_to_new_tweets.csv"  # Replace with the path to your new CSV file

# Score every row with the Random Forest classifier
predicted_tweets = get_tweets_from_csv(
    new_csv_path,
    vectorizer,
    models["Random Forest"],
)

# Print each tweet together with its predicted label
print("\nPredicted Sentiments for New Tweets:")
for tweet in predicted_tweets:
    print(f"Tweet: {tweet['content']}\nSentiment: {tweet['sentiment']}\n")

# Spot-check one hand-written tweet with the same classifier
random_tweet = "I love sunny days!"
random_tweet_sentiment = get_tweet_sentiment(random_tweet, vectorizer, models['Random Forest'])
print(f"\nManual Check - Tweet: {random_tweet}\nSentiment: {random_tweet_sentiment}")


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\Users\USER\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
# Path to the new CSV file containing tweets to analyze
# NOTE(review): this is the same file name the first cell loads as training
# data, so these predictions are not a measure of accuracy on unseen tweets.
new_csv_path = "test_train.csv"  # Replace with the path to your new CSV file

# get_tweets_from_csv takes the fitted vectorizer and a trained model in
# addition to the path; calling it with the path alone raises TypeError.
# Random Forest is used to match the prediction call in the first cell.
predicted_tweets = get_tweets_from_csv(new_csv_path, vectorizer, models["Random Forest"])

# Display the results
print("\nPredicted Sentiments for New Tweets:")
for tweet in predicted_tweets:
    print(f"Tweet: {tweet['content']}\nSentiment: {tweet['sentiment']}\n")




Predicted Sentiments for New Tweets:
Tweet: Just had a great day at the beach!
Sentiment: Positive

Tweet: Feeling frustrated with the traffic today 😞
Sentiment: Negative

Tweet: Excited to announce our new project launch!
Sentiment: Positive

Tweet: So grateful for all the love and support ❤️
Sentiment: Positive

Tweet: Stuck in a never-ending meeting 😩
Sentiment: Negative

Tweet: Enjoying a peaceful evening at home 🏡
Sentiment: Positive

Tweet: Disappointed with the customer service 😔
Sentiment: Negative

Tweet: Can't wait for the weekend to start!
Sentiment: Positive

Tweet: Feeling overwhelmed with work deadlines 😓
Sentiment: Negative

Tweet: Just finished a great workout 💪
Sentiment: Positive

Tweet: Just had a great day at the beach!
Sentiment: Positive

Tweet: Feeling frustrated with the traffic today 😞
Sentiment: Negative

Tweet: Excited to announce our new project launch!
Sentiment: Positive

Tweet: So grateful for all the love and support ❤️
Sentiment: Positive

Tweet: Stuck

In [None]:
# Manually check the sentiment of a tweet typed in by the user.
# `selected_model` is not assigned anywhere else in this notebook, so bind it
# here explicitly; Random Forest matches the model used for the predictions in
# the first cell. Change the key to try one of the other trained classifiers.
selected_model = models["Random Forest"]

manual_tweet = input("\nEnter a tweet to check its sentiment: ")
manual_sentiment = get_tweet_sentiment(manual_tweet, vectorizer, selected_model)
print(f"The sentiment of the tweet is: {manual_sentiment}")

The sentiment of the tweet is: Negative
