# Word Cloud and Sentiment Analysis

This notebook contains code for generating word clouds and performing sentiment analysis on text data.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load data
# Path of the CSV file to analyse; change DATA_PATH to point at your own file.
DATA_PATH = 'test.csv'
df = pd.read_csv(DATA_PATH)

# Quick sanity check: dimensions and column names, then a preview of the first rows
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.head()

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """
    Normalise a raw value into plain lower-case words.

    Values that ``pd.isna`` reports as missing become an empty string.
    Anything else is converted with ``str``, lower-cased, stripped of every
    character that is neither an ASCII letter nor whitespace, and returned
    with whitespace runs collapsed to single spaces and no leading or
    trailing space.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Digits, punctuation and every other non-letter, non-whitespace character are deleted
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    # split() without arguments discards leading, trailing and repeated whitespace
    return ' '.join(letters_only.split())

# Apply preprocessing
# Set TEXT_COLUMN to the name of the column that holds the raw text
TEXT_COLUMN = 'content'
if TEXT_COLUMN not in df.columns:
    # Fail early with an actionable message rather than a bare KeyError
    raise KeyError(
        f"Column {TEXT_COLUMN!r} not found; available columns: {df.columns.tolist()}"
    )
df['cleaned_content'] = df[TEXT_COLUMN].apply(preprocess_text)

In [None]:
# Sentiment analysis function
def get_sentiment(text, threshold=0.1):
    """
    Classify text as 'Positive', 'Negative' or 'Neutral' using TextBlob.

    Parameters
    ----------
    text : str
        Text to score; it is passed to ``TextBlob`` unchanged.
    threshold : float, default 0.1
        Half-width of the neutral band. A polarity strictly greater than
        ``threshold`` is 'Positive', strictly less than ``-threshold`` is
        'Negative', and anything in between (both bounds included) is
        'Neutral'.

    Returns
    -------
    str
        One of 'Positive', 'Negative' or 'Neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, because the positive and negative
        bands would then overlap.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")

    polarity = TextBlob(text).sentiment.polarity

    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# Label each cleaned text and store its raw polarity score next to the label.
# NOTE: every text is scored by TextBlob twice here: once inside
# get_sentiment and once more to record the numeric polarity.
cleaned_texts = df['cleaned_content']
df['sentiment'] = cleaned_texts.apply(get_sentiment)
df['polarity'] = cleaned_texts.apply(lambda text: TextBlob(text).sentiment.polarity)

# Show how many texts received each label
print("Sentiment Distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

In [None]:
# Generate word cloud
def create_wordcloud(text, title="Word Cloud"):
    """
    Create and display a word cloud for a collection of texts.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; missing entries are dropped before the remaining
        texts are joined with single spaces.
    title : str, default "Word Cloud"
        Title drawn above the image.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When there is nothing
        to draw, a message is printed and no figure is created.
    """
    # Combine all text into one space-separated string
    all_text = ' '.join(text.dropna())

    # WordCloud.generate raises ValueError when it receives no words (for
    # example when a sentiment subset has no rows), so skip instead of crashing.
    if not all_text.strip():
        print(f"Skipping '{title}': no text available.")
        return

    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=100,
            colormap='viridis'
        ).generate(all_text)
    except ValueError as exc:
        # Also reached when no words survive WordCloud's own stop-word filtering
        print(f"Skipping '{title}': {exc}")
        return

    # Display the rendered image without axes
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16)
    fig.tight_layout(pad=0)
    plt.show()

# Draw one cloud for the whole corpus, then one per polarised sentiment label
create_wordcloud(df['cleaned_content'], "Overall Word Cloud")
for sentiment_label, cloud_title in (
    ('Positive', "Positive Sentiment Word Cloud"),
    ('Negative', "Negative Sentiment Word Cloud"),
):
    matching_texts = df.loc[df['sentiment'] == sentiment_label, 'cleaned_content']
    create_wordcloud(matching_texts, cloud_title)

In [None]:
# Visualize sentiment distribution: two panels side by side
fig, (count_ax, polarity_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: number of texts per sentiment label
sns.countplot(data=df, x='sentiment', palette='viridis', ax=count_ax)
count_ax.set_title('Sentiment Distribution')
count_ax.set_xlabel('Sentiment')
count_ax.set_ylabel('Count')

# Right panel: histogram of polarity scores with a dashed reference line at zero
polarity_ax.hist(df['polarity'], bins=30, alpha=0.7, color='skyblue')
polarity_ax.set_title('Polarity Distribution')
polarity_ax.set_xlabel('Polarity Score')
polarity_ax.set_ylabel('Frequency')
polarity_ax.axvline(x=0, color='red', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

In [None]:
# Most common words analysis
# Words ignored by get_most_common_words unless the caller supplies its own set.
# Built once here instead of on every call.
_DEFAULT_STOP_WORDS = frozenset({
    'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
    'by', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
    'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
    'her', 'us', 'them',
})


def get_most_common_words(text_series, n=10, stop_words=None, min_length=3):
    """
    Get the most common words from a series of texts.

    Parameters
    ----------
    text_series : pandas.Series
        Series of strings; missing entries are dropped. Each text is split
        on whitespace.
    n : int, default 10
        Number of (word, count) pairs to return.
    stop_words : iterable of str, optional
        Words to ignore. Defaults to a small built-in English list.
    min_length : int, default 3
        Minimum number of characters a word needs to be counted.

    Returns
    -------
    list of (str, int)
        The ``n`` most frequent words with their counts, most frequent
        first, as produced by ``Counter.most_common``.
    """
    if stop_words is None:
        ignored = _DEFAULT_STOP_WORDS
    else:
        # Materialise as a set so membership tests stay O(1) for any iterable
        ignored = set(stop_words)

    # Count in a single pass instead of building the full word list first
    word_counts = Counter(
        word
        for text in text_series.dropna()
        for word in text.split()
        if len(word) >= min_length and word not in ignored
    )

    return word_counts.most_common(n)

# Rank words for the whole corpus and for each polarised sentiment subset
common_words = get_most_common_words(df['cleaned_content'])
positive_words = get_most_common_words(df.loc[df['sentiment'] == 'Positive', 'cleaned_content'])
negative_words = get_most_common_words(df.loc[df['sentiment'] == 'Negative', 'cleaned_content'])

# Print each ranking under its own heading, one "word: count" line per entry
for heading, ranked_words in (
    ("Most Common Words (Overall):", common_words),
    ("\nMost Common Words (Positive):", positive_words),
    ("\nMost Common Words (Negative):", negative_words),
):
    print(heading)
    for word, count in ranked_words:
        print(f"{word}: {count}")

In [None]:
# Export results
# Write the frame as it currently stands to CSV, without the index column
df.to_csv('sentiment_analysis_results.csv', index=False)
print("Results saved to 'sentiment_analysis_results.csv'")

# Summary statistics
polarity_scores = df['polarity']
sentiment_labels = df['sentiment']

print("\nSummary Statistics:")
print(f"Total texts analyzed: {len(df)}")
print(f"Average polarity: {polarity_scores.mean():.3f}")
print(f"Positive texts: {(sentiment_labels == 'Positive').sum()}")
print(f"Negative texts: {(sentiment_labels == 'Negative').sum()}")
print(f"Neutral texts: {(sentiment_labels == 'Neutral').sum()}")

# Show the first 100 characters of the texts with the highest and lowest polarity
most_positive_text = df.loc[polarity_scores.idxmax(), 'cleaned_content']
print(f"Most positive text: {most_positive_text[:100]}...")
most_negative_text = df.loc[polarity_scores.idxmin(), 'cleaned_content']
print(f"Most negative text: {most_negative_text[:100]}...")