# Mental Health Tweets Analysis During COVID-19 Pandemic

## 📊 Comprehensive Data Analysis & Visualization Project

**Purpose:** Analyze mental health discourse on social media during the pandemic (2020-2022)

**Author:** Data Analysis Project  
**Date:** November 2025

---

### Project Overview

This notebook provides a complete analysis of mental health-related tweets during the COVID-19 pandemic, including:
- Data loading and exploration
- Sentiment analysis
- Temporal trends
- Category distribution
- Engagement metrics
- Visualization for web presentation

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from datetime import datetime
import json
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Give every figure in the notebook the same grid style, canvas size and font size
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),  # default figure size in inches (width, height)
    'font.size': 12,
})

# Confirm the imports worked and record the library versions in the cell output
print("‚úÖ All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 2. Load and Explore the Dataset

In [None]:
# Read the tweet export; the path is resolved against the current working directory
df = pd.read_csv('mental_health_tweets.csv')

# Parse the 'date' column into timestamps so it supports min/max and the .dt accessor
df['date'] = pd.to_datetime(df['date'])

# Headline facts about the dataset, framed by 60-character rules
banner = "=" * 60
print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"Total number of tweets: {len(df)}")

first_day = df['date'].min().date()
last_day = df['date'].max().date()
print(f"Date range: {first_day} to {last_day}")
print(f"Columns: {', '.join(df.columns)}")
print(f"\nDataset shape: {df.shape}")
print("\n" + banner)

# Preview the first rows; as the cell's final expression it is rendered by the notebook
print("\nFirst 5 tweets:")
df.head()

In [None]:
rule = "=" * 60

# Number of null entries in each column
print("Missing Values:")
print(df.isna().sum())
print("\n" + rule)

# dtype pandas assigned to each column
print("\nData Types:")
print(df.dtypes)
print("\n" + rule)

# Summary statistics; kept as the final expression so the notebook renders the table
print("\nBasic Statistics:")
df.describe()

## 3. Sentiment Distribution Analysis

In [None]:
# Tweets per sentiment label, plus each label's share of the total in percent
sentiment_counts = df['sentiment'].value_counts()
sentiment_pct = df['sentiment'].value_counts(normalize=True) * 100

rule = "=" * 60
print("SENTIMENT DISTRIBUTION")
print(rule)
for label, n_tweets in sentiment_counts.items():
    print(f"{label.capitalize()}: {n_tweets} tweets ({sentiment_pct[label]:.1f}%)")
print(rule)

# One figure with two panels: bar chart on the left, pie chart on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Colours are looked up per label, in the order value_counts() returned the labels,
# so each bar and pie wedge gets the colour belonging to its own sentiment
colors = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#95a5a6'}
sentiment_colors = [colors[label] for label in sentiment_counts.index]

# Left panel: bar chart with the count written above each bar
ax1.bar(
    sentiment_counts.index,
    sentiment_counts.values,
    color=sentiment_colors,
    edgecolor='black',
    linewidth=2,
)
ax1.set_title('Sentiment Distribution - Bar Chart', fontsize=14, fontweight='bold')
ax1.set_xlabel('Sentiment', fontsize=12)
ax1.set_ylabel('Number of Tweets', fontsize=12)
ax1.grid(axis='y', alpha=0.3)
for position, height in enumerate(sentiment_counts.values):
    # +1 lifts the label just clear of the top of the bar
    ax1.text(position, height + 1, str(height), ha='center', fontweight='bold', fontsize=12)

# Right panel: the same counts as percentage wedges
ax2.pie(
    sentiment_counts.values,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=sentiment_colors,
    startangle=90,
    textprops={'fontsize': 12, 'fontweight': 'bold'},
)
ax2.set_title('Sentiment Distribution - Pie Chart', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Category Distribution Analysis

In [None]:
# Tweets per mental-health category, most frequent first
category_counts = df['category'].value_counts()

rule = "=" * 60
print("MENTAL HEALTH CATEGORY DISTRIBUTION")
print(rule)
for category, n_tweets in category_counts.items():
    # Share is taken over all rows of df, not only rows that have a category
    pct = (n_tweets / len(df)) * 100
    print(f"{category.capitalize()}: {n_tweets} tweets ({pct:.1f}%)")
print(rule)

# Horizontal bar chart, one bar per category, coloured from the Set3 colormap
plt.figure(figsize=(12, 8))
colors_cat = plt.cm.Set3(range(len(category_counts)))
plt.barh(
    category_counts.index,
    category_counts.values,
    color=colors_cat,
    edgecolor='black',
    linewidth=1.5,
)
plt.title('Mental Health Categories Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Number of Tweets', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.grid(axis='x', alpha=0.3)

# Write each count just to the right of the end of its bar
for row, width in enumerate(category_counts.values):
    plt.text(width + 0.3, row, str(width), va='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Temporal Analysis - Timeline of Tweets

In [None]:
# Bucket every tweet into its calendar month; the column is stored on df
df['year_month'] = df['date'].dt.to_period('M')

# Number of tweets in each month
timeline_data = df.groupby('year_month').size()

rule = "=" * 60
print("TWEET VOLUME OVER TIME")
print(rule)
busiest_month = timeline_data.idxmax()
quietest_month = timeline_data.idxmin()
print(f"Peak month: {busiest_month} with {timeline_data.max()} tweets")
print(f"Lowest month: {quietest_month} with {timeline_data.min()} tweets")
print(rule)

# Line chart of monthly volume with a dashed reference line at the monthly average
monthly_average = timeline_data.mean()

plt.figure(figsize=(14, 6))
timeline_data.plot(kind='line', marker='o', linewidth=2.5, markersize=8, color='#3498db')
plt.title('Mental Health Tweet Volume Over Time (2020-2022)', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Tweets', fontsize=12)
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.axhline(
    y=monthly_average,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Average: {monthly_average:.1f}',
)
plt.legend()
plt.tight_layout()
plt.show()

## 6. Sentiment Trends Over Time

In [None]:
# Monthly tweet counts per sentiment: one row per month, one column per label.
# Reads df['year_month'], so that column must already exist on df.
sentiment_timeline = df.groupby(['year_month', 'sentiment']).size().unstack(fill_value=0)

# groupby/unstack return the label columns in sorted order (negative, neutral,
# positive), so colours are looked up by column name. A fixed positional list
# would paint negative tweets green and positive tweets grey.
sentiment_palette = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#95a5a6'}
fallback_color = '#34495e'  # used for any label that is not in the palette
area_colors = [sentiment_palette.get(label, fallback_color)
               for label in sentiment_timeline.columns]

# Stacked area chart
plt.figure(figsize=(14, 7))
sentiment_timeline.plot(kind='area', stacked=True,
                        color=area_colors,
                        alpha=0.7, ax=plt.gca())
plt.title('Sentiment Trends Over Time (Stacked Area)', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Tweets', fontsize=12)
plt.legend(title='Sentiment', loc='upper left', fontsize=11)
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Line chart for individual sentiments; labels absent from the data are skipped
plt.figure(figsize=(14, 7))
for sentiment, color in sentiment_palette.items():
    if sentiment in sentiment_timeline.columns:
        sentiment_timeline[sentiment].plot(kind='line', marker='o', linewidth=2.5,
                                          markersize=7, color=color, label=sentiment.capitalize())

plt.title('Sentiment Trends Over Time (Individual Lines)', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Tweets', fontsize=12)
plt.legend(title='Sentiment', fontsize=11)
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---

## 📝 Conclusion

This comprehensive analysis of mental health tweets during the COVID-19 pandemic (2020-2022) reveals important patterns:

### Key Takeaways:
1. **Significant Mental Health Discourse**: The pandemic sparked widespread discussion about mental health
2. **Diverse Categories**: Multiple aspects covered - anxiety, depression, wellness, therapy, stress, support
3. **Global Impact**: Geographic distribution shows worldwide concern about pandemic mental health
4. **High Engagement**: Strong public interest indicated by likes and retweets
5. **Temporal Patterns**: Tweet volume and sentiment varied across different pandemic phases

### For Presentation:
- Use the interactive **web dashboard** (`index.html`) for visual presentation
- Reference specific **statistics** and **charts** from this notebook
- Highlight **key insights** and **trends** discovered in the analysis
- Discuss **real-world implications** for mental health support and awareness

### Next Steps:
- Run `analyze_tweets.py` to generate all visualization files
- Open `index.html` to view the interactive dashboard
- Share findings with stakeholders
- Consider expanding analysis with more data sources

---

**Remember:** Mental health matters. This analysis highlights the importance of continued support and awareness. 💚

In [None]:
# Make sure the folder that receives the exported file exists
import os
if not os.path.exists('visualizations'):
    os.makedirs('visualizations')
    print("‚úÖ Created 'visualizations' directory")

# Values used by more than one entry of the summary are computed once
sentiment_tally = df['sentiment'].value_counts()
first_date = df['date'].min()
last_date = df['date'].max()

# Summary dictionary written to JSON below; key order is the order in the file
summary_stats = {
    'total_tweets': int(len(df)),
    'date_range': {
        'start': first_date.strftime('%Y-%m-%d'),
        'end': last_date.strftime('%Y-%m-%d'),
    },
    'sentiment_distribution': sentiment_tally.to_dict(),
    'sentiment_percentages': {
        label: round((n_tweets / len(df)) * 100, 1)
        for label, n_tweets in sentiment_tally.items()
    },
    'top_categories': df['category'].value_counts().head(5).to_dict(),
    'total_engagement': {
        'likes': int(df['likes'].sum()),
        'retweets': int(df['retweets'].sum()),
    },
    'avg_engagement': {
        'likes': round(df['likes'].mean(), 2),
        'retweets': round(df['retweets'].mean(), 2),
    },
    'locations': df['location'].value_counts().to_dict(),
}

with open('visualizations/summary_stats.json', 'w') as stats_file:
    json.dump(summary_stats, stats_file, indent=2)

# Tell the reader what was produced and how to open the dashboard
for line in (
    "‚úÖ Exported summary statistics to JSON",
    "\nüìÅ Files ready for web dashboard:",
    "  ‚Ä¢ mental_health_tweets.csv - Original dataset",
    "  ‚Ä¢ visualizations/summary_stats.json - Summary statistics",
    "  ‚Ä¢ index.html - Interactive web dashboard",
    "\nüåê To view the dashboard:",
    "  1. Open 'index.html' in a web browser",
    "  2. Or run: python -m http.server 8000",
    "  3. Then navigate to: http://localhost:8000",
):
    print(line)

## 13. Export Data for Web Dashboard

Run this section after generating all visualizations to prepare data for the web dashboard.

In [None]:
# Print a plain-text recap of the analysis.
# Reads df['year_month'], so that column must already exist on df.
heavy_rule = "=" * 70
print(heavy_rule)
print(" "*15 + "KEY INSIGHTS AND SUMMARY")
print(heavy_rule)

# Values that several lines below reuse are computed once
total_tweets = len(df)
first_date = df['date'].min()
last_date = df['date'].max()
span_days = (last_date - first_date).days

print("\nüìä DATASET OVERVIEW:")
print(f"  ‚Ä¢ Total Tweets Analyzed: {total_tweets:,}")
print(f"  ‚Ä¢ Date Range: {first_date.date()} to {last_date.date()}")
print(f"  ‚Ä¢ Duration: {span_days} days (~{span_days/365:.1f} years)")

# Count and share for each of the three expected sentiment labels
print("\nüí≠ SENTIMENT ANALYSIS:")
for sentiment in ('positive', 'negative', 'neutral'):
    count = int((df['sentiment'] == sentiment).sum())
    pct = (count / total_tweets) * 100
    print(f"  ‚Ä¢ {sentiment.capitalize()}: {count} tweets ({pct:.1f}%)")

# Five most frequent categories, numbered from 1
print("\nüè∑Ô∏è TOP MENTAL HEALTH CATEGORIES:")
top_five = df['category'].value_counts().head(5)
for idx, (category, count) in enumerate(top_five.items(), start=1):
    pct = (count / total_tweets) * 100
    print(f"  {idx}. {category.capitalize()}: {count} tweets ({pct:.1f}%)")

print("\nüìç GEOGRAPHIC DISTRIBUTION:")
for location, count in df['location'].value_counts().items():
    pct = (count / total_tweets) * 100
    print(f"  ‚Ä¢ {location}: {count} tweets ({pct:.1f}%)")

likes = df['likes']
retweets = df['retweets']
print("\nüëç ENGAGEMENT METRICS:")
print(f"  ‚Ä¢ Total Likes: {likes.sum():,}")
print(f"  ‚Ä¢ Total Retweets: {retweets.sum():,}")
print(f"  ‚Ä¢ Average Likes per Tweet: {likes.mean():.1f}")
print(f"  ‚Ä¢ Average Retweets per Tweet: {retweets.mean():.1f}")
print(f"  ‚Ä¢ Most Liked Tweet: {likes.max()} likes")
print(f"  ‚Ä¢ Most Retweeted Tweet: {retweets.max()} retweets")

# Monthly counts are grouped once and reused for the peak and the average
print("\nüìà TEMPORAL PATTERNS:")
monthly_counts = df.groupby('year_month').size()
peak_month = monthly_counts.idxmax()
peak_count = monthly_counts.max()
print(f"  ‚Ä¢ Peak Month: {peak_month} ({peak_count} tweets)")
print(f"  ‚Ä¢ Average Tweets per Month: {monthly_counts.mean():.1f}")

# Fixed narrative text: these lines are not derived from the data
print("\nüîç KEY FINDINGS:")
for finding in (
    "  1. Mental health discourse on social media increased significantly during pandemic",
    "  2. Wellness and therapy-related content received high engagement",
    "  3. Anxiety and depression were frequently discussed topics",
    "  4. Positive sentiment tweets often promoted self-care and support",
    "  5. Geographic analysis shows global nature of pandemic's mental health impact",
    "  6. Engagement patterns suggest strong public interest in mental health topics",
):
    print(finding)

print("\nüíö RECOMMENDATIONS:")
for recommendation in (
    "  ‚Ä¢ Continue promoting mental health awareness and resources",
    "  ‚Ä¢ Support accessible therapy and counseling services",
    "  ‚Ä¢ Foster online communities for mental health support",
    "  ‚Ä¢ Encourage positive mental health discourse",
    "  ‚Ä¢ Monitor and respond to emerging mental health trends",
):
    print(recommendation)

print("\n" + heavy_rule)
print(" "*20 + "END OF ANALYSIS")
print(heavy_rule)

## 12. Key Insights and Summary

In [None]:
# Share of each sentiment within every category, as row percentages
sentiment_by_category = pd.crosstab(df['category'], df['sentiment'], normalize='index') * 100

# Heatmap of those percentages (rows: category, columns: sentiment)
plt.figure(figsize=(10, 8))
sns.heatmap(sentiment_by_category, annot=True, fmt='.1f', cmap='RdYlGn',
            cbar_kws={'label': 'Percentage (%)'}, linewidths=1, linecolor='black')
plt.title('Sentiment Distribution by Mental Health Category (%)', fontsize=16, fontweight='bold')
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.tight_layout()
plt.show()

# Stacked bar chart of raw counts.
# crosstab returns the sentiment columns in sorted order (negative, neutral,
# positive), so colours are looked up by column name. A fixed positional list
# would draw negative in green and positive in grey.
sentiment_by_category_counts = pd.crosstab(df['category'], df['sentiment'])
sentiment_palette = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#95a5a6'}
fallback_color = '#34495e'  # used for any label that is not in the palette
bar_colors = [sentiment_palette.get(label, fallback_color)
              for label in sentiment_by_category_counts.columns]

sentiment_by_category_counts.plot(kind='barh', stacked=True,
                                  color=bar_colors,
                                  figsize=(12, 8), edgecolor='black', linewidth=1)
plt.title('Sentiment Distribution by Category (Count)', fontsize=16, fontweight='bold')
plt.xlabel('Number of Tweets', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 11. Cross-Analysis: Sentiment by Category

In [None]:
import re

# Function to clean text
def clean_text(text):
    """Strip URLs, @mentions and '#' symbols from a string.

    Args:
        text: The string to clean.

    Returns:
        The text with URL tokens and @mentions removed and every '#'
        character deleted, so a hashtag's word is kept. The whitespace
        around removed tokens is left in place.
    """
    # URLs must start at a word boundary; without that, ordinary words that
    # merely contain "www" (e.g. "awwww") would be cut apart.
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)
    # Drop whole @mentions, but only the '#' of a hashtag
    text = re.sub(r'@\w+|#', '', text)
    return text

# Generate word clouds for each sentiment, one panel per label
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

sentiments = ['positive', 'negative', 'neutral']
colormaps = ['Greens', 'Reds', 'Greys']

for ax, sentiment, colormap in zip(axes, sentiments, colormaps):
    # Missing tweet bodies (NaN) are floats and would make str.join raise,
    # so they are dropped before joining.
    tweets = df.loc[df['sentiment'] == sentiment, 'tweet_text'].dropna().astype(str)
    text = clean_text(' '.join(tweets))

    if text.strip():
        wordcloud = WordCloud(width=600, height=400,
                              background_color='white',
                              colormap=colormap,
                              max_words=80).generate(text)
        ax.imshow(wordcloud, interpolation='bilinear')
    else:
        # Empty text is not passed to WordCloud.generate(), which is expected to
        # raise ValueError when it finds no words; the panel gets a note instead.
        ax.text(0.5, 0.5, 'No tweets available', ha='center', va='center',
                transform=ax.transAxes)

    ax.axis('off')
    ax.set_title(f'{sentiment.capitalize()} Sentiment Word Cloud',
                 fontsize=14, fontweight='bold', pad=10)

plt.tight_layout()
plt.show()

# Overall word cloud across every tweet
print("\nGenerating overall word cloud for all tweets...")
all_text = ' '.join(df['tweet_text'].dropna().astype(str))
all_text = clean_text(all_text)

plt.figure(figsize=(14, 7))
if all_text.strip():
    wordcloud_all = WordCloud(width=1200, height=600,
                              background_color='white',
                              colormap='viridis',
                              max_words=100).generate(all_text)
    plt.imshow(wordcloud_all, interpolation='bilinear')
else:
    # Same guard as above: no text means there is nothing to draw
    plt.text(0.5, 0.5, 'No tweets available', ha='center', va='center',
             transform=plt.gca().transAxes)
plt.axis('off')
plt.title('Word Cloud - All Mental Health Tweets', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## 10. Word Cloud Visualizations

In [None]:
# Tweets per location, plus each location's share of the total in percent
location_counts = df['location'].value_counts()
location_pct = df['location'].value_counts(normalize=True) * 100

rule = "=" * 60
print("GEOGRAPHIC DISTRIBUTION OF TWEETS")
print(rule)
for place, n_tweets in location_counts.items():
    print(f"{place}: {n_tweets} tweets ({location_pct[place]:.1f}%)")
print(rule)

# One figure with two panels: pie chart on the left, bar chart on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# One Set3 colour per location, shared by both panels
colors_loc = plt.cm.Set3(range(len(location_counts)))

# Left panel: share of tweets per location
ax1.pie(
    location_counts.values,
    labels=location_counts.index,
    autopct='%1.1f%%',
    colors=colors_loc,
    startangle=90,
    textprops={'fontsize': 11, 'fontweight': 'bold'},
)
ax1.set_title('Geographic Distribution of Tweets', fontsize=14, fontweight='bold')

# Right panel: absolute counts, with the value written above each bar
ax2.bar(
    location_counts.index,
    location_counts.values,
    color=colors_loc,
    edgecolor='black',
    linewidth=1.5,
)
ax2.set_title('Tweets by Location', fontsize=14, fontweight='bold')
ax2.set_xlabel('Location', fontsize=12)
ax2.set_ylabel('Number of Tweets', fontsize=12)
ax2.grid(axis='y', alpha=0.3)
for position, height in enumerate(location_counts.values):
    ax2.text(position, height + 0.5, str(height), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 9. Geographic Distribution

In [None]:
# Mean, total and median of likes and retweets for each sentiment label
engagement_by_sentiment = df.groupby('sentiment')[['likes', 'retweets']].agg(['mean', 'sum', 'median'])

print("ENGAGEMENT METRICS BY SENTIMENT")
print("="*60)
print(engagement_by_sentiment)
print("="*60)

# groupby() returns the sentiment labels in sorted order (negative, neutral,
# positive), so bar colours are looked up per label. A fixed positional list
# would colour the negative bar green and the positive bar grey.
sentiment_palette = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#95a5a6'}
fallback_color = '#34495e'  # used for any label that is not in the palette

# 2x2 grid of engagement views
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Average Likes by Sentiment
avg_likes = df.groupby('sentiment')['likes'].mean()
axes[0, 0].bar(avg_likes.index, avg_likes.values,
               color=[sentiment_palette.get(label, fallback_color) for label in avg_likes.index],
               edgecolor='black')
axes[0, 0].set_title('Average Likes by Sentiment', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('Average Likes')
axes[0, 0].grid(axis='y', alpha=0.3)

# Average Retweets by Sentiment
avg_retweets = df.groupby('sentiment')['retweets'].mean()
axes[0, 1].bar(avg_retweets.index, avg_retweets.values,
               color=[sentiment_palette.get(label, fallback_color) for label in avg_retweets.index],
               edgecolor='black')
axes[0, 1].set_title('Average Retweets by Sentiment', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Average Retweets')
axes[0, 1].grid(axis='y', alpha=0.3)

# Total Engagement by Category (Top 5): likes + retweets summed per category
top_categories_eng = df.groupby('category')[['likes', 'retweets']].sum().sum(axis=1).nlargest(5)
axes[1, 0].barh(top_categories_eng.index, top_categories_eng.values, color='#3498db', edgecolor='black')
axes[1, 0].set_title('Total Engagement by Top 5 Categories', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Total Engagement (Likes + Retweets)')
axes[1, 0].grid(axis='x', alpha=0.3)

# Scatter plot: Likes vs Retweets, one point per tweet coloured by sentiment.
# Unmapped or missing labels get the fallback colour; a NaN colour would not be
# a valid value for scatter().
point_colors = df['sentiment'].map(sentiment_palette).fillna(fallback_color)
axes[1, 1].scatter(df['likes'], df['retweets'], c=point_colors,
                   alpha=0.6, s=80, edgecolor='black', linewidth=0.5)
axes[1, 1].set_title('Likes vs Retweets Relationship', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Likes')
axes[1, 1].set_ylabel('Retweets')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Engagement Analysis

In [None]:
# Monthly tweet counts per category: one row per month, one column per category.
# Reads df['year_month'], so that column must already exist on df.
category_timeline = df.groupby(['year_month', 'category']).size().unstack(fill_value=0)

# One line per category on a shared axis
plt.figure(figsize=(14, 8))
for name, monthly_series in category_timeline.items():
    monthly_series.plot(kind='line', marker='o', linewidth=2, markersize=6, label=name)

plt.title('Mental Health Category Trends Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Tweets', fontsize=12)
plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Heatmap restricted to the five most frequent categories
top_categories = df['category'].value_counts().head(5).index
in_top_five = df['category'].isin(top_categories)
category_heatmap = (
    df[in_top_five]
    .groupby(['year_month', 'category'])
    .size()
    .unstack(fill_value=0)
)

plt.figure(figsize=(14, 6))
# Transposed so categories are rows and months run along the x axis
sns.heatmap(category_heatmap.T, cmap='YlOrRd', annot=True, fmt='d', cbar_kws={'label': 'Tweet Count'})
plt.title('Top 5 Mental Health Categories - Heatmap Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Category Trends Over Time