# Exploratory Data Analysis - Fitness App Reviews

This notebook explores the fitness app reviews dataset to understand its characteristics and prepare it for sentiment analysis.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Set plot style
plt.style.use('ggplot')
sns.set(font_scale=1.2)

# Download NLTK resources
# 'punkt' serves older NLTK releases; newer releases make word_tokenize load
# 'punkt_tab' instead, so fetch both to keep a fresh-kernel run working.
# 'stopwords' backs stopwords.words('english').
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Read the reviews CSV into a DataFrame
# (this path points at the generated dummy data)
df = pd.read_csv(filepath_or_buffer='../data/fitness_app_reviews.csv')

# Preview the leading rows as the cell's rich output
df.head(n=5)

In [None]:
# Summarise the dataset: dimensions first, then one bullet per column
print(f"Dataset shape: {df.shape}")
print("\nDataset columns:")
for bullet in map("- {}".format, df.columns):
    print(bullet)

# Count missing entries per column
print("\nMissing values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

## Rating Distribution

In [None]:
# Bar chart of how many reviews carry each rating
plt.figure(figsize=(10, 6))
rating_ax = sns.countplot(x='rating', data=df, palette='viridis')
rating_ax.set_title('Distribution of Ratings', fontsize=16)
rating_ax.set_xlabel('Rating', fontsize=14)
rating_ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Label every bar with its share of all reviews, centred just above the bar
total = len(df)
for bar in rating_ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    rating_ax.annotate(f'{100 * bar_height / total:.1f}%', (bar_center, bar_height),
                       ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Map ratings to sentiment categories
def get_sentiment(rating):
    """Map a numeric rating to a coarse sentiment label.

    Args:
        rating: Numeric rating; may be missing (None or NaN).

    Returns:
        'Positive' when rating >= 4, 'Neutral' when 2 <= rating < 4,
        'Negative' for anything lower, or None when the rating is missing.
    """
    # A NaN fails every comparison below and would silently be labelled
    # 'Negative'; None would raise TypeError. Report missing as missing.
    if pd.isna(rating):
        return None

    if rating >= 4:
        return 'Positive'
    # NOTE(review): with this threshold 2-star ratings land in 'Neutral' --
    # confirm that is intended rather than `>= 3`.
    if rating >= 2:
        return 'Neutral'
    return 'Negative'

# Derive one sentiment label per review from its rating
df['sentiment'] = df['rating'].apply(get_sentiment)

# Bar chart of how many reviews fall into each sentiment category
plt.figure(figsize=(10, 6))
sentiment_ax = sns.countplot(x='sentiment', data=df, palette='RdYlGn')
sentiment_ax.set_title('Distribution of Sentiment Categories', fontsize=16)
sentiment_ax.set_xlabel('Sentiment', fontsize=14)
sentiment_ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Label every bar with its share of all reviews, centred just above the bar
total = len(df)
for bar in sentiment_ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    sentiment_ax.annotate(f'{100 * bar_height / total:.1f}%', (bar_center, bar_height),
                          ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.show()

## App Distribution

In [None]:
# Horizontal bar chart of review counts per app, most-reviewed app first
apps_by_review_count = df['app_name'].value_counts().index

plt.figure(figsize=(12, 6))
app_count_ax = sns.countplot(y='app_name', data=df, palette='muted',
                             order=apps_by_review_count)
app_count_ax.set_title('Number of Reviews by App', fontsize=16)
app_count_ax.set_xlabel('Count', fontsize=14)
app_count_ax.set_ylabel('App', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: one cluster per rating, one bar per app
plt.figure(figsize=(12, 8))
rating_by_app_ax = sns.countplot(data=df, x='rating', hue='app_name', palette='Set2')
rating_by_app_ax.set_title('Ratings Distribution by App', fontsize=16)
rating_by_app_ax.set_xlabel('Rating', fontsize=14)
rating_by_app_ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
rating_by_app_ax.legend(title='App Name', fontsize=12, title_fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Mean rating per app, ordered from best-rated to worst-rated
mean_rating_per_app = df.groupby('app_name')['rating'].mean()
app_avg_rating = mean_rating_per_app.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
app_avg_rating.plot.bar(color='teal')
plt.title('Average Rating by App', fontsize=16)
plt.xlabel('App', fontsize=14)
plt.ylabel('Average Rating', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)

# Print each mean slightly above its bar (bars sit at integer x positions)
for bar_position, mean_rating in enumerate(app_avg_rating):
    plt.text(bar_position, mean_rating + 0.05, f'{mean_rating:.2f}',
             ha='center', fontsize=12)

# Leave headroom above the tallest bar so the value labels are not clipped
plt.ylim(0, 5.5)
plt.tight_layout()
plt.show()

## Text Analysis

In [None]:
# Review length distribution
# Derive both metrics from the same normalised text. A missing review counts
# as empty text: plain len() would raise TypeError on a NaN float, and
# str(NaN) would otherwise be counted as the one-word review "nan".
review_text_filled = df['review_text'].fillna('').astype(str)
df['review_length'] = review_text_filled.str.len()
df['word_count'] = review_text_filled.str.split().str.len()

plt.figure(figsize=(12, 6))

# (column, bins, colour, title, x-axis label) for each side-by-side histogram
histogram_panels = [
    ('review_length', 50, 'skyblue', 'Review Length Distribution', 'Character Count'),
    ('word_count', 30, 'lightgreen', 'Word Count Distribution', 'Word Count'),
]
for panel_number, (column, n_bins, colour, panel_title, x_label) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.histplot(df[column], bins=n_bins, kde=True, color=colour)
    plt.title(panel_title, fontsize=14)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel('Frequency', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Box plot comparing review word counts across sentiment categories
plt.figure(figsize=(10, 6))
word_count_ax = sns.boxplot(data=df, x='sentiment', y='word_count', palette='RdYlGn')
word_count_ax.set_title('Review Word Count by Sentiment', fontsize=16)
word_count_ax.set_xlabel('Sentiment', fontsize=14)
word_count_ax.set_ylabel('Word Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Function for text preprocessing
_ENGLISH_STOP_WORDS = None  # filled on first use so the corpus is read only once


def _english_stop_words():
    """Return NLTK's English stopword list as a cached frozenset."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise a review and return its non-stopword tokens.

    Steps: lowercase, strip HTML-like tags, strip punctuation and digits,
    tokenize with NLTK's ``word_tokenize``, then drop English stopwords.

    Args:
        text: The review as a ``str``.

    Returns:
        A list of lowercase tokens in their original order.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace HTML tags with a space; deleting them outright would fuse the
    # words on either side ("nice<br>app" -> "niceapp").
    text = re.sub(r'<.*?>', ' ', text)

    # Replace special characters with a space for the same reason
    # ("great!app", "user-friendly"). This also splits contractions into
    # pieces ("don't" -> "don t") that the stopword list contains, whereas
    # the fused form "dont" would survive the stopword filter.
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Get all words from reviews
# Missing reviews are skipped: str(NaN) is the literal text "nan", which
# would otherwise be counted as a real word.
all_words = []
for review in df['review_text'].dropna():
    all_words.extend(preprocess_text(str(review)))

# Count word frequency
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(20)

In [None]:
# Horizontal bar chart of the most frequent words across all reviews
plt.figure(figsize=(12, 8))

# Split the (word, count) pairs into two parallel sequences
words, counts = zip(*most_common_words)

common_words_ax = sns.barplot(x=list(counts), y=list(words), palette='viridis')
common_words_ax.set_title('Most Common Words in Reviews', fontsize=16)
common_words_ax.set_xlabel('Count', fontsize=14)
common_words_ax.set_ylabel('Word', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Get common words by sentiment
def get_sentiment_words(sentiment, top_n=15):
    """Return the most frequent preprocessed words for one sentiment class.

    Args:
        sentiment: Value to match against ``df['sentiment']``.
        top_n: How many (word, count) pairs to return; defaults to 15.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    reviews = df.loc[df['sentiment'] == sentiment, 'review_text']

    token_counter = Counter()
    # Skip missing reviews: str(NaN) is the literal text "nan" and would be
    # counted as a word.
    for review in reviews.dropna():
        token_counter.update(preprocess_text(str(review)))

    return token_counter.most_common(top_n)

# Top words for each sentiment class, computed in Positive/Neutral/Negative order
positive_words, neutral_words, negative_words = [
    get_sentiment_words(label) for label in ('Positive', 'Neutral', 'Negative')
]

In [None]:
# Side-by-side bar charts of the top words in each sentiment class
fig, axes = plt.subplots(1, 3, figsize=(20, 8))

# (word/count pairs, bar colour, panel title, y-axis label); only the
# left-most panel carries a y-axis label
sentiment_panels = [
    (positive_words, 'green', 'Positive Reviews', 'Word'),
    (neutral_words, 'gold', 'Neutral Reviews', ''),
    (negative_words, 'red', 'Negative Reviews', ''),
]
for panel_ax, (word_freqs, bar_colour, panel_title, y_label) in zip(axes, sentiment_panels):
    words, counts = zip(*word_freqs)
    sns.barplot(x=list(counts), y=list(words), ax=panel_ax, color=bar_colour)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Count', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)

plt.tight_layout()
plt.show()

## Example Reviews

In [None]:
# Display example reviews for each sentiment
for sentiment in ['Positive', 'Neutral', 'Negative']:
    print(f"\n{sentiment} Review Examples:")
    sentiment_reviews = df.loc[df['sentiment'] == sentiment, 'review_text']

    # sample() raises ValueError when asked for more rows than exist, so cap
    # the request for sparse classes; the fixed seed keeps the printed
    # examples stable across re-runs.
    n_examples = min(3, len(sentiment_reviews))
    examples = sentiment_reviews.sample(n=n_examples, random_state=42).values

    for i, example in enumerate(examples):
        print(f"{i+1}. {example}")

## Conclusions

Based on this exploratory analysis, we can observe:

1. **Class Imbalance**: There is a significant class imbalance with positive reviews dominating the dataset.

2. **Review Length**: Review length, measured as word count, varies by sentiment, with negative reviews typically being longer than positive ones.

3. **Common Words**: The vocabulary differs across sentiment categories, with positive reviews focusing on words like "great", "love", and "helpful", while negative reviews contain words like "terrible", "waste", and "poor".

4. **App Differences**: Different apps have varying rating distributions and average ratings.

These insights will inform our approach to implementing sentiment classification models.