# Exploratory Data Analysis - Fitness App Reviews (Simplified)

This simplified notebook explores the fitness app reviews dataset using only pandas, NumPy, matplotlib, and the Python standard library, so no complex dependencies are required.

In [None]:
# Import only essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re

# Notebook-wide plotting defaults: 10x6 figure size and the 'ggplot' style sheet
plt.rc('figure', figsize=(10, 6))
plt.style.use('ggplot')

In [None]:
# Read the reviews CSV into `df`.
# The ../data location is tried first; if the file is not found there, the
# same file name is read from the current working directory instead.
primary_path = '../data/fitness_app_reviews.csv'
fallback_path = 'fitness_app_reviews.csv'
try:
    df = pd.read_csv(primary_path)
except FileNotFoundError:
    df = pd.read_csv(fallback_path)

# Preview the first few rows
df.head()

In [None]:
# Dataset overview: dimensions, column names, and null counts per column
print(f"Dataset shape: {df.shape}")

print("\nDataset columns:")
for col in df.columns:
    print(f"- {col}")

# Number of null entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

## Rating Distribution

In [None]:
# Bar chart of how many reviews received each rating, ordered by rating value
plt.figure()
rating_counts = df['rating'].value_counts().sort_index()
ax = rating_counts.plot.bar(color='skyblue')
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')

# Annotate each bar with its share of all rows, drawn just above the bar top
total = len(df)
for i, count in enumerate(rating_counts):
    ax.text(i, count + 10, f'{100 * count / total:.1f}%', ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Map ratings to sentiment categories
def get_sentiment(rating):
    """Translate a numeric rating into a sentiment label.

    Ratings of 4 or more are 'Positive'; ratings of at least 2 but below 4
    are 'Neutral'; any value that fails both comparisons (including NaN) is
    'Negative'.
    """
    if rating >= 4:
        return 'Positive'
    if rating >= 2:
        return 'Neutral'
    return 'Negative'

# Derive a sentiment label for every review from its rating
df['sentiment'] = df['rating'].apply(get_sentiment)

# Bar chart of review counts per sentiment, each bar in its sentiment's color
plt.figure()
sentiment_counts = df['sentiment'].value_counts()
colors = {'Positive': 'green', 'Neutral': 'gold', 'Negative': 'red'}
bar_colors = [colors[label] for label in sentiment_counts.index]
ax = sentiment_counts.plot.bar(color=bar_colors)
ax.set_title('Distribution of Sentiment Categories')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')

# Annotate each bar with its share of all rows, drawn just above the bar top
total = len(df)
for i, count in enumerate(sentiment_counts):
    ax.text(i, count + 10, f'{100 * count / total:.1f}%', ha='center')

plt.tight_layout()
plt.show()

## Average Rating by App

In [None]:
# Mean rating for each app, highest-rated app first
ratings_by_app = df.groupby('app_name')['rating']
app_avg_rating = ratings_by_app.mean().sort_values(ascending=False)

plt.figure()
ax = app_avg_rating.plot.bar(color='teal')
ax.set_title('Average Rating by App')
ax.set_xlabel('App')
ax.set_ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')

# Write each mean (two decimals) slightly above its bar
for i, v in enumerate(app_avg_rating):
    ax.text(i, v + 0.05, f'{v:.2f}', ha='center')

# Fixed y-range of 0 to 5.5
ax.set_ylim(0, 5.5)
plt.tight_layout()
plt.show()

## Text Analysis

In [None]:
# Review length distribution
# Normalize the text once so both metrics are computed from the same values:
# missing reviews become empty strings (length 0, zero words). Calling len()
# directly on a missing value raises TypeError, and str() of a missing value
# would be counted as the one-word review "nan".
review_text = df['review_text'].fillna('').astype(str)
df['review_length'] = review_text.str.len()
# str.split() with no separator splits on runs of whitespace
df['word_count'] = review_text.str.split().str.len()

# Two histograms side by side: characters per review and words per review
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(df['review_length'], bins=30, color='skyblue', alpha=0.7)
plt.title('Review Length Distribution')
plt.xlabel('Character Count')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(df['word_count'], bins=30, color='lightgreen', alpha=0.7)
plt.title('Word Count Distribution')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Review length by sentiment (boxplot)
plt.figure()
sentiment_order = ['Negative', 'Neutral', 'Positive']
# One Series of word counts per sentiment, in display order
boxplot_data = [df.loc[df['sentiment'] == sentiment, 'word_count'] for sentiment in sentiment_order]
plt.boxplot(boxplot_data)
# Name the boxes through the tick labels rather than boxplot's `labels`
# keyword, which newer Matplotlib releases deprecate in favor of
# `tick_labels`. Boxes are drawn at x = 1..N by default, so this works on
# both old and new versions.
plt.xticks(list(range(1, len(sentiment_order) + 1)), sentiment_order)
plt.title('Review Word Count by Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Word Count')
plt.tight_layout()
plt.show()

In [None]:
# Simple text preprocessing
# Common English stopwords, built once instead of on every call.
_STOPWORDS = frozenset((
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
    'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
    'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
    'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
))

# Matches every character to discard: anything that is neither a word
# character nor whitespace, plus digits. Underscores are word characters and
# are therefore kept.
_NON_WORD_OR_DIGIT = re.compile(r'[^\w\s]|\d')


def preprocess_text(text):
    """Lowercase, strip punctuation and digits, tokenize, and drop stopwords.

    Args:
        text: The review text. None and float NaN are treated as "no text";
            any other non-string value is converted with str() first.

    Returns:
        A list of lowercase tokens (split on whitespace) with the stopwords
        removed. Empty for empty or missing input.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return []

    cleaned = _NON_WORD_OR_DIGIT.sub('', str(text).lower())
    return [word for word in cleaned.split() if word not in _STOPWORDS]

# Get all words from reviews (limited to a sample for performance)
# Missing reviews are dropped first so they are not tokenized as the literal
# word "nan", and the sample is seeded so the word counts (and the chart built
# from them) are the same on every run.
reviews = df['review_text'].dropna()
sample_size = min(1000, len(reviews))
all_words = []
for review in reviews.sample(sample_size, random_state=42):
    all_words.extend(preprocess_text(str(review)))

# Count word frequency and keep the 15 most frequent (word, count) pairs
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(15)

In [None]:
# Horizontal bar chart of the (word, count) pairs held in most_common_words
plt.figure()
words, counts = zip(*most_common_words)
positions = range(len(words))
ax = plt.gca()
ax.barh(positions, counts, color='purple', alpha=0.7)
# One tick per bar, labelled with the bar's word
ax.set_yticks(positions)
ax.set_yticklabels(words)
ax.set_title('Most Common Words in Reviews')
ax.set_xlabel('Count')
ax.set_ylabel('Word')
plt.tight_layout()
plt.show()

In [None]:
# Get common words by sentiment (with sampling for better performance)
def get_sentiment_words(sentiment, sample_size=500, random_state=None, top_n=10):
    """Return the most frequent preprocessed words among reviews of one sentiment.

    Reads the module-level ``df`` and tokenizes with ``preprocess_text``.

    Args:
        sentiment: Label compared against ``df['sentiment']``.
        sample_size: Maximum number of reviews to tokenize; a larger group is
            randomly sampled down to this many reviews.
        random_state: Seed passed to pandas' ``sample`` for a reproducible
            draw. ``None`` (the default) draws a different sample each call.
        top_n: Number of (word, count) pairs to return.

    Returns:
        A list of up to ``top_n`` (word, count) tuples, most frequent first.
    """
    # Skip missing reviews so they are not tokenized as the literal word "nan"
    reviews = df.loc[df['sentiment'] == sentiment, 'review_text'].dropna()
    # Take a sample if there are many reviews
    if len(reviews) > sample_size:
        reviews = reviews.sample(sample_size, random_state=random_state)

    word_counts = Counter()
    for review in reviews:
        word_counts.update(preprocess_text(str(review)))

    return word_counts.most_common(top_n)

# Get the most common words for each sentiment.
# A fixed seed keeps the sampled reviews, and therefore the word lists,
# identical from run to run.
positive_words = get_sentiment_words('Positive', random_state=42)
neutral_words = get_sentiment_words('Neutral', random_state=42)
negative_words = get_sentiment_words('Negative', random_state=42)

In [None]:
# Plot common words by sentiment as three side-by-side panels of one figure
plt.figure(figsize=(15, 5))

# Split each list of (word, count) pairs into parallel tuples
pos_words, pos_counts = zip(*positive_words)
neu_words, neu_counts = zip(*neutral_words)
neg_words, neg_counts = zip(*negative_words)

# (panel title, words, counts, bar color), in left-to-right order
panels = [
    ('Positive Reviews', pos_words, pos_counts, 'green'),
    ('Neutral Reviews', neu_words, neu_counts, 'gold'),
    ('Negative Reviews', neg_words, neg_counts, 'red'),
]
for panel_number, (panel_title, panel_words, panel_counts, bar_color) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_number)
    positions = range(len(panel_words))
    plt.barh(positions, panel_counts, color=bar_color, alpha=0.7)
    plt.yticks(positions, panel_words)
    plt.title(panel_title)
    plt.xlabel('Count')

plt.tight_layout()
plt.show()

## Example Reviews

In [None]:
# Print up to three randomly chosen reviews for each sentiment label
for sentiment in ['Positive', 'Neutral', 'Negative']:
    print(f"\n{sentiment} Review Examples:")
    subset = df[df['sentiment'] == sentiment]
    # Never request more rows than the group holds
    sample_size = min(3, len(subset))
    examples = subset.sample(sample_size)['review_text'].values

    for number, example in enumerate(examples, start=1):
        print(f"{number}. {example}")

## Conclusions

Based on this exploratory analysis, we can observe:

1. **Class Imbalance**: There is a significant class imbalance with positive reviews dominating the dataset.

2. **Review Length**: The average review length varies by sentiment, with negative reviews typically being longer than positive ones.

3. **Common Words**: The vocabulary differs across sentiment categories, with positive reviews focusing on words like "great", "love", and "helpful", while negative reviews contain words like "terrible", "waste", and "poor".

4. **App Differences**: Average ratings differ from app to app, as shown in the per-app bar chart above; per-app rating distributions were not examined in this notebook.

These insights will inform our approach to implementing sentiment classification models.