# MSMARCO Data Exploration

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import sys
import os

# Set display options
# Show every column, cap printed rows at 50, and widen the output so wide
# frames are not wrapped.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)

# Set plotting style
# The seaborn theme is applied after the matplotlib style sheet.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Download NLTK resources
# word_tokenize() and stopwords.words() need these corpora at run time.
# Recent NLTK releases (3.9+) load the tokenizer tables from 'punkt_tab'
# rather than 'punkt', so both are requested to work across versions.
# NOTE(review): on an older NLTK the 'punkt_tab' id should simply be reported
# as unavailable by the downloader - confirm on the pinned version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Add project root directory to path
# Assumes the notebook runs from a directory one level below the project root.
sys.path.append(os.path.abspath('../'))

## 1. Data Loading

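First, we load the social media short text dataset. We assume it is stored as `social_media_dataset.csv` in the project's `data` directory; if the file is not found, a synthetic sample dataset is generated so the rest of the notebook can still run.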

In [None]:
# Import data processing functions from src module
from src.data_preparation import load_dataset, preprocess_text

# Load dataset
# Note: Modify the data path according to actual situation
try:
    data = load_dataset('../data/social_media_dataset.csv')
    print(f"Successfully loaded dataset with {len(data)} records")
except FileNotFoundError:
    print("Data file does not exist, please ensure the data file is placed in the correct location")
    # Create a sample dataset for demonstration: 1000 synthetic posts, one per
    # hour, each carrying the same two hashtags.
    data = pd.DataFrame({
        'id': range(1000),
        'text': [
            f"This is a sample social media post #{i} #example #data"
            for i in range(1000)
        ],
        # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
        # deprecated since pandas 2.2 and emits a FutureWarning.
        'timestamp': pd.date_range(start='2023-01-01', periods=1000, freq='h'),
        # randint's upper bound is exclusive: user ids fall in 1..99.
        'user_id': np.random.randint(1, 100, 1000),
        'likes': np.random.randint(0, 1000, 1000),
        'shares': np.random.randint(0, 200, 1000)
    })
    print("Created sample dataset for demonstration")

# Display basic information about the dataset
print("\nDataset basic information:")
data.info()

# Display the first few rows of the dataset
# (left as the cell's final expression so the notebook renders it as a table)
print("\nFirst 5 rows of the dataset:")
data.head()

## 2. Basic Statistical Analysis

Next, we perform basic statistical analysis on the dataset to understand the distribution of the data.

In [None]:
# Basic statistics
print("Dataset descriptive statistics:")
# Printed explicitly: a notebook only renders a cell's final expression, so a
# bare describe() call in the middle of the cell would be silently discarded.
print(data.describe(include='all'))

# Check for missing values
# Per-column count of nulls and the same figure as a percentage of all rows.
print("\nMissing values statistics:")
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})
# Final expression of the cell, so it is rendered as a table.
missing_df

## 3. Text Length Analysis

Analyze the distribution of text lengths to understand the characteristics of social media short texts.

In [None]:
# Derive two length measures per post: raw character count and token count
def _count_words(post):
    """Return the number of tokens word_tokenize() finds in ``post``."""
    return len(word_tokenize(post))

posts = data['text']
data['text_length'] = posts.apply(len)
data['word_count'] = posts.apply(_count_words)

# Summary statistics for both measures
length_columns = ['text_length', 'word_count']
print("Text length statistics:")
print(data[length_columns].describe())

# Side-by-side histograms (with KDE overlay), one panel per measure
histogram_panels = [
    ('text_length', 'Text Character Length Distribution', 'Number of Characters'),
    ('word_count', 'Text Word Count Distribution', 'Number of Words'),
]

plt.figure(figsize=(12, 6))
for panel_number, (column, panel_title, x_label) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.histplot(data[column], kde=True)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. Text Content Analysis

Analyze text content, including common vocabulary, topic distribution, etc.

In [None]:
# Text preprocessing
stop_words = set(stopwords.words('english'))

# Patterns are compiled once here instead of being looked up on every call.
_URL_PATTERN = re.compile(r'http\S+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# Punctuation is stripped from the text *before* stop-word filtering, so a
# contraction such as "don't" reaches the filter as "dont". The stop list is
# normalised the same way, otherwise those contractions would slip through.
_normalized_stop_words = stop_words | {
    _PUNCTUATION_PATTERN.sub('', word) for word in stop_words
}

def preprocess_for_analysis(text):
    """Turn one raw post into a list of lowercase content tokens.

    Steps, in order: lowercase, drop ``http...`` URLs, drop every character
    that is neither a word character nor whitespace, tokenize with
    ``word_tokenize``, and remove English stop words.

    Args:
        text: The raw post as a string.

    Returns:
        The remaining tokens, in their original order.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs (must happen before punctuation removal, which would
    # otherwise break the URL into word fragments)
    text = _URL_PATTERN.sub('', text)
    # Remove special characters; underscores survive because \w includes them
    text = _PUNCTUATION_PATTERN.sub('', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    return [word for word in tokens if word not in _normalized_stop_words]

# Tokenise and clean every post
data['processed_tokens'] = data['text'].apply(preprocess_for_analysis)

# Flatten the per-post token lists into one corpus-wide list and tally it
all_words = []
for post_tokens in data['processed_tokens']:
    all_words.extend(post_tokens)
word_freq = Counter(all_words)

# Report the 20 most frequent tokens as (word, count) pairs
top_word_pairs = word_freq.most_common(20)
print("Top 20 most common words:")
print(top_word_pairs)

# Bar chart of the same ranking
top_words = dict(top_word_pairs)
bar_labels = list(top_words)
bar_heights = list(top_words.values())

plt.figure(figsize=(12, 6))
sns.barplot(x=bar_labels, y=bar_heights)
plt.title('Top 20 Most Common Words')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # tilt labels so long words do not overlap
plt.tight_layout()
plt.show()

## 5. Topic Tag Analysis

Analyze topic tags in social media text (such as hashtags on Twitter).

In [None]:
# Extract topic tags (hashtags)
def extract_hashtags(text):
    """Return the hashtags in ``text``, lowercased and without the ``#``.

    A ``#`` only starts a hashtag when it is not glued to a preceding word
    character, ``&`` or ``/``. That keeps URL fragments
    (``example.com/page#section``) and HTML entities (``&#39;``) from being
    counted as topic tags.

    Args:
        text: The raw post as a string.

    Returns:
        A list of tag names in order of appearance (duplicates kept).
    """
    return re.findall(r'(?<![\w&/])#(\w+)', text.lower())

# One list of hashtags per post
data['hashtags'] = data['text'].apply(extract_hashtags)

# Count tag frequency across the whole corpus
all_hashtags = []
for post_tags in data['hashtags']:
    all_hashtags.extend(post_tags)
hashtag_freq = Counter(all_hashtags)

# Display most common tags as (tag, count) pairs
top_tag_pairs = hashtag_freq.most_common(20)
print("Top 20 most common topic tags:")
print(top_tag_pairs)

# Plot tag frequency
top_hashtags = dict(top_tag_pairs)
tag_labels = list(top_hashtags)
tag_counts = list(top_hashtags.values())

plt.figure(figsize=(12, 6))
sns.barplot(x=tag_labels, y=tag_counts)
plt.title('Top 20 Most Common Topic Tags')
plt.xlabel('Tag')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # tilt labels so long tags do not overlap
plt.tight_layout()
plt.show()

## 6. Time Distribution Analysis

Analyze the time distribution characteristics of social media text.

In [None]:
# Extract time features
# Coerce to datetime first: the .dt accessor only works on datetime-like
# values, and a column read from a file may still hold strings. A column that
# is already datetime passes through unchanged.
timestamps = pd.to_datetime(data['timestamp'])
data['date'] = timestamps.dt.date
data['hour'] = timestamps.dt.hour
data['day_of_week'] = timestamps.dt.day_name()

# Count texts by date
date_counts = data.groupby('date').size()

# Count texts by hour
hour_counts = data.groupby('hour').size()

# Count texts by day of week
# Reindex into calendar order (groupby would sort the names alphabetically);
# weekdays with no posts get an explicit 0 instead of NaN.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = data.groupby('day_of_week').size().reindex(day_order, fill_value=0)

# Plot time distribution: one row of three panels (date, hour, weekday)
plt.figure(figsize=(18, 6))

plt.subplot(1, 3, 1)
date_counts.plot()
plt.title('Text Count by Date')
plt.xlabel('Date')
plt.ylabel('Text Count')

plt.subplot(1, 3, 2)
hour_counts.plot(kind='bar')
plt.title('Text Count by Hour')
plt.xlabel('Hour')
plt.ylabel('Text Count')

plt.subplot(1, 3, 3)
day_counts.plot(kind='bar')
plt.title('Text Count by Day of Week')
plt.xlabel('Day')
plt.ylabel('Text Count')

plt.tight_layout()
plt.show()

## 7. User Activity Analysis

Analyze user activity and engagement.

In [None]:
# Group once by author; both rankings below are derived from this grouping
posts_by_user = data.groupby('user_id')

# Number of posts per user, busiest first
user_post_counts = posts_by_user.size().sort_values(ascending=False)

# Mean likes and shares per user, ranked by mean likes
mean_engagement = posts_by_user.agg({'likes': 'mean', 'shares': 'mean'})
user_engagement = mean_engagement.sort_values(by='likes', ascending=False)

# Show the head of each ranking
print("Top 10 users with most posts:")
print(user_post_counts.head(10))

print("\nTop 10 users with highest average likes:")
print(user_engagement.head(10))

# Two panels: how post counts are spread over users, and likes vs. shares
plt.figure(figsize=(12, 6))

# Left panel: histogram of posts-per-user
plt.subplot(1, 2, 1)
posts_per_user = user_post_counts.values
sns.histplot(posts_per_user, kde=True)
plt.title('User Post Count Distribution')
plt.xlabel('Post Count')
plt.ylabel('User Count')

# Right panel: one point per user
plt.subplot(1, 2, 2)
average_likes = user_engagement['likes']
average_shares = user_engagement['shares']
plt.scatter(average_likes, average_shares, alpha=0.5)
plt.title('Relationship Between Average Likes and Shares per User')
plt.xlabel('Average Likes')
plt.ylabel('Average Shares')

plt.tight_layout()
plt.show()

## 8. Text Similarity Analysis

Vectorize text using TF-IDF and calculate similarity between texts.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Vectorize text using TF-IDF
# Vocabulary is capped at the 5000 highest-frequency terms, English stop words removed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Randomly select a few texts to calculate similarity
# Indices are positional, so they address both data.iloc and the matrix rows.
sample_size = min(5, len(data))
sample_indices = random.sample(range(len(data)), sample_size)
sample_texts = data.iloc[sample_indices]['text'].tolist()

print("\nSample texts:")
for i, text in enumerate(sample_texts, start=1):
    # Only mark the preview with an ellipsis when something was actually cut.
    preview = text if len(text) <= 100 else f"{text[:100]}..."
    print(f"Text {i}: {preview}")

# Calculate similarity between sample texts
# The sampled posts were already vectorized by fit_transform above, so their
# rows are reused instead of transforming the same strings a second time.
sample_vectors = tfidf_matrix[sample_indices]
similarity_matrix = cosine_similarity(sample_vectors)

print("\nSimilarity matrix:")
sample_labels = [f"Text {i}" for i in range(1, sample_size + 1)]
similarity_df = pd.DataFrame(similarity_matrix,
                             index=sample_labels,
                             columns=sample_labels)
# Final expression of the cell, so it is rendered as a table.
similarity_df

## 9. Special Feature Analysis of Social Media Text

Analyze special features of social media text, such as emojis, @mentions, etc.

In [None]:
# Extract @mentions
def extract_mentions(text):
    """Return the @mentioned user names in ``text``, lowercased, without ``@``.

    An ``@`` only starts a mention when it is not directly preceded by a word
    character, so the domain of an e-mail address (``bob@example.com``) is not
    reported as a mention.

    Args:
        text: The raw post as a string.

    Returns:
        A list of user names in order of appearance (duplicates kept).
    """
    return re.findall(r'(?<!\w)@(\w+)', text.lower())

# Detect emojis (simplified version, only detects some common emojis)
# Compiled once at definition time rather than on every call.
# The symbol ranges are listed individually on purpose: a single span from
# U+24C2 to U+1F251 would also cover CJK ideographs, Kana, Hangul and many
# other scripts, flagging ordinary non-Latin text as "emoji".
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric symbols
    "\U0001F800-\U0001F8FF"  # supplemental arrows
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # extended symbols
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, regional indicators
    "]")

def contains_emoji(text):
    """Return True if ``text`` contains at least one character from the emoji ranges.

    Args:
        text: The raw post as a string.

    Returns:
        ``True`` when any character falls in one of the listed symbol ranges,
        ``False`` otherwise (including for the empty string).
    """
    return bool(_EMOJI_PATTERN.search(text))

# Derive one column per special feature
posts = data['text']
data['mentions'] = posts.apply(extract_mentions)
data['has_emoji'] = posts.apply(contains_emoji)
data['has_url'] = posts.apply(lambda post: re.search(r'http\S+', post) is not None)
data['has_hashtag'] = data['hashtags'].apply(lambda tags: len(tags) > 0)

# Share of posts showing each feature (the mean of a boolean column is the
# fraction of True values)
has_mention = data['mentions'].apply(lambda names: len(names) > 0)
special_features = {
    'Texts with Emojis': data['has_emoji'].mean(),
    'Texts with URLs': data['has_url'].mean(),
    'Texts with Hashtags': data['has_hashtag'].mean(),
    'Texts with @mentions': has_mention.mean()
}

print("Social media text special feature statistics:")
for feature_name in special_features:
    share = special_features[feature_name]
    print(f"{feature_name}: {share:.2%}")

# Bar chart of the same proportions
feature_names = list(special_features)
feature_shares = list(special_features.values())

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_names, y=feature_shares)
plt.title('Social Media Text Special Feature Distribution')
plt.xlabel('Feature')
plt.ylabel('Proportion')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 10. Data Quality Analysis

Analyze data quality issues, such as noise, duplicates, etc.

In [None]:
total_rows = len(data)

# Rows whose text repeats an earlier row
duplicate_count = data.duplicated(subset=['text']).sum()
print(f"Number of duplicate texts: {duplicate_count} ({duplicate_count / total_rows:.2%})")

# Posts under 10 characters (possibly noise)
is_very_short = data['text_length'] < 10
very_short_texts = data[is_very_short]
short_count = len(very_short_texts)
print(f"Number of very short texts (length<10): {short_count} ({short_count / total_rows:.2%})")

# Posts written entirely in upper case (possibly spam or emphasis)
is_all_caps = data['text'].apply(lambda post: post.isupper())
all_caps_texts = data[is_all_caps]
caps_count = len(all_caps_texts)
print(f"Number of all-caps texts: {caps_count} ({caps_count / total_rows:.2%})")

# Show a few of the very short posts, if there are any
if short_count:
    print("\nExamples of very short texts:")
    print(very_short_texts['text'].head())

### Data Exploration Summary

Through the exploration of the social media short text dataset, we have discovered the following characteristics:

1. **Text Length Characteristics**: Social media texts are typically short, with average character and word counts of X and Y respectively.
2. **Common Vocabulary**: The most common words include...
3. **Topic Tags**: The most popular topic tags include...
4. **Time Distribution**: Text publication times show clear temporal patterns, such as...
5. **User Activity**: User activity is unevenly distributed, with a small number of users contributing a large amount of content.
6. **Special Features**: Social media texts contain many special features, such as emojis, URLs, @mentions, etc.
7. **Data Quality**: There is a certain percentage of duplicate texts and very short texts that need to be cleaned.