# In [None]:
"""
# Twitter Sentiment Analysis - Data Preprocessing Notebook

## 1. Import Libraries
"""

import re
from pathlib import Path

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK data
# 'punkt' serves older NLTK releases; newer releases make word_tokenize load
# the 'punkt_tab' resource instead, so both are fetched to keep the notebook
# working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

"""
## 2. Load Data
"""

# Read the raw tweet export; the rest of the notebook relies on its
# 'clean_text' (tweet body) and 'category' (label) columns.
df = pd.read_csv('../Twitter_Data.csv')
print("Original dataset shape:", df.shape)

"""
## 3. Data Cleaning
"""

# Remove duplicates
initial_count = len(df)
df = df.drop_duplicates()
print(f"Removed {initial_count - len(df)} duplicate rows")

# Handle missing values
# Only the columns this pipeline actually reads decide whether a row is
# unusable; a NaN in some unrelated column should not discard the tweet.
df = df.dropna(subset=['clean_text', 'category'])
print(f"After handling missing values: {len(df)} rows")

"""
## 4. Text Preprocessing Functions
"""

class TextPreprocessor:
    """Turn raw tweet text into a space-joined string of lemmatized tokens.

    The English stop-word set and the WordNet lemmatizer are created once in
    ``__init__`` and reused for every call to :meth:`clean_text`.
    """

    # Compiled once at class creation so per-row calls skip pattern lookup.
    _URL_PATTERN = re.compile(r'http\S+|www\.\S+')
    _MENTION_PATTERN = re.compile(r'@\w+')
    _HASHTAG_PATTERN = re.compile(r'#\w+')
    # `\w` also matches "_", so the underscore is listed explicitly.
    _SPECIAL_CHAR_PATTERN = re.compile(r'[^\w\s]|_')
    _DIGIT_PATTERN = re.compile(r'\d+')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Clean a single document.

        Steps, in order: lowercase; strip URLs (``http...`` and ``www....``),
        ``@mentions`` and ``#hashtags``; replace punctuation/underscores with
        spaces; strip digits; tokenize; drop stop words and tokens of two
        characters or fewer; lemmatize.

        Args:
            text: The raw document. Anything that is not a ``str`` (e.g. NaN)
                is treated as empty.

        Returns:
            The surviving lemmatized tokens joined by single spaces, or ``""``
            when the input is not a string or nothing survives cleaning.
        """
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove URLs, user mentions and hashtags (the hashtag word goes too)
        text = self._URL_PATTERN.sub('', text)
        text = self._MENTION_PATTERN.sub('', text)
        text = self._HASHTAG_PATTERN.sub('', text)

        # Replace special characters with a space rather than deleting them:
        # deleting glues neighbours together ("good,bad" -> "goodbad") and
        # turns contractions into new tokens ("don't" -> "dont") that slip
        # past the stop-word filter. With a space they split into pieces the
        # stop-word / length filter below can discard.
        text = self._SPECIAL_CHAR_PATTERN.sub(' ', text)
        text = self._DIGIT_PATTERN.sub('', text)

        # Nothing left to tokenize: skip the tokenizer entirely.
        if not text.strip():
            return ""

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and short tokens, then lemmatize what remains
        stop_words = self.stop_words
        lemmatize = self.lemmatizer.lemmatize
        lemmas = [
            lemmatize(token)
            for token in tokens
            if token not in stop_words and len(token) > 2
        ]

        return ' '.join(lemmas)

"""
## 5. Apply Text Preprocessing
"""

preprocessor = TextPreprocessor()

print("Starting text preprocessing...")
df['cleaned_text'] = df['clean_text'].apply(preprocessor.clean_text)

# Tweets made up only of URLs, mentions, stop words, etc. clean down to "".
# Such rows contribute no features, and an empty CSV field is read back by
# pandas as NaN, so they are removed here instead of being saved.
empty_mask = df['cleaned_text'].str.len() == 0
if empty_mask.any():
    print(f"Dropping {int(empty_mask.sum())} rows that are empty after cleaning")
    # .copy() so later column assignments act on an independent frame.
    df = df.loc[~empty_mask].copy()

# Check results
print("\nOriginal vs Cleaned Text Examples:")
# head(3) yields at most three rows, so a very small dataset cannot raise
# an IndexError here.
for original_text, cleaned_text in zip(df['clean_text'].head(3), df['cleaned_text'].head(3)):
    print(f"Original: {original_text[:100]}...")
    print(f"Cleaned: {cleaned_text[:100]}...")
    print("-" * 50)

"""
## 6. Text Length Analysis After Cleaning
"""

# Character count of every cleaned tweet, kept as a column for plotting.
df['cleaned_text_length'] = df['cleaned_text'].str.len()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: overlay the length histograms of the raw and cleaned text.
for lengths, legend_label, bar_color in (
    (df['clean_text'].str.len(), 'Original', 'blue'),
    (df['cleaned_text_length'], 'Cleaned', 'red'),
):
    axes[0].hist(lengths, bins=50, alpha=0.7, label=legend_label, color=bar_color)
axes[0].set(
    title='Text Length Distribution: Before vs After Cleaning',
    xlabel='Text Length',
    ylabel='Frequency',
)
axes[0].legend()

# Right panel: cleaned length grouped by the 'category' label.
sns.boxplot(data=df, x='category', y='cleaned_text_length', ax=axes[1])
axes[1].set_title('Cleaned Text Length by Sentiment')

plt.tight_layout()
plt.show()

"""
## 7. Feature Extraction
"""

# Hold out the test rows BEFORE fitting any vectorizer. Fitting on the whole
# corpus would let the test tweets influence the vocabulary, the min_df/max_df
# cut-offs and the IDF weights, i.e. leak test information into the features.
X = df['cleaned_text']
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Both vectorizers share one configuration so they cannot drift apart.
vectorizer_params = dict(
    max_features=5000,
    min_df=5,
    max_df=0.8,
    ngram_range=(1, 2),
    stop_words='english',
)

# TF-IDF Vectorization
print("Performing TF-IDF vectorization...")
tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)

# Learn vocabulary/IDF from training rows only, then project every row so
# X_tfidf still has one row per row of df.
tfidf_vectorizer.fit(X_train)
X_tfidf = tfidf_vectorizer.transform(df['cleaned_text'])
print(f"TF-IDF matrix shape: {X_tfidf.shape}")

# Count Vectorization
count_vectorizer = CountVectorizer(**vectorizer_params)

count_vectorizer.fit(X_train)
X_count = count_vectorizer.transform(df['cleaned_text'])
print(f"Count matrix shape: {X_count.shape}")

"""
## 8. Train-Test Split
"""

# The split itself is performed before the vectorizers are fitted (so they
# only ever see training rows); this section reports the result.
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training sentiment distribution:\n{y_train.value_counts()}")

"""
## 9. Save Processed Data
"""

# Save processed dataset
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

processed_df = df[['cleaned_text', 'category']]
processed_df.to_csv(output_dir / 'cleaned_twitter_data.csv', index=False)

# Save train/test splits
train_df = pd.DataFrame({'text': X_train, 'sentiment': y_train})
test_df = pd.DataFrame({'text': X_test, 'sentiment': y_test})

train_df.to_csv(output_dir / 'train_data.csv', index=False)
test_df.to_csv(output_dir / 'test_data.csv', index=False)

print("\nPreprocessing completed successfully!")
print(f"Processed data saved with {len(processed_df)} samples")