In [15]:
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Define a broader sentiment-based emoji mapping
emoji_sentiment_mapping = {
    "😃": "positive",
    "😊": "positive",
    "😁": "positive",
    "😄": "positive",
    "😍": "positive",
    "😎": "positive",
    "🥰": "positive",
    "❤️": "positive",
    "👍": "positive",
    
    "😔": "negative",
    "😞": "negative",
    "😟": "negative",
    "😢": "negative",
    "😠": "negative",
    "😡": "negative",
    "👎": "negative",
    
    "😐": "neutral",
    "😕": "neutral",
    "😶": "neutral",
    "😑": "neutral",
    
    "😂": "joyful",
    "🤣": "joyful",
    "😆": "joyful",
    
    "😒": "confused",
    "🙄": "confused",
    "😪": "confused",
    
    # Add more emojis and sentiment labels as needed
}

def convert_emoji_to_sentiment(emoji_char):
    return emoji_sentiment_mapping.get(emoji_char, "")

def clean_google_play_reviews(reviews):
    cleaned_reviews = []
    stop_words = set(stopwords.words('english'))
    
    for review in reviews:
        # Remove URLs
        review = re.sub(r'http\S+', '', review)
        
        # Extract emojis and their sentiment labels
        emojis = [c for c in review if c in emoji_sentiment_mapping]
        emoji_sentiments = [convert_emoji_to_sentiment(emoji) for emoji in emojis]
        review = ''.join([c for c in review if c not in emoji_sentiment_mapping])
        
        # Tokenize the review into words
        words = word_tokenize(review.lower())  # Convert to lowercase
        
        # Remove punctuation, special characters, and numbers
        words = [re.sub(r'[^a-zA-Z]', '', word) for word in words if word.isalpha()]
        
        # Remove stopwords
        words = [word for word in words if word not in stop_words]
        
        # Reconstruct the cleaned review
        cleaned_review = ' '.join(words)
        
        # Add emoji sentiments as words back to the cleaned review
        cleaned_review += ' ' + ' '.join(emoji_sentiments)
        
        # Remove extra spaces and append to the cleaned_reviews list
        cleaned_reviews.append(cleaned_review.strip())
    
    return cleaned_reviews

# Example usage:
dirty_reviews = [
    "Luv dis app ❤️ it's gr8! bt there r 2 many bugs 🐛🐜",
    "This app sux, don't download it. Waste of time 😡",
]

cleaned_reviews = clean_google_play_reviews(dirty_reviews)
for review in cleaned_reviews:
    print(review)


luv dis app bt r many bugs
app sux download waste time negative


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import re
import emoji
import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")

def extract_emojis(text):
    return ''.join(c for c in text if c in emoji.EMOJI_DATA)

def clean_google_play_reviews(reviews):
    cleaned_reviews = []
    stop_words = set(stopwords.words('english'))
    
    for review in reviews:
        # Remove URLs
        review = re.sub(r'http\S+', '', review)
        
        # Extract emojis and store them as key factors
        emojis = extract_emojis(review)
        key_factors = emojis.replace(":", " ")  # Replace ':' in emoji names with spaces
        review = re.sub(r':\S+:', '', review)  # Remove emojis
        
        # Tokenize the review into words
        words = review.split()
        
        # Join the words back into a sentence
        review = ' '.join(words)
        
        # Remove punctuation, special characters, and numbers
        review = re.sub(r'[^a-zA-Z\s]', '', review)
        
        # Remove stopwords and perform lemmatization
        doc = nlp(review)
        words = [token.lemma_ for token in doc if token.lemma_ not in stop_words]
        
        # Reconstruct the cleaned review
        cleaned_review = ' '.join(words)
        
        # Add key factors back to the cleaned review
        cleaned_review += ' ' + key_factors
        
        # Remove extra spaces and append to the cleaned_reviews list
        cleaned_reviews.append(cleaned_review.strip())
    
    return cleaned_reviews

# Example usage:
dirty_reviews = [
    "Luv dis app ❤️ it's gr8! bt there r 2 many bugs 🐛🐜",
    "This app sux, don't download it. Waste of time 😡",
]

cleaned_reviews = clean_google_play_reviews(dirty_reviews)
for review in cleaned_reviews:
    print(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Luv dis app   gr bt r   many bug ❤🐛🐜
app sux download Waste time 😡


# Final Data Cleaning Function

In [17]:
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Define a mapping of emojis to sentiment labels
emoji_sentiment_mapping = {
    "😃": "happy",
    "😔": "sad",
    "😡": "angry",
    "❤️": "love",
    "👍": "positive",
    "👎": "negative",
    # Add more emojis and sentiment labels as needed
}

def convert_emoji_to_sentiment(emoji_char):
    return emoji_sentiment_mapping.get(emoji_char, "")

def clean_google_play_reviews(reviews):
    cleaned_reviews = []
    stop_words = set(stopwords.words('english'))
    
    for review in reviews:
        # Remove URLs
        review = re.sub(r'http\S+', '', review)
        
        # Extract emojis and their sentiment labels
        emojis = [c for c in review if c in emoji_sentiment_mapping]
        emoji_sentiments = [convert_emoji_to_sentiment(emoji) for emoji in emojis]
        review = ''.join([c for c in review if c not in emoji_sentiment_mapping])
        
        # Tokenize the review into words
        words = word_tokenize(review.lower())  # Convert to lowercase
        
        # Remove punctuation, special characters, and numbers
        words = [re.sub(r'[^a-zA-Z]', '', word) for word in words if word.isalpha()]
        
        # Remove stopwords
        words = [word for word in words if word not in stop_words]
        
        # Reconstruct the cleaned review
        cleaned_review = ' '.join(words)
        
        # Add emoji sentiments as words back to the cleaned review
        cleaned_review += ' ' + ' '.join(emoji_sentiments)
        
        # Remove extra spaces and append to the cleaned_reviews list
        cleaned_reviews.append(cleaned_review.strip())
    
    return cleaned_reviews

# Example usage:
dirty_reviews = [
    "Luv dis app ❤️ it's gr8! bt there r 2 many bugs 🐛🐜",
    "This app sux, don't download it. Waste of time 😡",
]

cleaned_reviews = clean_google_play_reviews(dirty_reviews)
for review in cleaned_reviews:
    print(review)

luv dis app bt r many bugs
app sux download waste time angry


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
