In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords

def clean_text(text):
    # Remove '@' but keep the username
    text = re.sub(r'@', '', text)
    # Remove all URLs including 'pic.twitter.com'
    text = re.sub(r'(https?://\S+|www\.\S+|pic\.twitter\.com\S*)', '', text)
    # Remove hashtags and their words
    text = re.sub(r'#\S+', '', text)
    # Remove non-alphanumeric characters (except spaces)
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Strip extra whitespace
    text = text.strip()
    return text

# Load dataset
file_path = r"C:\Users\lelgr\Downloads\tweet.csv"
df = pd.read_csv(file_path)

# Drop unnecessary column
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Normalize sentiment column
df['sentimen'] = df['sentimen'].str.lower().str.strip()

# Normalize text (convert informal words to standard form)
def normalize_text(text):
    transformations = {
        # Kata Ganti Orang
        r'\bgw\b': 'aku',
        r'\bgue\b': 'aku',
        r'\bloe\b': 'kamu',
        r'\blu\b': 'kamu',
        r'\bnya\b': '',
        r'\bmu\b': '',
        r'\bsi\b': '',
        
        # Kata Depan
        r'\bkpd\b': 'kepada',
        
        # Kata Sambung
        r'\btp\b': 'tapi',
        r'\bklo\b': 'kalau',
        
        # Kata Keterangan
        r'\bkok\b': '',
        r'\bmalah\b': '',
        r'\bcuma\b': 'hanya',
        r'\baja\b': 'saja',
        r'\bkan\b': '',
        r'\bnih\b': '',
        r'\bga\b': 'tidak',
        r'\bggak\b': 'tidak',
        
        # Kata Seru/Partikel
        r'\bdong\b': '',
        
        # Singkatan Umum
        r'\byg\b': 'yang',
        r'\bpd\b': 'pada',
        r'\butk\b': 'untuk',
        r'\bdr\b': 'dari',
        r'\bdg\b': 'dengan',
        r'\bbkn\b': 'bukan',
        r'\bdll\b': 'dan lain-lain',
        r'\bbhw\b': 'bahwa',
        r'\bkrn\b': 'karena',
        r'\bsbg\b': 'sebagai',
        r'\bdlm\b': 'dalam',
        r'\bjgn\b': 'jangan',
        r'\bhrs\b': 'harus',
        r'\bmsh\b': 'masih',
        r'\bsdh\b': 'sudah',
        r'\bblm\b': 'belum',
        r'\bkl\b': 'kalau',
        r'\bdpt\b': 'dapat',
        r'\bsll\b': 'selalu',
        r'\btsb\b': 'tersebut',
        
        # Kata Penegas/Penguat
        r'\bbanget\b': 'sangat',
        
        # Normalisasi kata yang sering salah ketik atau variasi
        r'\bkarna\b': 'karena',
        r'\btrus\b': 'terus'
    }
    for pattern, replacement in transformations.items():
        text = re.sub(pattern, replacement, text)
    return text

# Stopwords removal
def remove_stopwords(text):
    stop_words = set(stopwords.words("indonesian") + stopwords.words("english"))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply normalization and stopwords removal
df['tweet'] = df['tweet'].apply(clean_text)
df['tweet'] = df['tweet'].apply(normalize_text)
df['tweet'] = df['tweet'].apply(remove_stopwords)

In [2]:
from kbbi import KBBI

In [3]:
from joblib import Parallel, delayed

In [4]:
def lemmatize_text(text):
    words = text.split()
    lemmatized_words = []
    
    for word in words:
        try:
            lemma = KBBI(word).lemma()[0]  # Get first lemma
        except:
            lemma = word  # Keep word unchanged if not found
        lemmatized_words.append(lemma)
    
    return " ".join(lemmatized_words)

# Apply cleaning and then lemmatization
df['tweet'] = Parallel(n_jobs=-1)(delayed(lemmatize_text)(tweet) for tweet in df['tweet'])

In [5]:
df

Unnamed: 0,sentimen,tweet
0,negatif,prabowo indonesia dihargai bangsa asing berita...
1,netral,batuan langka tasbih jokowi hadiah habib luthf...
2,netral,era jokowi ekonomi indonesia
3,positif,sumatera selatan asian games berdampak ekonomi...
4,negatif,negara ngutang bngun infrastruktur udah dipake...
...,...,...
1810,netral,negarawan sejati bangga mengedepankan harga ba...
1811,netral,1 ceramah damai indonesiaku 2 perekonomian mem...
1812,netral,mari bangun bangsa dgn mendukung perekonomian ...
1813,netral,bantu majukan perekonomian bangsa jokowi yuk


In [6]:
# Save cleaned data
df.to_csv("cleaned_tweets(1).csv", index=False)

In [7]:
df

Unnamed: 0,sentimen,tweet
0,negatif,prabowo indonesia dihargai bangsa asing berita...
1,netral,batuan langka tasbih jokowi hadiah habib luthf...
2,netral,era jokowi ekonomi indonesia
3,positif,sumatera selatan asian games berdampak ekonomi...
4,negatif,negara ngutang bngun infrastruktur udah dipake...
...,...,...
1810,netral,negarawan sejati bangga mengedepankan harga ba...
1811,netral,1 ceramah damai indonesiaku 2 perekonomian mem...
1812,netral,mari bangun bangsa dgn mendukung perekonomian ...
1813,netral,bantu majukan perekonomian bangsa jokowi yuk


In [15]:
df

Unnamed: 0,sentimen,tweet
0,negatif,prabowo indonesia dihargai bangsa asing berita...
1,netral,batuan langka tasbih jokowi hadiah habib luthf...
2,netral,era jokowi ekonomi indonesia
3,positif,sumatera selatan asian games berdampak ekonomi...
4,negatif,negara ngutang bngun infrastruktur udah dipake...
...,...,...
1810,netral,negarawan sejati bangga mengedepankan harga ba...
1811,netral,1 ceramah damai indonesiaku 2 perekonomian mem...
1812,netral,mari bangun bangsa dgn mendukung perekonomian ...
1813,netral,bantu majukan perekonomian bangsa jokowi yuk
