In [None]:
# Cell 1: Setup, Imports, and Configuration
# Standard-library imports only: these are always available, so they can be
# loaded before any package installation happens.
import os
import sys
import re
import subprocess


# --- Stage 1: Install Required Libraries ---
def install(packages):
    """Install each package in ``packages`` with pip, one at a time.

    pip is invoked through the interpreter that runs this kernel
    (``sys.executable -m pip``), so packages land in the kernel's environment.

    Args:
        packages: iterable of pip requirement strings.

    Raises:
        subprocess.CalledProcessError: re-raised (after printing a message)
            when pip exits with a non-zero status for a package.
    """
    for package in packages:
        try:
            print(f"📦 Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to install {package}: {e}")
            raise


print("--- Starting Environment Setup ---")
required_packages = ["pandas", "scikit-learn", "thefuzz", "python-Levenshtein", "tqdm"]
install(required_packages)
print("✅ All libraries are ready.\n")

# Third-party imports are deliberately placed AFTER the install step: importing
# them first would raise ModuleNotFoundError on a fresh runtime where a package
# (e.g. thefuzz) is not installed yet, before install() ever gets to run.
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from thefuzz import fuzz
from tqdm.notebook import tqdm
import numpy as np

# --- Stage 2: Mount Drive and Define Configuration ---
drive.mount('/content/drive')

# Folder on the mounted Drive that holds the input file and receives the outputs.
GDRIVE_PATH = '/content/drive/MyDrive/eecsi_revise/'
RAW_DATA_FILE = 'merged_twitter_data.csv'
OUTPUT_CLEAN_FILE = 'final_preprocessed_data_with_dates.csv'
OUTPUT_DUPLICATES_FILE = 'duplicates_removed.csv'

# Column names: raw tweet text, its cleaned counterpart, and the timestamp.
TEXT_COLUMN_RAW = 'full_text'
TEXT_COLUMN_CLEAN = 'cleaned_text'
DATE_COLUMN = 'created_at'

# Tweets must contain this keyword (case-insensitive) to be kept.
KEYWORD = 'ikn'

# FILTER_THRESHOLD: minimum cosine similarity for a pair to become a duplicate
# candidate (used as radius = 1 - FILTER_THRESHOLD in the neighbour search).
# VERIFY_THRESHOLD: fuzzy-match score a candidate pair must exceed to count
# as a confirmed duplicate.
FILTER_THRESHOLD = 0.80
VERIFY_THRESHOLD = 90

print(f"✅ Setup complete. All files will be processed in: {GDRIVE_PATH}")

--- Starting Environment Setup ---
📦 Installing pandas...
📦 Installing scikit-learn...
📦 Installing thefuzz...
📦 Installing python-Levenshtein...
📦 Installing tqdm...
✅ All libraries are ready.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Setup complete. All files will be processed in: /content/drive/MyDrive/eecsi_revise/


In [None]:
# Cell 2: Load and Initial Filter
print("--- Stage 1: Loading and Initial Filtering ---")
raw_file_path = os.path.join(GDRIVE_PATH, RAW_DATA_FILE)

# Only the read itself can raise FileNotFoundError, so the try block is kept
# narrow; errors from later steps surface with their own traceback.
try:
    df = pd.read_csv(raw_file_path)
except FileNotFoundError:
    print(f"❌ ERROR: Raw data file not found at '{raw_file_path}'.")
    raise
print(f"Initial raw data loaded with {len(df)} rows.")

# Keep just the timestamp and text columns and drop rows without text.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below never writes into a slice of the raw data.
df = df[[DATE_COLUMN, TEXT_COLUMN_RAW]].dropna(subset=[TEXT_COLUMN_RAW]).copy()
df[TEXT_COLUMN_RAW] = df[TEXT_COLUMN_RAW].astype(str)
initial_count = len(df)

# regex=False: KEYWORD is a literal search term, not a pattern, so characters
# such as '+' or '.' in a future keyword are matched verbatim.
keyword_mask = df[TEXT_COLUMN_RAW].str.contains(KEYWORD, case=False, na=False, regex=False)
df = df[keyword_mask].reset_index(drop=True)
print(f"Filtered for keyword '{KEYWORD}'. Kept {len(df)} rows out of {initial_count}.")

--- Stage 1: Loading and Initial Filtering ---
Initial raw data loaded with 83847 rows.
Filtered for keyword 'ikn'. Kept 82062 rows out of 83847.


In [None]:
# Cell 3: Define and Apply Text Cleaning Functions

# Stage banner printed at the top of this cell's output.
print("\n--- Stage 2: Defining and Applying Text Cleaning ---")

# --- Text-cleaning helper functions (each takes and returns a string) ---
def remove_mentions(text):
    """Return ``text`` with every ``@`` + word-character run deleted."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)
def remove_hashtags(text):
    """Return ``text`` with each ``#`` and the non-space run after it deleted."""
    hashtag_pattern = re.compile(r'#\S+')
    return hashtag_pattern.sub('', text)
def remove_urls(text):
    """Return ``text`` without runs that start with ``http`` or ``www``.

    A match extends up to the next whitespace character.
    """
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)
def remove_punctuation(text):
    """Replace every punctuation character, including ``_``, with a space.

    A space is substituted (rather than deleting the character) so that words
    joined only by punctuation stay separate: "Bicara.Presiden" becomes
    "Bicara Presiden" instead of the fused token "BicaraPresiden".
    ``_`` is listed explicitly because ``\\w`` matches it, so ``[^\\w\\s]``
    alone would leave underscores in place.

    Args:
        text: string to process.

    Returns:
        The string with punctuation replaced by spaces. Runs of spaces are not
        collapsed here; that is left to a later whitespace-normalizing step.
    """
    return re.sub(r'[^\w\s]|_', ' ', text)
def remove_extra_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def clean_text(text):
    """Run the full cleaning pipeline on one tweet and return it lower-cased.

    The steps run in a fixed order: mentions, hashtags, URLs, punctuation,
    whitespace normalization, and finally lower-casing.
    """
    cleaning_steps = (
        remove_mentions,
        remove_hashtags,
        remove_urls,
        remove_punctuation,
        remove_extra_whitespace,
    )
    for step in cleaning_steps:
        text = step(text)
    return text.lower()

# Build the cleaned-text column by running clean_text over every raw tweet;
# tqdm.pandas() enables .progress_apply so the pass shows a progress bar.
tqdm.pandas(desc="Cleaning Tweets")
cleaned_series = df[TEXT_COLUMN_RAW].progress_apply(clean_text)
df[TEXT_COLUMN_CLEAN] = cleaned_series

# A tweet made up solely of mentions/hashtags/URLs/punctuation cleans down to
# an empty string; discard those rows (and any missing values) and renumber.
has_content = df[TEXT_COLUMN_CLEAN].notna() & (df[TEXT_COLUMN_CLEAN] != '')
df = df[has_content].reset_index(drop=True)

print(f"✅ Text cleaning complete. Data now has {len(df)} rows.")
display(df.head())


--- Stage 2: Defining and Applying Text Cleaning ---


Cleaning Tweets:   0%|          | 0/82062 [00:00<?, ?it/s]

✅ Text cleaning complete. Data now has 82062 rows.


Unnamed: 0,created_at,full_text,cleaned_text
0,Mon Dec 30 15:23:04 +0000 2024,@HarryPattyRM00 @Beritasatu Mungkin juga delus...,mungkin juga delusi tp jokowi sengaja push ikn...
1,Mon Dec 30 15:19:50 +0000 2024,@Srik4ndiMuslim2 Proyek IKN sudah mendapat per...,proyek ikn sudah mendapat persetujuan dpr cita...
2,Mon Dec 30 15:17:49 +0000 2024,@democrazymedia Ratusan Investor sudah ngantri...,ratusan investor sudah ngantri mau masuk ke ik...
3,Mon Dec 30 15:17:31 +0000 2024,@zzzZz_zZZz03 Sorry ya. Aku udah punya yang le...,sorry ya aku udah punya yang lebih baik 12 dar...
4,Mon Dec 30 15:12:50 +0000 2024,#KisiKisiPengemis500-1000dulu Bicara.Presiden ...,bicarapresiden ri ke 7 joko widodo sebagai bpk...


In [None]:
# Cell 4 (NEW): Contextual Filtering based on Cleaned Text Length

print("\n--- Stage 3: Applying Contextual Filtering ---")

# Tweets whose cleaned text has fewer whitespace-separated words than this
# are dropped.
MINIMUM_WORD_COUNT = 3

initial_rows = len(df)

# Count words per cleaned tweet, then keep the rows that meet the minimum.
word_counts = df[TEXT_COLUMN_CLEAN].str.split().str.len()
meets_minimum = word_counts >= MINIMUM_WORD_COUNT
df = df[meets_minimum].reset_index(drop=True)

print(f"Filtered out tweets with fewer than {MINIMUM_WORD_COUNT} words.")
print(f"Removed {initial_rows - len(df)} low-context tweets.")
print(f"✅ Contextual filtering complete. Data now has {len(df)} rows.")


--- Stage 3: Applying Contextual Filtering ---
Filtered out tweets with fewer than 3 words.
Removed 1333 low-context tweets.
✅ Contextual filtering complete. Data now has 80729 rows.


In [None]:
# Cell 5: Deduplication Stage 1 - Finding Duplicate Candidates
print("\n--- Stage 3: Finding duplicate candidates on CLEANED text ---")

# Vectorize the cleaned tweets with TF-IDF and index them for a brute-force
# cosine-distance neighbour search.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df[TEXT_COLUMN_CLEAN])
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# For every tweet, fetch the tweets within cosine distance 1 - FILTER_THRESHOLD.
distances, indices = nn.radius_neighbors(tfidf_matrix, radius=1 - FILTER_THRESHOLD)

# Turn neighbourhoods into candidate groups. A row that already appeared in an
# earlier group is not used as a seed again.
# NOTE(review): because absorbed rows are never seeds, two rows that were each
# absorbed by *different* earlier groups are not guaranteed to share a group.
potential_groups = []
processed_indices = set()
for i, neighbours in enumerate(indices):
    if i not in processed_indices:
        group = set(neighbours)
        if len(group) > 1:
            potential_groups.append(list(group))
            processed_indices.update(group)
print(f"✅ Stage 3 complete. Found {len(potential_groups)} potential groups to verify.")


--- Stage 3: Finding duplicate candidates on CLEANED text ---
✅ Stage 3 complete. Found 4811 potential groups to verify.


In [None]:
# Cell 6: Deduplication Stage 2 - Verifying Candidates with TheFuzz
print("\n--- Stage 4: Verifying candidate groups with TheFuzz (this may take a while)... ---")

# Compare every unordered pair inside each candidate group. When a pair scores
# above VERIFY_THRESHOLD, the member with the larger index is marked for
# removal, so the smaller-index tweet of the pair is the one that is kept.
final_duplicate_indices = set()
for group in tqdm(potential_groups, desc="Verifying Candidate Groups"):
    for position, idx1 in enumerate(group):
        for idx2 in group[position + 1:]:
            first_text = df.loc[idx1, TEXT_COLUMN_CLEAN]
            second_text = df.loc[idx2, TEXT_COLUMN_CLEAN]
            score = fuzz.token_set_ratio(first_text, second_text)
            if score > VERIFY_THRESHOLD:
                final_duplicate_indices.add(max(idx1, idx2))
print(f"✅ Verification complete. Found {len(final_duplicate_indices)} unique tweets to be removed.")


--- Stage 4: Verifying candidate groups with TheFuzz (this may take a while)... ---


Verifying Candidate Groups:   0%|          | 0/4811 [00:00<?, ?it/s]

✅ Verification complete. Found 13088 unique tweets to be removed.


In [None]:
# Cell 7: Finalizing and Saving Results
print("\n--- Stage 5: Finalizing data and saving results ---")

# The collected indices are row LABELS of df (they were looked up with df.loc
# during verification), so both the kept and the removed rows are selected by
# label. Mixing a label-based drop with a positional iloc only agrees while the
# index is a fresh 0..n-1 range.
indices_to_remove = sorted(final_duplicate_indices)
df_clean_final = df.drop(index=indices_to_remove)

# --- CRITICAL: Keep only the two columns needed for the final output ---
final_output = df_clean_final[[DATE_COLUMN, TEXT_COLUMN_CLEAN]].reset_index(drop=True)

# Keep the removed duplicates (all columns) for archival purposes.
df_duplicates_final = df.loc[indices_to_remove]

# Sanity check: kept rows and removed rows must partition the input exactly.
assert len(final_output) + len(df_duplicates_final) == len(df), \
    "kept + removed rows do not add up to the rows entering deduplication"

print("\n--- FINAL PREPROCESSING SUMMARY ---")
# len(df) here is the row count after keyword filtering, cleaning AND the
# minimum-word-count filter, i.e. what actually entered deduplication.
print(f"Rows before deduplication:           {len(df)}")
print(f"Number of duplicate rows removed:    {len(df_duplicates_final)}")
print(f"Final number of clean (unique) rows: {len(final_output)}")

# Save both results as CSV files in the configured Drive folder.
output_clean_path = os.path.join(GDRIVE_PATH, OUTPUT_CLEAN_FILE)
output_duplicates_path = os.path.join(GDRIVE_PATH, OUTPUT_DUPLICATES_FILE)
final_output.to_csv(output_clean_path, index=False, encoding='utf-8-sig')
df_duplicates_final.to_csv(output_duplicates_path, index=False, encoding='utf-8-sig')

print(f"\n✅ Preprocessing complete!")
print(f"Final clean data has been saved to: '{output_clean_path}'")
print(f"Removed duplicates have been saved to: '{output_duplicates_path}'")


--- Stage 5: Finalizing data and saving results ---

--- FINAL PREPROCESSING SUMMARY ---
Initial rows after keyword filtering: 80729
Number of duplicate rows removed:    13088
Final number of clean (unique) rows: 67641

✅ Preprocessing complete!
Final clean data has been saved to: '/content/drive/MyDrive/eecsi_revise/final_preprocessed_data_with_dates.csv'
Removed duplicates have been saved to: '/content/drive/MyDrive/eecsi_revise/duplicates_removed.csv'
