# Trained on combined 200 kaggle & bluesky training data, tested on ~340 bluesky data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string

In [2]:
# Load training and testing data
train_df = pd.read_csv('kag_blue_combined_train.csv')
test_df = pd.read_csv('test1.csv')

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)
print("\nFirst few rows of training data:")
print(train_df.head())

Training data shape: (200, 4)
Testing data shape: (341, 3)

First few rows of training data:
   id     keyword                                               text  target
0   1  earthquake  Magnitude 6.9 shook the central Philippines la...       1
1   2  earthquake  6.7 Magnitude Earthquake hit our region hard.....       1
2   3       flood  Hey, @democrats.org - the constant flood of pl...       0
3   4  earthquake  New earthquake reported: M 6.0 - 32 km SE of K...       1
4   5        none  It’s up to people to #RiseForGaza\nSHUT IT DOW...       0


In [3]:
def preprocess_text(text):
    """
    Preprocesses tweet text by:
    - Converting to lowercase
    - Removing URLs
    - Removing mentions (@username)
    - Removing emojis
    - Removing standalone numbers (magnitudes, depths, coordinates)
    - Removing dates and times
    - Removing special characters and punctuation
    - Removing extra whitespace
    - Keeping meaningful labels (location, magnitude, depth, etc.)
    """
    if pd.isna(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove hashtags (keep the word, remove the #)
    text = re.sub(r'#', '', text)
    
    # Remove emojis (comprehensive pattern for all emoji ranges)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters
        "\U0001F900-\U0001F9FF"  # supplemental symbols
        "\U0001FA00-\U0001FAFF"  # extended symbols
        "\U00002500-\U00002BEF"  # additional symbols
        "\U0001F000-\U0001F02F"  # mahjong tiles
        "\U0001F0A0-\U0001F0FF"  # playing cards
        "\U00002300-\U000023FF"  # miscellaneous technical (like ⏰)
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U0000FE00-\U0000FE0F"  # variation selectors
        "\U0001F200-\U0001F2FF"  # enclosed ideographic supplement
        "]+", 
        flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)
    
    # Remove standalone numbers that might be coordinates, magnitudes, depths
    # But keep words with numbers (like "2day" or "gr8")
    text = re.sub(r'\b\d+\.?\d*\b', '', text)
    
    # Remove dates and times (various formats)
    text = re.sub(r'\d{4}-\d{2}-\d{2}', '', text)
    text = re.sub(r'\d{2}:\d{2}:\d{2}', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply preprocessing to training and testing data
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

print("\nExample of cleaned training text:")
print(f"Original: {train_df['text'].iloc[0]}")
print(f"Cleaned: {train_df['cleaned_text'].iloc[0]}")

print("\nExample of cleaned test text:")
print(f"Original: {test_df['text'].iloc[0]}")
print(f"Cleaned: {test_df['cleaned_text'].iloc[0]}")


Example of cleaned training text:

Example of cleaned test text:
Original: New earthquake reported: M 4.9 - 79 km NNE of Popondetta, Papua New Guinea - 2025-10-06T17:36:45.040Z
Cleaned: new earthquake reported m km nne of popondetta papua new guinea 06t17040z


In [4]:
# Feature extraction using TF-IDF

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit to top 5000 features
    min_df=2,           # Ignore terms that appear in less than 2 documents
    max_df=0.8,         # Ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2), # Use unigrams and bigrams
    stop_words='english'
)

# Fit and transform training data
X_train = tfidf.fit_transform(train_df['cleaned_text'])
y_train = train_df['target']

# Transform testing data (use same vectorizer)
X_test = tfidf.transform(test_df['cleaned_text'])

print(f"\nTF-IDF feature matrix shape (training): {X_train.shape}")
print(f"TF-IDF feature matrix shape (testing): {X_test.shape}")



TF-IDF feature matrix shape (training): (200, 479)
TF-IDF feature matrix shape (testing): (341, 479)


In [5]:
# Split training data to create a validation set
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, 
    test_size=0.2, 
    random_state=42,
    stratify=y_train
)

print(f"\nTraining set size: {X_train_split.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")


Training set size: 160
Validation set size: 40


In [6]:
# Initialize and train the model
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    C=1.0,  # Regularization parameter
    class_weight='balanced'  # Handle class imbalance
)

print("\nTraining the model...")
model.fit(X_train_split, y_train_split)
print("Model training complete!")


Training the model...
Model training complete!


In [7]:
# Evaluate model on Validation set

# Make predictions on validation set
y_val_pred = model.predict(X_val)

# Calculate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=['Not Disaster', 'Disaster']))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))


Validation Accuracy: 0.7500

Classification Report:
              precision    recall  f1-score   support

Not Disaster       0.75      0.75      0.75        20
    Disaster       0.75      0.75      0.75        20

    accuracy                           0.75        40
   macro avg       0.75      0.75      0.75        40
weighted avg       0.75      0.75      0.75        40

Confusion Matrix:
[[15  5]
 [ 5 15]]


In [8]:
# Train model on full training data
print("\nTraining final model on full training data...")
final_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    C=1.0
)
final_model.fit(X_train, y_train)
print("Final model training complete!")


Training final model on full training data...
Final model training complete!


In [9]:
# Make predictions on test data
test_predictions = final_model.predict(X_test)

print(f"\nNumber of predictions: {len(test_predictions)}")
print(f"Predicted disasters: {sum(test_predictions)}")
print(f"Predicted non-disasters: {len(test_predictions) - sum(test_predictions)}")


Number of predictions: 341
Predicted disasters: 213
Predicted non-disasters: 128


In [11]:
# Create CSV for predicted results
results = pd.DataFrame({
    'id': test_df['id'],
    'text': test_df['text'],
    'target': test_predictions
})

# Save to CSV
results.to_csv('results1.csv', index=False)
print("\nResults file created: results1.csv")
print(results.head(10))


Results file created: results1.csv
           id                                               text  target
0  028e52b32c  New earthquake reported: M 4.9 - 79 km NNE of ...       1
1  2fff02eced  New earthquake reported: M 4.3 - 56 km SE of M...       1
2  80109009d2  Earthquake Location: OFF EAST COAST OF KAMCHAT...       1
3  f7a8e7d8d5  Earthquake Location: OFF EAST COAST OF KAMCHAT...       1
4  763fca34c1  Reviewed M 1.1 earthquake in Grady County, Okl...       1
5  2490bfc09d  #Earthquake M5.1 | Russia: Off East Coast of K...       1
6  237bb90a61  Earthquake Report\n\nEarthquake M5.1 has been ...       1
7  06154f4926  Reviewed M 1.8 earthquake in Alfalfa County, O...       1
8  eeb64ad17c  🌍 Earthquake Alert 🌍\n📍 Location: OFF EAST COA...       1
9  05198e8418  Magnitude : 5.3\nRegion: *Off East Coast of Ka...       1
