In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string

In [2]:
# Load training and testing data
train_df = pd.read_csv('cleaned_train.csv')
test_df = pd.read_csv('cleaned_test.csv')

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)
print("\nFirst few rows of training data:")
print(train_df.head())

Training data shape: (7613, 4)
Testing data shape: (3263, 3)

First few rows of training data:
   id keyword                                               text  target
0   1    none  Our Deeds are the Reason of this #earthquake M...       1
1   4    none             Forest fire near La Ronge Sask. Canada       1
2   5    none  All residents asked to 'shelter in place' are ...       1
3   6    none  13,000 people receive #wildfires evacuation or...       1
4   7    none  Just got sent this photo from Ruby #Alaska as ...       1


In [3]:
# Data Preprocessing
def preprocess_text(text):
    """
    Preprocesses tweet text by:
    - Converting to lowercase
    - Removing URLs
    - Removing mentions (@username)
    - Removing special characters and punctuation
    - Removing extra whitespace
    """
    if pd.isna(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove hashtags (keep the word, remove the #)
    text = re.sub(r'#', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply preprocessing to training and testing data
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

print("\nExample of cleaned text:")
print(f"Original: {train_df['text'].iloc[0]}")
print(f"Cleaned: {train_df['cleaned_text'].iloc[0]}")


Example of cleaned text:
Original: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Cleaned: our deeds are the reason of this earthquake may allah forgive us all


In [4]:
# Feature extraction using TF-IDF

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit to top 5000 features
    min_df=2,           # Ignore terms that appear in less than 2 documents
    max_df=0.8,         # Ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2), # Use unigrams and bigrams
    stop_words='english'
)

# Fit and transform training data
X_train = tfidf.fit_transform(train_df['cleaned_text'])
y_train = train_df['target']

# Transform testing data (use same vectorizer)
X_test = tfidf.transform(test_df['cleaned_text'])

print(f"\nTF-IDF feature matrix shape (training): {X_train.shape}")
print(f"TF-IDF feature matrix shape (testing): {X_test.shape}")



TF-IDF feature matrix shape (training): (7613, 5000)
TF-IDF feature matrix shape (testing): (3263, 5000)


In [5]:
# Split training data to create a validation set
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, 
    test_size=0.2, 
    random_state=42,
    stratify=y_train
)

print(f"\nTraining set size: {X_train_split.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")


Training set size: 6090
Validation set size: 1523


In [11]:
# Initialize and train the model
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    C=1.0,  # Regularization parameter
    class_weight='balanced'  # Handle class imbalance
)

print("\nTraining the model...")
model.fit(X_train_split, y_train_split)
print("Model training complete!")


Training the model...
Model training complete!


In [13]:
# Evaluate model on Validation set

# Make predictions on validation set
y_val_pred = model.predict(X_val)

# Calculate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=['Not Disaster', 'Disaster']))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))


Validation Accuracy: 0.8076

Classification Report:
              precision    recall  f1-score   support

Not Disaster       0.83      0.84      0.83       869
    Disaster       0.78      0.77      0.77       654

    accuracy                           0.81      1523
   macro avg       0.80      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523

Confusion Matrix:
[[727 142]
 [151 503]]


In [8]:
# Train model on full training data
print("\nTraining final model on full training data...")
final_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    C=1.0
)
final_model.fit(X_train, y_train)
print("Final model training complete!")


Training final model on full training data...
Final model training complete!


In [9]:
# Make predictions on test data
test_predictions = final_model.predict(X_test)

print(f"\nNumber of predictions: {len(test_predictions)}")
print(f"Predicted disasters: {sum(test_predictions)}")
print(f"Predicted non-disasters: {len(test_predictions) - sum(test_predictions)}")


Number of predictions: 3263
Predicted disasters: 1107
Predicted non-disasters: 2156


In [10]:
# Create CSV for predicted results
results = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions
})

# Save to CSV
results.to_csv('results.csv', index=False)
print("\nResults file created: results.csv")
print(results.head(10))


Results file created: results.csv
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1
5  12       1
6  21       0
7  22       0
8  27       0
9  29       0
