# Programming Exercise

## 1. Binary Classification on Text Data 

### (a) Download the data. 

In [99]:
import pandas as pd

# Read the labelled training file and the test file from the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

print("Number of training samples:", len(train))
print("Number of test samples:", len(test))

# Share of each 'target' label, in percent. The prints below report label 1
# as real-disaster tweets and label 0 as non-disaster tweets.
disaster_P = train['target'].value_counts(normalize=True).mul(100)
print("Percentage of real disaster tweets:", disaster_P.loc[1])
print("Percentage of non disaster tweets:", disaster_P.loc[0])


Number of training samples: 7613
Number of test samples: 3263
Percentage of real disaster tweets: 42.96597924602653
Percentage of non disaster tweets: 57.03402075397347


### (b) Split the training data.

In [100]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows as a development set. The fixed
# random_state keeps the partition identical from run to run.
train_set, dev_set = train_test_split(
    train,
    test_size=0.3,
    random_state=42,
)

### (c) Preprocess the data. 

In [101]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Built once here rather than inside preprocess_text: the function is applied
# to every tweet, and the lemmatizer and stop-word set do not change between
# calls.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text; strip URLs; strip @mentions and '#'
    characters (the word following '#' is kept); strip any remaining
    character that is neither a word character nor whitespace; split on
    whitespace; lemmatize each token; drop English stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    text = text.lower()

    # Remove URLs first so their punctuation does not leave fragments behind,
    # then @mentions and '#', then all other punctuation.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Stop words are filtered after lemmatization, so the membership check is
    # made on the lemma rather than on the original token.
    lemmas = (_LEMMATIZER.lemmatize(token) for token in text.split())
    return " ".join(lemma for lemma in lemmas if lemma not in _STOP_WORDS)

# Apply preprocessing to both train and dev sets.
# train_set and dev_set are selections of `train` produced by
# train_test_split; writing a column into them in place can trigger pandas'
# SettingWithCopyWarning. assign() returns a new frame instead, and the names
# are rebound so later cells keep reading train_set['text'] / dev_set['text'].
train_set = train_set.assign(text=train_set['text'].apply(preprocess_text))
dev_set = dev_set.assign(text=dev_set['text'].apply(preprocess_text))


### (d) Bag of words model.

In [102]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: a word enters the vocabulary only if it occurs in at
# least M training documents, and each feature records presence rather than
# a count.
M = 3
count_vec = CountVectorizer(min_df=M, binary=True)

# Learn the vocabulary from the training split only, then encode both splits
# against it as dense arrays.
count_vec.fit(train_set['text'])
X_train = count_vec.transform(train_set['text']).toarray()
X_dev = count_vec.transform(dev_set['text']).toarray()

print("Number of features:", X_train.shape[-1])

Number of features: 3061


### (e) Logistic regression.

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Logistic regression without a penalty term, trained on the bag-of-words
# features.
model = LogisticRegression(penalty=None, max_iter=500, random_state=42).fit(
    X_train, train_set['target']
)

# Predict on each split, then score each prediction against its own labels.
y_train_pred, y_dev_pred = model.predict(X_train), model.predict(X_dev)
f1_train = f1_score(y_true=train_set['target'], y_pred=y_train_pred)
f1_dev = f1_score(y_true=dev_set['target'], y_pred=y_dev_pred)

print(f"F1 Score on training set (no regularization): {f1_train}")
print(f"F1 Score on development set (no regularization): {f1_dev}")

F1 Score on training set (no regularization): 0.9792168015751477
F1 Score on development set (no regularization): 0.6848484848484848


In [104]:
# Logistic regression with an L1 penalty, using the liblinear solver.
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42).fit(
    X_train, train_set['target']
)

# Predictions for both splits, followed by their F1 scores.
y_train_pred_l1, y_dev_pred_l1 = lr_l1.predict(X_train), lr_l1.predict(X_dev)
f1_train_l1 = f1_score(y_true=train_set['target'], y_pred=y_train_pred_l1)
f1_dev_l1 = f1_score(y_true=dev_set['target'], y_pred=y_dev_pred_l1)

print("F1 Score on training set with L1 :", f1_train_l1)
print("F1 Score on development set with L1:", f1_dev_l1)



F1 Score on training set with L1 : 0.8481071098799631
F1 Score on development set with L1: 0.743013698630137


In [105]:
# Logistic regression with an L2 penalty; the solver is left at its default.
lr_l2 = LogisticRegression(penalty='l2', random_state=42).fit(
    X_train, train_set['target']
)

# Predictions for both splits, followed by their F1 scores.
y_train_pred_l2, y_dev_pred_l2 = lr_l2.predict(X_train), lr_l2.predict(X_dev)
f1_train_l2 = f1_score(y_true=train_set['target'], y_pred=y_train_pred_l2)
f1_dev_l2 = f1_score(y_true=dev_set['target'], y_pred=y_dev_pred_l2)

print("F1 Score on training set (L2 regularization):", f1_train_l2)
print("F1 Score on development set (L2 regularization):", f1_dev_l2)

F1 Score on training set (L2 regularization): 0.8896
F1 Score on development set (L2 regularization): 0.7501360914534567


In [106]:
import numpy as np
# Rank vocabulary words by the absolute value of their weight in the
# L1-regularised model and list the ten largest, biggest first.
feature_names = count_vec.get_feature_names_out()
coefficients = lr_l1.coef_[0]
ascending_by_magnitude = np.argsort(np.abs(coefficients))
top_indices = ascending_by_magnitude[-10:]

print("Top 10 most important words:")
# top_indices is in ascending order of magnitude, so walk it backwards.
for word, coef in zip(feature_names[top_indices][::-1], coefficients[top_indices][::-1]):
    print(f"word: '{word}' {(10 - len(word)) * ' '} coefficient: {coef}")

Top 10 most important words:
word: 'hiroshima'   coefficient: 3.4346707499188924
word: 'spill'       coefficient: 3.3984058462691933
word: 'mh370'       coefficient: 3.322315652744539
word: 'airport'     coefficient: 3.228504789630286
word: 'derailment'  coefficient: 3.216282178210582
word: 'typhoon'     coefficient: 3.145955359880152
word: 'migrant'     coefficient: 3.133761583749795
word: 'wildfire'    coefficient: 3.0836921168088325
word: 'earthquake'  coefficient: 2.8840924642956045
word: 'crew'        coefficient: 2.5972241151595705


### (f) Bernoulli Naive Bayes.

In [107]:
import numpy as np
from sklearn.metrics import f1_score

class BernoulliNB:
    """Bernoulli Naive Bayes classifier for binary (0/1) feature matrices.

    Parameters
    ----------
    alpha : float
        Additive smoothing constant. Each per-class feature probability is
        estimated as (count + alpha) / (class_size + 2 * alpha).

    Attributes
    ----------
    classes : ndarray or None
        Sorted unique labels seen in ``fit``; None until fitted.
    class_log_pri : ndarray or None
        Log prior of each class, shape (n_classes,).
    feature_log_prob : ndarray or None
        Log P(feature = 1 | class), shape (n_classes, n_features).
    feature_log_neg_prob : ndarray or None
        Log P(feature = 0 | class), shape (n_classes, n_features).
    """

    def __init__(self, alpha):
        self.alpha = alpha
        self.classes = None
        self.class_log_pri = None
        self.feature_log_prob = None
        self.feature_log_neg_prob = None

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; the model treats entries as 0/1 indicators.
        y : array-like of shape (n_samples,)
            Class labels. Lists, NumPy arrays and pandas Series are accepted.
        """
        # Convert up front so plain lists work and the boolean mask below is
        # always a positional NumPy mask.
        X = np.asarray(X)
        y = np.asarray(y)

        _, num_features = X.shape
        self.classes = np.unique(y)
        num_classes = len(self.classes)

        feature_count = np.zeros((num_classes, num_features))
        class_count = np.zeros(num_classes)

        # Per class: how many samples it has and how often each feature is on.
        for i, cls in enumerate(self.classes):
            X_class = X[y == cls]
            feature_count[i, :] = X_class.sum(axis=0)
            class_count[i] = X_class.shape[0]

        # Smoothed log probabilities. The "feature absent" term is computed
        # directly from the counts instead of as log(1 - exp(log p)), which
        # avoids an exp/log round trip on every predict call.
        smoothed_total = class_count[:, None] + 2 * self.alpha
        self.feature_log_prob = np.log((feature_count + self.alpha) / smoothed_total)
        self.feature_log_neg_prob = np.log(
            (class_count[:, None] - feature_count + self.alpha) / smoothed_total
        )
        self.class_log_pri = np.log(class_count / y.shape[0])

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Binary feature matrix with the same columns used in ``fit``.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, drawn from ``self.classes``.

        Raises
        ------
        AttributeError
            If called before ``fit``. The exception type matches what an
            unfitted model raised before this check existed; only the message
            is clearer.
        """
        if (
            self.classes is None
            or self.feature_log_prob is None
            or self.feature_log_neg_prob is None
        ):
            raise AttributeError("BernoulliNB is not fitted yet; call fit(X, y) first.")

        X = np.asarray(X)
        # Sum of log P(x_j | class) over features: present features use
        # feature_log_prob, absent ones use feature_log_neg_prob.
        log_likelihood = X @ self.feature_log_prob.T + (1 - X) @ self.feature_log_neg_prob.T
        log_prob = log_likelihood + self.class_log_pri
        return self.classes[np.argmax(log_prob, axis=1)]

# Fit the Bernoulli Naive Bayes class defined above (alpha=1.0) on the
# bag-of-words training features, then score its predictions on the dev split.
model_BNB = BernoulliNB(alpha=1.0)
model_BNB.fit(X=X_train, y=train_set['target'])
y_pred_BNB = model_BNB.predict(X=X_dev)

f1_BNB = f1_score(y_true=dev_set['target'], y_pred=y_pred_BNB)
print("F1 Score for Bernoulli Naive Bayes model:", f1_BNB)


F1 Score for Bernoulli Naive Bayes model: 0.7526881720430108


### (h) N-gram model.

In [108]:
# Bigram features: keep 2-grams that occur in at least M training documents,
# encoded as presence/absence.
M = 2
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=M, binary=True)
vectorizer.fit(train_set['text'])
X_train_ngrams = vectorizer.transform(train_set['text']).toarray()
X_dev_ngrams = vectorizer.transform(dev_set['text']).toarray()

# Vocabulary size and a small sample of its entries.
bigram_vocab = vectorizer.get_feature_names_out()
print(f"Total number of 2-grams in the vocabulary: {len(bigram_vocab)}")
print("10 example 2-grams from the vocabulary:")
print(bigram_vocab[:10])

# L2-regularised logistic regression on the bigram features (same settings as
# the unigram L2 model in part e).
lr_ngram = LogisticRegression(penalty='l2', random_state=42)
lr_ngram.fit(X_train_ngrams, train_set['target'])

y_pred_lr = lr_ngram.predict(X_dev_ngrams)
f1_lr = f1_score(y_true=dev_set['target'], y_pred=y_pred_lr)
print("F1 Score of development set for Logistic Regression:", f1_lr)

y_pred_lr_train = lr_ngram.predict(X_train_ngrams)
f1_lr_train = f1_score(y_true=train_set['target'], y_pred=y_pred_lr_train)
print("F1 Score of training set for Logistic Regression:", f1_lr_train)

# Bernoulli Naive Bayes from part f on the same bigram features.
bnb_ngram = BernoulliNB(alpha=1.0)
bnb_ngram.fit(X=X_train_ngrams, y=train_set['target'])

y_pred_bnb = bnb_ngram.predict(X=X_dev_ngrams)
f1_bnb = f1_score(y_true=dev_set['target'], y_pred=y_pred_bnb)
print("F1 Score on development set for Bernoulli Naive Bayes:", f1_bnb)

y_pred_bnb_train = bnb_ngram.predict(X=X_train_ngrams)
f1_bnb_train = f1_score(y_true=train_set['target'], y_pred=y_pred_bnb_train)
print("F1 Score on training set for Bernoulli Naive Bayes:", f1_bnb_train)


Total number of 2-grams in the vocabulary: 3527
10 example 2-grams from the vocabulary:
['010401 utc20150805' '10 year' '10 yr' '101 cook' '1030 pm' '10401 utc'
 '109 sn' '10km maximum' '10th death' '11 charged']
F1 Score of development set for Logistic Regression: 0.5644599303135889
F1 Score of training set for Logistic Regression: 0.7212317666126418
F1 Score on development set for Bernoulli Naive Bayes: 0.48259860788863107
F1 Score on training set for Bernoulli Naive Bayes: 0.614521841794569


In [109]:
# Final model: refit on every labelled row (train + dev, whose 'text' column
# was already run through preprocess_text) and predict the test set.
full_train = pd.concat([train_set, dev_set])

# The labelled text was cleaned with preprocess_text, so the test text has to
# go through the same function; otherwise the vocabulary is learned on
# cleaned tokens but applied to raw tweets. A separate Series is used so the
# `test` frame itself is left untouched.
test_text = test['text'].apply(preprocess_text)

# Build feature vectors not using n-grams method: binary unigram features,
# keeping words that occur in at least 4 documents.
vectorizer_full = CountVectorizer(min_df=4, binary=True)
X_full_train = vectorizer_full.fit_transform(full_train['text']).toarray()
X_test = vectorizer_full.transform(test_text).toarray()

# Train using BernoulliNaiveBayes Model.
# The labels come from `full_train`; the previous name `full_train_data` was
# never defined and raised NameError.
bnb = BernoulliNB(alpha=1.0)
bnb.fit(X_full_train, full_train['target'])

y_test_pred = bnb.predict(X_test)

# One row per test tweet: its id and the predicted label.
submission = pd.DataFrame({'id': test['id'], 'target': y_test_pred})
submission.to_csv('submission.csv', index=False)