# In [None]:
# Cell 1: Import libraries
import csv
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Cell 2: Load data
# Resolve the data files against the current working directory and print it,
# so a FileNotFoundError from read_csv below is easy to diagnose.
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Construct file paths relative to the current directory.
# If the files live elsewhere, assign absolute paths to these two names instead.
train_file_path = os.path.join(current_directory, "train-data.tsv")
test_file_path = os.path.join(current_directory, "valid-data.tsv")

# Both files are read as header-less, tab-separated, two-column tables.
# quoting=csv.QUOTE_NONE makes the parser treat '"' as an ordinary character:
# with the default quoting, a message containing an unbalanced double quote
# can swallow the following tabs/newlines and merge several rows into one.
train_data = pd.read_csv(
    train_file_path,
    sep='\t',
    header=None,
    names=["Label", "Message"],
    quoting=csv.QUOTE_NONE,
)
test_data = pd.read_csv(
    test_file_path,
    sep='\t',
    header=None,
    names=["Label", "Message"],
    quoting=csv.QUOTE_NONE,
)


# Cell 3: Preprocess the data
def _encode_labels(labels, split_name):
    """Map 'ham'/'spam' labels to 0/1, raising ValueError on anything else.

    Series.map turns every value missing from the mapping into NaN; checking
    here reports the offending labels instead of failing later inside
    scikit-learn with an unrelated-looking message.
    """
    encoded = labels.map({'ham': 0, 'spam': 1})
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted({str(value) for value in labels[unmapped]})
        raise ValueError(
            f"Unexpected label(s) in {split_name} data: {unexpected}"
        )
    return encoded


# Map labels to numbers: ham = 0, spam = 1
train_data['Label'] = _encode_labels(train_data['Label'], "train")
test_data['Label'] = _encode_labels(test_data['Label'], "test")

X_train = train_data['Message']
y_train = train_data['Label']
X_test = test_data['Message']
y_test = test_data['Label']

# Cell 4: Vectorize the text messages using TF-IDF
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Cell 5: Train the model (Naive Bayes classifier)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Cell 6: Define the predict_message function
def predict_message(message):
    """Classify a single message as ham or spam.

    Args:
        message: The raw message text to classify.

    Returns:
        A two-element list ``[prob_spam, label]``: the model's probability
        for class 1 (spam) and the predicted label, ``'spam'`` or ``'ham'``.
    """
    # transform() takes an iterable of documents, so wrap the single message
    # in a list; this reuses the vocabulary fitted on the training data.
    features = vectorizer.transform([message])

    # Hard class decision for the only row (0: ham, 1: spam).
    predicted_class = model.predict(features)[0]

    # Row 0, column 1 is the probability of class 1 (spam), given the 0/1
    # label encoding the model was trained with.
    spam_probability = model.predict_proba(features)[0][1]

    if predicted_class == 1:
        label = 'spam'
    else:
        label = 'ham'
    return [spam_probability, label]

# Cell 7: Test the function with examples
test_message1 = "Free money! Claim your prize now!"
test_message2 = "Hey, are we still meeting at 6?"

# Print [prob_spam, label] for each sample, in order; the first is expected
# to be labelled 'spam' and the second 'ham'.
for sample_text in (test_message1, test_message2):
    print(predict_message(sample_text))

# Optional: Evaluate the model on the test set
# Share of test messages whose predicted class equals the true class.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.4f}")


# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven labelled sample messages.

  Prints a success message only if, for every sample, element 1 of
  predict_message's return value equals the expected label; otherwise
  prints a keep-trying message. Returns None.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, positionally aligned with test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # No early exit: every message is evaluated even after a mismatch.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is compared; the probability is ignored.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

# Run the check immediately when this cell executes.
test_predictions()
