In [1]:
import nltk
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

# Location of the labelled SMS dataset; shared by both read attempts below.
DATA_PATH = 'C:\\Users\\DELL\\OneDrive\\Desktop\\aiml\\spamhamdata.csv'

try:
    # First attempt: default (comma) separator with four named columns,
    # of which only 'label' and 'text' are kept.
    messages = pd.read_csv(
        DATA_PATH,
        encoding="latin-1",
        header=None,
        names=['label', 'text', 'col3', 'col4'],
    )
except (pd.errors.ParserError, ValueError):
    # Only parsing problems trigger the fallback. Anything else (missing
    # file, permission error, Ctrl-C) propagates instead of being silently
    # retried against the same path.
    messages = pd.read_csv(
        DATA_PATH,
        encoding="latin-1",
        sep='\t',
        header=None,
        names=['label', 'text'],
    )
else:
    # Drop the two extra columns read by the comma-separated attempt.
    messages = messages[['label', 'text']]

# Quick sanity checks on the loaded data.
print('Number of nulls in labels:', messages['label'].isnull().sum())
print('Number of nulls in text:', messages['text'].isnull().sum())
print(messages.head(20))

# NOTE: this rebinds the imported `stopwords` corpus reader to a plain list of
# English stop words; the name is kept as-is for backward compatibility.
stopwords = stopwords.words('english')

# Set form of the stop-word list for O(1) membership tests per token.
_STOPWORD_SET = frozenset(stopwords)

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Tokenize *text* for use as a TF-IDF analyzer.

    The input is converted to ``str``, stripped of ASCII punctuation,
    lower-cased and split on runs of non-word characters. Stop words and
    empty tokens are discarded.

    Args:
        text: The raw message; any object is accepted and passed to ``str``.

    Returns:
        A list of lower-case tokens, possibly empty.
    """
    normalized = str(text).translate(_PUNCT_TABLE).lower()
    tokens = re.split(r'\W+', normalized)
    # re.split yields '' for empty input or leading/trailing separators;
    # dropping it keeps a meaningless empty-string feature out of the
    # vocabulary.
    return [word for word in tokens if word and word not in _STOPWORD_SET]


# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only. Fitting on the full
# corpus leaks test-set information into the features and inflates the
# reported metrics.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    messages['text'],
    messages['label'],
    test_size=0.2,
    random_state=42,
)

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
# Keep the sparse matrices: the classifier accepts them directly, and this is
# the same representation the manual prediction below passes to the model.
X_train = tfidf_vect.fit_transform(X_train_text)
X_test = tfidf_vect.transform(X_test_text)

rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

# Evaluate on the held-out messages, treating 'spam' as the positive class.
y_pred = rf_model.predict(X_test)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
print(f'Precision: {round(precision, 3)} / Recall: {round(recall, 3)}')

# Manual test: classify a single hand-written message with the fitted
# vectorizer and model.
text = ["you won 1000 dollar for free share your otp now"]
text_tfidf = tfidf_vect.transform(text)
print("Prediction:", rf_model.predict(text_tfidf)[0])

Number of nulls in labels: 0
Number of nulls in text: 0
   label                                               text
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...
5   spam  FreeMsg Hey there darling it's been 3 week's n...
6    ham  Even my brother is not like to speak with me. ...
7    ham  As per your request 'Melle Melle (Oru Minnamin...
8   spam  WINNER!! As a valued network customer you have...
9   spam  Had your mobile 11 months or more? U R entitle...
10   ham  I'm gonna be home soon and i don't want to tal...
11  spam  SIX chances to win CASH! From 100 to 20,000 po...
12  spam  URGENT! You have won a 1 week FREE membership ...
13   ham  I've been searching for the right words to tha...
14   ham                I HAVE A DATE ON SUN