In [14]:
import nltk
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

# Location of the spam/ham dataset. Defined once so that both read attempts
# below are guaranteed to target the same file.
DATA_PATH = 'C:\\Users\\DELL\\OneDrive\\Desktop\\aiml\\spamhamdata.csv'

# Load the dataset into `messages` with exactly two columns: 'label' and 'text'.
# First attempt: default (comma) delimiter with four named columns, of which
# only the first two are kept.
try:
    messages = pd.read_csv(
        DATA_PATH,
        encoding="latin-1",
        header=None,
        names=['label', 'text', 'col3', 'col4'],
    )
except pd.errors.ParserError:
    # Only a parse failure is worth retrying with a different delimiter.
    # Anything else (missing file, permission error, Ctrl-C) would fail the
    # same way on the second read, so it is allowed to propagate directly.
    messages = pd.read_csv(
        DATA_PATH,
        encoding="latin-1",
        sep='\t',
        header=None,
        names=['label', 'text'],
    )
else:
    messages = messages[['label', 'text']]

# Quick sanity checks on the loaded data.
print('Number of nulls in labels:', messages['label'].isnull().sum())
print('Number of nulls in text:', messages['text'].isnull().sum())
print(messages.head(20))

# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus)
# to the plain list of English stop words. The rebinding is kept unchanged
# because other code may read `stopwords` as a list.
stopwords = stopwords.words('english')

# Same stop words as a set, so each membership test in clean_text is a hash
# lookup instead of a scan over the whole list.
_STOPWORD_SET = frozenset(stopwords)

# Translation table that deletes every character in string.punctuation in a
# single pass over the text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Tokenise *text* for use as a TfidfVectorizer analyzer.

    The input is converted with ``str()``, stripped of every character in
    ``string.punctuation``, lower-cased, split into runs of word characters,
    and filtered against the English stop-word list.

    Args:
        text: The raw message; any object is accepted and passed through
            ``str()`` first.

    Returns:
        A list of lower-cased tokens. Empty tokens are never produced, so
        empty or punctuation-only input yields an empty list.
    """
    stripped = str(text).translate(_PUNCT_TABLE).lower()
    # findall(r'\w+') yields the same tokens as split(r'\W+') minus the empty
    # strings that split() emits for empty input or leading/trailing
    # separators; those empty strings would otherwise become a feature.
    tokens = re.findall(r'\w+', stripped)
    # NOTE(review): punctuation is removed before this comparison, so any
    # stop-word entry that itself contains a punctuation character can never
    # match here - confirm whether that is intended.
    return [token for token in tokens if token not in _STOPWORD_SET]


# Turn every message into a TF-IDF feature vector, using clean_text as the
# analyzer so tokenisation is identical at training and prediction time.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(messages['text'])

# Keep the features as the sparse matrix returned by fit_transform instead of
# expanding them into a dense DataFrame. A documents-by-vocabulary TF-IDF
# matrix is almost entirely zeros; train_test_split and RandomForestClassifier
# both accept sparse input. This also makes training use the same
# representation as the manual prediction at the bottom of this block.
X_features = X_tfidf

# Hold out 20% of the messages for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    messages['label'],
    test_size=0.2,
    random_state=42,
)

# Seed the forest as well: with only the split seeded, precision and recall
# would still change from run to run.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train, y_train)

# Evaluate on the held-out messages, treating 'spam' as the positive class.
y_pred = rf_model.predict(X_test)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
print(f'Precision: {round(precision, 3)} / Recall: {round(recall, 3)}')

# Manual test: classify one hand-written message with the fitted vectoriser
# and model.
text = ["free entry in 2 a wkli comp to win FA cup final tkts 21st May 2005"]
text_tfidf = tfidf_vect.transform(text)
print("Prediction:", rf_model.predict(text_tfidf)[0])

Number of nulls in labels: 0
Number of nulls in text: 0
   label  \
0    ham   
1    ham   
2   spam   
3    ham   
4    ham   
5   spam   
6    ham   
7    ham   
8   spam   
9   spam   
10   ham   
11  spam   
12  spam   
13   ham   
14   ham   
15  spam   
16   ham   
17   ham   
18   ham   
19  spam   

                                                                                                   text  
0   Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                         Ok lar... Joking wif u oni...  
2   Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...  
3                                                     U dun say so early hor... U c already then say...  
4                                         Nah I don't think he goes to usf, he lives around here though  
5   FreeMsg Hey there darling it's been 3 week's now an