## Installing and Importing Dependencies

In [1]:
!pip install pandas numpy kagglehub nltk gensim 



In [2]:
import pandas as pd
import numpy as np
import kagglehub
import os
import nltk
import gensim

# Fetch only the NLTK resources this notebook uses instead of the whole 'all'
# collection: tokenizer models for word_tokenize ('punkt' / 'punkt_tab',
# depending on the installed NLTK version), the stop-word lists, and the
# WordNet data used by WordNetLemmatizer. quiet=True avoids a wall of log output.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from gensim.models import Word2Vec

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\kalat\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\kalat\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\kalat\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\kalat\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\kalat\AppData\Roaming\nltk_data...
[

## Loading Data

##### Boston Housing Dataset

In [3]:
# Download the Boston housing dataset through kagglehub; the call returns the
# local directory that holds the dataset files.
path = kagglehub.dataset_download("altavish/boston-housing-dataset")
print("Path to dataset files:", path)
# Read the CSV from the downloaded directory into a DataFrame.
csv_file = os.path.join(path, 'HousingData.csv')
boston_house_data = pd.read_csv(csv_file)

Path to dataset files: C:\Users\kalat\.cache\kagglehub\datasets\altavish\boston-housing-dataset\versions\1


In [4]:
# Preview the first rows of the housing data.
boston_house_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


##### SMS Spam Dataset

In [5]:
# Download the SMS spam dataset through kagglehub. Note that `path` and
# `csv_file` are reassigned here and now point at the spam dataset.
path = kagglehub.dataset_download("marslinoedward/sms-spam-dataset")
print("Path to dataset files:", path)
# Read the CSV from the downloaded directory into a DataFrame.
csv_file = os.path.join(path, 'spam.csv')
spam_data = pd.read_csv(csv_file)

Path to dataset files: C:\Users\kalat\.cache\kagglehub\datasets\marslinoedward\sms-spam-dataset\versions\1


In [6]:
# Rename the label column first, then keep only the label and message columns.
# DataFrame.rename ignores labels that are not present, so re-running this cell
# after 'spamORham' has already become 'Status' no longer raises a KeyError.
spam_data = spam_data.rename(columns = {'spamORham' : 'Status'})[['Status', 'Message']]
spam_data.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Downloading Directly from Kaggle

###### Boston Housing Dataset --> https://www.kaggle.com/datasets/altavish/boston-housing-dataset
###### SMS Spam Dataset --> https://www.kaggle.com/datasets/marslinoedward/sms-spam-dataset

## Supervised Learning

### Linear Regression

#### Preprocessing

In [7]:
# Work on a copy so the originally loaded frame is left unchanged.
house_data = boston_house_data.copy()
# Count the missing values in each column.
house_data.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [8]:
# Replace each missing value with the mean of its column.
# NOTE(review): the means are computed over all rows, before the train/test
# split further down, so held-out rows influence the imputed values. Consider
# imputing after the split with training-set statistics only.
# NOTE(review): CHAS shows only 0.0 in the preview above and looks like an
# indicator column; a mean fill would give it fractional values. Confirm.
house_data = house_data.fillna(house_data.mean())

#### Splitting data for Model Training

In [9]:
# Features are every column except the target. Dropping 'MEDV' by name (rather
# than slicing off the last column by position) keeps X_reg consistent with
# y_reg even if the column order of the CSV changes.
X_reg = house_data.drop(columns = 'MEDV')
# Target column used by the regression below.
y_reg = house_data['MEDV']

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size = 0.2, random_state = 2744)

#### Model Training

In [11]:
# Fit a linear regression with default settings on the training split.
linear_reg = LinearRegression()
linear_reg.fit(X_train_reg, y_train_reg)

In [12]:
# Predict the target for the held-out rows.
y_pred_reg = linear_reg.predict(X_test_reg)

#### Performance evaluation

In [13]:
# Report the error and goodness of fit on the held-out split (true values first, predictions second).
print('Mean Squared Error(MSE) :', mean_squared_error(y_test_reg, y_pred_reg))
print('R2 Score                :', r2_score(y_test_reg, y_pred_reg))

Mean Squared Error(MSE) : 14.298176666463789
R2 Score                : 0.8003426828750593


### Logistic Regression

In [14]:
# Work on a copy so the text-processing columns added later do not modify spam_data.
log_data = spam_data.copy()
log_data.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Preprocessing Data

In [15]:
# Shared helpers for preprocess_text: a WordNet lemmatizer and the English
# stop-word list, held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [16]:
# Characters that are turned into a space before tokenizing (punctuation,
# digits, newline and tab). Apostrophes are deleted outright rather than
# replaced, so "don't" becomes "dont" instead of "don t".
_NOISE_CHARS = '~!@#$%^&*()_+-=<>?,/:;"{}[]\n\t0123456789.'
_NOISE_TABLE = str.maketrans(_NOISE_CHARS, ' ' * len(_NOISE_CHARS), "'")

def preprocess_text(text) :
    """Normalize one message into a space-separated string of lemmatized tokens.

    Steps: lowercase the text, replace the characters in ``_NOISE_CHARS`` with
    spaces and delete apostrophes, tokenize with ``word_tokenize``, drop tokens
    found in ``stop_words``, and lemmatize the remaining tokens with
    ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example a NaN from a
        missing cell) is treated as an empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be the empty string.
    """
    # A missing value would otherwise fail on .lower(); treat it as an empty message.
    if not isinstance(text, str):
        return ''
    # One translate() pass replaces the per-character replace() loop, which also
    # re-ran the apostrophe removal on every iteration.
    cleaned = text.lower().translate(_NOISE_TABLE)
    kept = [lemmatizer.lemmatize(word) for word in word_tokenize(cleaned) if word not in stop_words]
    return ' '.join(kept)

# Store the cleaned version of every message in a new column next to the original text.
log_data['Processed Text'] = log_data['Message'].apply(preprocess_text)
log_data.head()

Unnamed: 0,Status,Message,Processed Text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


#### Using TF-IDF Vectorizer

##### Vectorizing Using tf-idf

In [17]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Hold out 30% of the messages for testing; the fixed random_state makes the split repeatable.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(log_data['Processed Text'], log_data['Status'], test_size = 0.3, random_state = 2744)
# The vectorizer is fitted on the training messages only and then applied
# unchanged to the test messages.
X_train_log_tfidf = vectorizer.fit_transform(X_train_log)
X_test_log_tfidf = vectorizer.transform(X_test_log)

##### Fitting LogisticRegression() Model

In [18]:
# Fit a logistic regression with default settings on the TF-IDF training features.
log_reg = LogisticRegression()
log_reg.fit(X_train_log_tfidf, y_train_log)

In [19]:
# Predict the label of every held-out message.
y_pred_log = log_reg.predict(X_test_log_tfidf)

##### Evaluating Model

In [20]:
# Evaluate the TF-IDF model on the held-out split (true labels first, predictions second).
print('Accuracy Score        :', accuracy_score(y_test_log, y_pred_log))
print('Confusion Matrix      :\n', confusion_matrix(y_test_log, y_pred_log))
print('Classification Report :\n', classification_report(y_test_log, y_pred_log))

Accuracy Score        : 0.9706937799043063
Confusion Matrix      :
 [[1467    2]
 [  47  156]]
Classification Report :
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1469
        spam       0.99      0.77      0.86       203

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.97      1672



#### Using Word2Vec

In [21]:
w2v = Word2Vec(log_data['Processed Text'], vector_size = 150, min_count = 1)

In [22]:
def sentence_vector(tokens, model):
    vectors = []
    for word in tokens:
        if word in model.wv:
            vectors.append(model.wv[word])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [23]:
X_wv = np.array([sentence_vector(tokens, w2v) for tokens in log_data['Processed Text']])
y_wv = log_data['Status']

##### Fitting LogisticRegression() Model

In [24]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train_wv, X_test_wv, y_train_wv, y_test_wv = train_test_split(X_wv, y_wv, test_size=0.3, random_state = 2744)

In [25]:
# Fit a logistic regression with default settings on the Word2Vec sentence vectors.
logreg_wv = LogisticRegression()
logreg_wv.fit(X_train_wv, y_train_wv)

In [26]:
# Predict the label of every held-out message.
y_pred_wv = logreg_wv.predict(X_test_wv)

##### Evaluating Model

In [27]:
# Evaluate the Word2Vec model on the held-out split (true labels first, predictions second).
print('Accuracy Score        :', accuracy_score(y_test_wv, y_pred_wv))
print('Confusion Matrix      :\n', confusion_matrix(y_test_wv, y_pred_wv))
print('Classification Report :\n', classification_report(y_test_wv, y_pred_wv))

Accuracy Score        : 0.9007177033492823
Confusion Matrix      :
 [[1454   15]
 [ 151   52]]
Classification Report :
               precision    recall  f1-score   support

         ham       0.91      0.99      0.95      1469
        spam       0.78      0.26      0.39       203

    accuracy                           0.90      1672
   macro avg       0.84      0.62      0.67      1672
weighted avg       0.89      0.90      0.88      1672

