In [1]:
import numpy as np
import pandas as pd
import string
import nltk
import itertools
from wordcloud import WordCloud

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


from sklearn.ensemble import RandomForestClassifier

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
spam_dataset = pd.read_csv('spam.csv', encoding = "ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'],
                           skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])

In [4]:
def remove_puncation(text):
    cleaned = ''.join([word for word in text if word not in string.punctuation])
    return cleaned
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_puncation(x))

def tokenize(text):

    # Usunięcie wielkich liter
    clean_text = text.lower()

    # Tokenizacja
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text

spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))

stopwords = nltk.corpus.stopwords.words("english")

def remove_stopwords(text):
    without_stopwords = [word for word in text if word not in stopwords]
    return without_stopwords
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))

stemmer = nltk.PorterStemmer()
def stemming(text):
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: stemming(x))

lemmater = nltk.WordNetLemmatizer()
def lemmatizing(text):
    lemmatized_words = [lemmater.lemmatize(word) for word in text]
    return lemmatized_words
spam_dataset['Lemmatized_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: lemmatizing(x))

In [5]:
words_spam = list(spam_dataset.loc[spam_dataset['Spam']==1, 'Lemmatized_Text'].values)
words_spam = list(itertools.chain.from_iterable(words_spam))
words_spam = ' '.join(words_spam)
words_notspam = list(spam_dataset.loc[spam_dataset['Spam']==0, 'Lemmatized_Text'].values)
words_notspam = list(itertools.chain.from_iterable(words_notspam))
words_notspam = ' '.join(words_notspam)

In [6]:
metrics_dataframe = pd.DataFrame(columns = ['Model', 'F1_score', 'AUC'])
metrics_dataframe
models = []
models_names = []
predictions_proba_list = []

def calculate_metrics(model, name, X_checked, y_checked):
    models.append(model)
    models_names.append(name)
    global metrics_dataframe
    predictions = model.predict(X_checked)
    predictions_proba = model.predict_proba(X_checked)
    predictions_proba_list.append(predictions_proba[:,1])

    ############## metryki dla sprawdzanego modelu ################

    # Precision, Recall, F1, Accuracy
    print(classification_report(y_checked, predictions))

    # Confusion matrix
    plt.figure()
    cm = confusion_matrix(y_checked, predictions)
    ax = sns.heatmap(cm, annot=True, cmap='Blues', fmt='.0f')
    ax.set_title('Confusion Matrix\n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')
    plt.show()

    # plot ROC curve
    fig = plt.figure(figsize=(6, 6))
    plt.plot([0, 1], [0, 1], 'k--')
    for model_selected, name_selected, pred_proba in zip(models, models_names, predictions_proba_list):
        fpr, tpr, thresholds = roc_curve(y_checked, pred_proba)
        plt.plot(fpr, tpr, label=name_selected)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

    f1_metric = f1_score(y_checked, predictions)
    auc_metric = roc_auc_score(y_checked, predictions_proba[:,1])
    metrics_dataframe = metrics_dataframe.append({'Model': name, 'F1_score': f1_metric, 'AUC': auc_metric},
                                                 ignore_index=True)
    return metrics_dataframe

# ZAMIANA TEKSTU W WEKTOR (TF-IDF)

In [7]:
X = spam_dataset['Lemmatized_Text'].apply(lambda x: ' '.join(x))
y = spam_dataset['Spam']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [33]:
vectorizer = TfidfVectorizer()
X_train_forest = vectorizer.fit_transform(X_train)

forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
forest = forest.fit(X_train_forest, y_train)

In [34]:
feature_names = vectorizer.get_feature_names_out()
feature_names.size

7573

In [35]:
forest.feature_importances_.size

7573

In [36]:
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame({'Feature_names':feature_names, 'Importances': forest.feature_importances_})
df_imp = df[df["Importances"] > 0.001]
feature = df['Feature_names']
feature

0       008704050406
1             0089my
2               0121
3        01223585236
4        01223585334
            ...     
7568             ûªt
7569            ûªve
7570              ûï
7571         ûïharry
7572              ûò
Name: Feature_names, Length: 7573, dtype: object

In [12]:
X_train

4747    beauty life next second hide thousand secret w...
5295                                    alex say he ok ok
5568                          ì b going esplanade fr home
4654    prasanth ettans mother passed away last night ...
1133    entered cabin pa said happy bday bos felt spec...
                              ...                        
2329            surfing online store offer want buy thing
1932                            jus finished avatar nigro
5316                             jus finish watching tv u
3215    urgent trying contact u today draw show å£2000...
763     nothing jus tot u would ask co u ba gua went m...
Name: Lemmatized_Text, Length: 4179, dtype: object

In [13]:
X_counts = vectorizer.fit_transform(X_train)

bow_df = pd.DataFrame(
    X_counts.toarray(), columns=feature_names, index=X_train
)
X_imp = bow_df[feature_names]

In [14]:
model_pipeline = Pipeline(steps=[('model', RandomForestClassifier(n_estimators=1000, n_jobs=-1))])

params = {'model__max_depth': [3, 5, 10, 20],
          'model__min_samples_leaf': [3, 5, 10, 15]}

grid_search = GridSearchCV(model_pipeline, params, cv=5, n_jobs=-1, verbose=10, scoring='f1_macro')

grid_search.fit(X_imp, y_train)

print('Wybrane hiperparametry: ', grid_search.best_params_)
model_pipeline_v1 = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Wybrane hiperparametry:  {'model__max_depth': 20, 'model__min_samples_leaf': 3}


In [15]:
X_counts_test = vectorizer.fit_transform(X_test)
#feature_names = vectorizer.get_feature_names_out()
#print(feature_names.shape)
print(X_counts_test.shape)
print(X_counts_test.toarray().shape)
print(X_test.shape)
X_test.explode()

(1393, 3821)
(1393, 3821)
(1393,)


3722                                 left already orchard
3062    hi babe jordan r u im home abroad lonely text ...
472     nothing meant money enters account bank remove...
4829    word checkmate chess come persian phrase shah ...
5371                                lol oh got friend dog
                              ...                        
2412    dont know u u dont know send chat 86688 let fi...
2326    xmas story peace xmas msg love xmas miracle je...
1224    winner u specially selected 2 receive å£1000 c...
3148                         sorry meeting ill call later
1481                            im guy browsin compulsory
Name: Lemmatized_Text, Length: 1393, dtype: object

In [17]:
feature_names = vectorizer.get_feature_names_out()

In [18]:
bow_df_test = pd.DataFrame(
    X_counts_test.toarray(), columns=feature_names, index=X_test.explode()
)
X_imp_test = bow_df_test[feature_names]

In [19]:
from sklearn.model_selection import cross_validate
results = pd.DataFrame(cross_validate(model_pipeline_v1, X_imp_test, y_test, return_train_score=True))
results

Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.680809,0.171844,0.903226,0.938061
1,1.691939,0.149275,0.935484,0.93447
2,1.679478,0.154222,0.9319,0.93447
3,1.463166,0.158978,0.913669,0.93991
4,1.665741,0.155754,0.917266,0.939013
