### Načteme data

In [1]:
import numpy as np
import os

# Relative directory that holds the dataset files (datasets/05/spam); built with
# os.path.join so the path separator matches the host OS.
PATH = os.path.join("datasets", "05", "spam")

In [2]:
import pandas as pd

def load_data(file, path=PATH, encoding='latin-1'):
    """Read a CSV file located under ``path`` into a pandas DataFrame.

    Args:
        file: Name of the CSV file, relative to ``path``.
        path: Directory that contains the file; defaults to ``PATH``.
        encoding: Text encoding forwarded to ``pd.read_csv``.

    Returns:
        The DataFrame returned by ``pd.read_csv``.
    """
    return pd.read_csv(os.path.join(path, file), encoding=encoding)


In [3]:
data = load_data("spam.csv")

### Prozkoumáme a předpřipravíme data

In [4]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Odstraníme zbytečné sloupečky a přejmenujeme užitečné

In [6]:
# Keep only the label and message columns and give them readable names; the
# three "Unnamed" columns are almost entirely null (see data.info() above).
# errors="ignore" makes this cell idempotent: because it overwrites `data`,
# re-running it used to raise KeyError once the columns were already gone.
# (rename already skips missing keys silently.)
data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "text"})
)

In [7]:
data.groupby("label").describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


Cílové hodnoty

In [8]:
data["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

Cílová třída jako číselný (nebo bool) atribut

In [9]:
# Binary target column: 1 where the label is "spam", 0 otherwise.
is_spam = data["label"].eq("spam")
data["spam"] = is_spam.astype(int)

### Rozdělíme dataset na trénovací a testovací

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows for the final test; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): the classes are imbalanced (747 spam vs 4825 ham), so passing
# stratify=data["spam"] may be worth considering to keep the class ratio the
# same in both splits -- confirm first, as it changes every downstream score.
train, test = train_test_split(data, test_size=0.2, random_state=42)

### Převod textu na vektor

Klíčovým problémem je, jak převést text na reprezentaci, kterou můžeme použít pro ML modely.
Základní způsoby:
- bag of words
- TF-IDF
- word embeddings

**Ukážeme si bag of words**

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
bow = CountVectorizer()

In [13]:
# Learn the vocabulary from the training messages only (the test split is not used here).
bow.fit(train["text"])

CountVectorizer()

In [14]:
len(bow.vocabulary_)

7735

In [15]:
bow.vocabulary_

{'no': 4773,
 'in': 3637,
 'the': 6786,
 'same': 5872,
 'boat': 1415,
 'still': 6455,
 'here': 3416,
 'at': 1105,
 'my': 4646,
 'moms': 4549,
 'check': 1758,
 'me': 4390,
 'out': 4988,
 'on': 4912,
 'yo': 7674,
 'half': 3308,
 'naked': 4661,
 'bank': 1218,
 'of': 4869,
 'granite': 3237,
 'issues': 3749,
 'strong': 6494,
 'buy': 1580,
 'explosive': 2741,
 'pick': 5178,
 'for': 2954,
 'our': 4986,
 'members': 4423,
 'up': 7152,
 'over': 5003,
 '300': 419,
 'nasdaq': 4675,
 'symbol': 6639,
 'cdgt': 1701,
 'that': 6781,
 'is': 3739,
 '00': 0,
 'per': 5126,
 'they': 6809,
 'giving': 3157,
 'second': 5948,
 'chance': 1726,
 'to': 6906,
 'rahul': 5526,
 'dengra': 2246,
 'played': 5221,
 'smash': 6217,
 'bros': 1522,
 'lt': 4234,
 'gt': 3271,
 'religiously': 5661,
 'private': 5399,
 'your': 7685,
 '2003': 345,
 'account': 783,
 'statement': 6429,
 '07973788240': 45,
 'shows': 6108,
 '800': 630,
 'un': 7099,
 'redeemed': 5630,
 'points': 5270,
 'call': 1608,
 '08715203649': 122,
 'identifier': 

In [16]:
# Label-based lookup: fetch the message whose original row label is 0.
# This relies on that row having landed in the training split.
sample = train.loc[0, "text"]

In [17]:
sample

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [18]:
vector = bow.transform([sample])
vector

<1x7735 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [19]:
print(vector)

  (0, 943)	1
  (0, 1145)	1
  (0, 1549)	1
  (0, 1551)	1
  (0, 1821)	1
  (0, 2067)	1
  (0, 3169)	1
  (0, 3211)	1
  (0, 3247)	1
  (0, 3637)	1
  (0, 3871)	1
  (0, 3976)	1
  (0, 4924)	1
  (0, 5269)	1
  (0, 6803)	1
  (0, 7150)	1
  (0, 7366)	1
  (0, 7565)	1


In [20]:
# Map the column indices of the non-zero entries back to their tokens.
# `vocabulary_` maps token -> column index, so invert it once up front. This
# avoids `get_feature_names()`, which is deprecated and removed in
# scikit-learn 1.2, and which was being rebuilt on every loop iteration.
index_to_token = {index: token for token, index in bow.vocabulary_.items()}
for col in vector.nonzero()[1]:
    print(index_to_token[col])

amore
available
buffet
bugis
cine
crazy
go
got
great
in
jurong
la
only
point
there
until
wat
world


### Nachystáme Pipeline pro zpracování dat

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [60]:
# Apply a bag-of-words CountVectorizer to the 'text' column of the input frame.
text_bow = ('text', CountVectorizer(), 'text')
bow_transformer = ColumnTransformer(transformers=[text_bow])

In [61]:
# Single-step preprocessing pipeline: bag-of-words vectorization of the text column.
pipeline = Pipeline(steps=[('bow', bow_transformer)])

### Připravíme si data pro model

In [24]:
X_train = pipeline.fit_transform(train)

In [25]:
X_train

<4457x7735 sparse matrix of type '<class 'numpy.int64'>'
	with 58978 stored elements in Compressed Sparse Row format>

In [26]:
y_train = train["spam"]

### Natrénujeme první model

In [27]:
from sklearn.svm import SVC

# Support-vector classifier with an RBF kernel, trained on the bag-of-words counts.
svc = SVC(kernel="rbf")
svc.fit(X_train, y_train)

SVC()

In [28]:
from sklearn.model_selection import cross_val_score

In [29]:
# Mean 5-fold cross-validated score using the estimator's default scorer
# (accuracy for SVC).
# NOTE(review): X_train comes from pipeline.fit_transform(train), so the
# vocabulary was learned from all training rows, including each fold's
# validation part. The effect is probably small here, but the
# Pipeline-inside-GridSearchCV cell further down is the leak-free way to
# cross-validate.
cross_val_score(svc, X_train, y_train, cv=5).mean()

0.9771129330172679

In [30]:
cross_val_score(svc, X_train, y_train, cv=5, scoring="f1").mean()

0.9064022861105576

**Ověříme na testovacích datech**

In [31]:
svc.fit(X_train, y_train)

SVC()

In [32]:
# Vectorize the test messages with the pipeline fitted on the training data
# (transform only -- nothing is re-fitted on the test split).
X_test = pipeline.transform(test)
y_test = test["spam"]

# Predict the spam (1) / ham (0) target for the held-out messages.
y_pred = svc.predict(X_test)

In [33]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [34]:
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy is symmetric, so the value is unchanged; the order is fixed so the
# pattern stays correct when copied to asymmetric metrics (precision, recall).
accuracy_score(y_test, y_pred)

0.979372197309417

In [35]:
# scikit-learn metrics take the ground truth first: f1_score(y_true, y_pred).
# Swapping the arguments swaps precision and recall; binary F1 happens to be
# unaffected, but the conventional order avoids surprises with other metrics.
f1_score(y_test, y_pred)

0.9169675090252708

### Vyzkoušíme přidat TF-IDF

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer

In [37]:
# Same definition as the earlier bow_transformer cell; creates a new, unfitted
# transformer instance for the TF-IDF pipeline below.
bow_transformer = ColumnTransformer([
    ('text', CountVectorizer(), 'text')
])

In [38]:
# Bag-of-words counts followed by TF-IDF re-weighting of those counts.
pipeline = Pipeline([
    ('bow', bow_transformer),
    ('tfidf', TfidfTransformer()),
])

In [39]:
X_train = pipeline.fit_transform(train)

In [40]:
X_train

<4457x7735 sparse matrix of type '<class 'numpy.float64'>'
	with 58978 stored elements in Compressed Sparse Row format>

In [41]:
# Print the TF-IDF weight of every non-zero term in the first training row.
first_row = X_train[0]
for col in first_row.nonzero()[1]:
    print(first_row[0, col])

0.2780580098469636
0.12427677819754165
0.21083615486777574
0.26710534096823246
0.1903904544048287
0.15957326527314455
0.17659786192165677
0.3714150879090969
0.14901928724105368
0.3610761139104893
0.14278469554070108
0.1356900314797867
0.22465058036136068
0.2780580098469636
0.2736419760203263
0.3714150879090969
0.16911106738950407


In [42]:
cross_val_score(svc, X_train, y_train, cv=5).mean()

0.9746450554372826

In [43]:
cross_val_score(svc, X_train, y_train, cv=5, scoring="f1").mean()

0.8956291186826174

**Ověříme na testovacích datech**

In [44]:
# Vectorize the held-out messages with the already-fitted TF-IDF pipeline and
# pull out their targets, then refit the SVC on the TF-IDF training features
# and predict the test split.
X_test = pipeline.transform(test)
y_test = test["spam"]

y_pred = svc.fit(X_train, y_train).predict(X_test)

In [45]:
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy is symmetric, so the value is unchanged; the order is fixed so the
# pattern stays correct when copied to asymmetric metrics (precision, recall).
accuracy_score(y_test, y_pred)

0.9820627802690582

In [46]:
# scikit-learn metrics take the ground truth first: f1_score(y_true, y_pred).
# Swapping the arguments swaps precision and recall; binary F1 happens to be
# unaffected, but the conventional order avoids surprises with other metrics.
f1_score(y_test, y_pred)

0.9285714285714286

### Vyzkoušíme různé varianty převodu textu

In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# Search space over the text-vectorization options. Keys use the
# "<pipeline step>__<parameter>" convention, so 'bow__...' targets the
# CountVectorizer step and 'tfidf__...' the TfidfTransformer step.
# 2 * 2 * 2 = 8 candidate combinations.
parameters = [
   {'tfidf__use_idf': (True, False), 'bow__stop_words': (None, 'english'), 'bow__ngram_range': ((1,1), (1,3))} 
]

# Full pipeline from raw text to classifier, so the vectorizer is re-fitted
# inside every cross-validation fold.
grid_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC())
])

# 5-fold grid search that selects the combination with the best F1 score.
grid = GridSearchCV(grid_pipeline, parameters, cv=5, scoring='f1')

In [74]:
grid.fit(train["text"], y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('bow', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('svc', SVC())]),
             param_grid=[{'bow__ngram_range': ((1, 1), (1, 3)),
                          'bow__stop_words': (None, 'english'),
                          'tfidf__use_idf': (True, False)}],
             scoring='f1')

In [75]:
grid.best_params_

{'bow__ngram_range': (1, 3), 'bow__stop_words': None, 'tfidf__use_idf': False}

In [76]:
grid.best_score_

0.9330436762212722

### Upravíme pipeline podle nejlepších parametrů

In [90]:
# Rebuild the preprocessing steps with the values reported by grid.best_params_
# above (1- to 3-grams, no stop-word removal, IDF weighting disabled) and
# re-vectorize the training messages with them.
text_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words=None)
tf_scaler = TfidfTransformer(use_idf=False)
grid_pipeline = Pipeline(steps=[('bow', text_vectorizer), ('tfidf', tf_scaler)])

X_train = grid_pipeline.fit_transform(train["text"])

### Vyzkoušíme grid search pro nalezení vhodných hyperparametrů SVM

In [104]:
# Two sub-grids: linear/rbf kernels (2 * 2 * 4 = 16 candidates) and polynomial
# kernels with a degree sweep (1 * 2 * 4 * 4 = 32 candidates), 48 in total.
# NOTE(review): per the scikit-learn docs gamma only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so each linear candidate is presumably fitted
# twice with identical results -- consider moving 'linear' into its own dict
# without 'gamma' to save fits (this would change the reported candidate count).
parameters = [
    {'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto'], 'C': [1, 3, 5, 10],},
    {'kernel': ['poly'], 'gamma': ['scale', 'auto'], 'degree':[1, 3, 5, 10], 'C': [1, 3, 5, 10],},
]

# 5-fold grid search scored by F1, run in parallel (n_jobs=-1).
# probability=True is presumably set because the best estimator is later
# reused in the soft-voting ensemble -- confirm; it makes every fit slower.
grid = GridSearchCV(SVC(probability=True), parameters, cv=5, scoring='f1', return_train_score=True, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


GridSearchCV(cv=5, estimator=SVC(probability=True), n_jobs=-1,
             param_grid=[{'C': [1, 3, 5, 10], 'gamma': ['scale', 'auto'],
                          'kernel': ['linear', 'rbf']},
                         {'C': [1, 3, 5, 10], 'degree': [1, 3, 5, 10],
                          'gamma': ['scale', 'auto'], 'kernel': ['poly']}],
             return_train_score=True, scoring='f1', verbose=1)

In [109]:
grid.best_params_

{'C': 3, 'gamma': 'scale', 'kernel': 'linear'}

In [110]:
grid.best_score_

0.9296113485075829

In [111]:
svc = grid.best_estimator_

### Vyzkoušíme další klasifikátory

In [99]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest -- and therefore this score and the voting
# ensemble that reuses `forest` -- is reproducible across runs, matching the
# random_state=42 used for the train/test split. cv=5 is spelled out for
# consistency with the other cross-validation cells.
# The displayed score may differ slightly from earlier, unseeded runs.
forest = RandomForestClassifier(random_state=42)
cross_val_score(forest, X_train, y_train, cv=5, scoring='f1').mean()

0.8358550325796867

In [101]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, scored by
# cross-validated F1 on the training features.
logreg = LogisticRegression()
logreg_f1_scores = cross_val_score(logreg, X_train, y_train, scoring='f1')
logreg_f1_scores.mean()

0.7613728787746228

**Ensemble metoda - Voting classifier**

In [112]:
from sklearn.ensemble import VotingClassifier
# Combine the three classifiers; with voting='soft' the ensemble predicts from
# the averaged class probabilities of its members.
base_models = [('lr', logreg), ('rf', forest), ('svc', svc)]
voting = VotingClassifier(estimators=base_models, voting='soft')

In [113]:
cross_val_score(voting, X_train, y_train, cv=5, scoring='f1').mean()

0.9204107497264733