#                Feature Extraction from Dummy Data

### Creating some documents

In [1]:
%%writefile one.txt
This is a story about cats
our feline pets
Cats are furry animals

Writing one.txt


In [2]:
%%writefile two.txt
This is a story about surfing
Catching waves is fun
Surfing is a popular water sport

Writing two.txt


### Building a vocabulary

In [4]:
vocab = {}

i = 1

with open('one.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word in vocab:
        continue
    else:
        vocab[word] = i
        i += 1
print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}


In [5]:
with open('two.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word in vocab:
        continue
    else:
        vocab[word] = i
        i += 1
print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}


### Feature Extraction

In [7]:
one = ['one.txt'] + [0] * len(vocab)
one

['one.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [8]:
with open('one.txt') as f:
    x = f.read().lower().split()

for word in x:
    one[vocab[word]] += 1
one

['one.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [9]:
two = ['two.txt'] + [0] * len(vocab)
two

['two.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [10]:
with open('two.txt') as f:
    x = f.read().lower().split()

for word in x:
    two[vocab[word]] += 1
two

['two.txt', 1, 3, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]

In [11]:
print(f'{one}\n{two}')

['one.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
['two.txt', 1, 3, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]


# Feature Extraction from real dataset

### Importing libraries
### Loading the dataset

In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('smsspamcollection.tsv', sep='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [4]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

### Train, test & split the dataset

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X = df['message']
y = df['label']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Count vectorization

#### Two ways
#### 1. count vector -> TF-iDF transform

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
count_vect = CountVectorizer()

In [10]:
X_train_counts = count_vect.fit_transform(X_train)

In [24]:
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [25]:
X_train.shape

(3733,)

In [26]:
X_train_counts.shape

(3733, 7082)

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

In [28]:
tfidf_tranformer = TfidfTransformer()

In [29]:
X_train_tfidf = tfidf_tranformer.fit_transform(X_train_counts)

In [30]:
X_train_tfidf.shape

(3733, 7082)

#### 2. TF-iDF vector

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
vectorizer = TfidfVectorizer()

In [33]:
X_train_tfidf = vectorizer.fit_transform(X_train)

In [15]:
from sklearn.svm import LinearSVC

In [35]:
clf = LinearSVC()

In [36]:
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

#### TF-IDF vectorization both on train & test set in one call

In [11]:
from sklearn.pipeline import Pipeline

In [16]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [17]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [18]:
predictions = text_clf.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix, classification_report

In [20]:
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [21]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [22]:
from sklearn import metrics

In [23]:
metrics.accuracy_score(y_test, predictions)

0.989668297988037

In [25]:
text_clf.predict(["Congrats! you have won a lottery! give some money and take it"])

array(['ham'], dtype=object)