# Module 6: Preprocessing Categorical Variables

### Categorical Variables: Ordinal Encoding

In [1]:
import pandas as pd

In [3]:
X_toy = pd.DataFrame(['Good', 'Bad', 'Good', 'Good', 'Bad', 'Neutral', 'Good', 'Good', 'Neutral', 'Neutral', 'Neutral', 'Good', 'Bad', 'Good'],
                     columns = ['rating'])
X_toy

Unnamed: 0,rating
0,Good
1,Bad
2,Good
3,Good
4,Bad
5,Neutral
6,Good
7,Good
8,Neutral
9,Neutral


In [4]:
pd.DataFrame(X_toy['rating'].value_counts()).rename(columns={'rating': 'frequency'}).T

Unnamed: 0,Good,Neutral,Bad
frequency,7,4,3


In [5]:
from sklearn.preprocessing import OrdinalEncoder

In [6]:
oe = OrdinalEncoder(dtype=int)
oe.fit(X_toy);
X_toy_ord = oe.transform(X_toy)
X_toy_ord

array([[1],
       [0],
       [1],
       [1],
       [0],
       [2],
       [1],
       [1],
       [2],
       [2],
       [2],
       [1],
       [0],
       [1]])

In [7]:
encoding_view = X_toy.assign(language_enc=X_toy_ord)
encoding_view

Unnamed: 0,rating,language_enc
0,Good,1
1,Bad,0
2,Good,1
3,Good,1
4,Bad,0
5,Neutral,2
6,Good,1
7,Good,1
8,Neutral,2
9,Neutral,2


In [10]:
ratings_order = ['Bad', 'Neutral', 'Good']

In [11]:
oe = OrdinalEncoder(categories = [ratings_order], dtype=int)
oe.fit(X_toy);
X_toy_ord = oe.transform(X_toy)
X_toy_ord

array([[2],
       [0],
       [2],
       [2],
       [0],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [2],
       [0],
       [2]])

In [12]:
encoding_view =X_toy.assign(language_enc=X_toy)
encoding_view

Unnamed: 0,rating,language_enc
0,Good,Good
1,Bad,Bad
2,Good,Good
3,Good,Good
4,Bad,Bad
5,Neutral,Neutral
6,Good,Good
7,Good,Good
8,Neutral,Neutral
9,Neutral,Neutral


In [14]:
X_toy = pd.DataFrame(data = ['English', 'Vietnamese', 'English', 'Mandarin', 'English', 'English', 'Mandarin', 'English', 'Vietnamese', 'Mandarin', 'French', 'Spanish', 'Mandarin', 'Hindi'], columns = ['language'])
X_toy

Unnamed: 0,language
0,English
1,Vietnamese
2,English
3,Mandarin
4,English
5,English
6,Mandarin
7,English
8,Vietnamese
9,Mandarin


In [15]:
pd.DataFrame(X_toy['language'].value_counts()).rename(columns={'language':'frequency'}).T

Unnamed: 0,English,Mandarin,Vietnamese,French,Spanish,Hindi
frequency,5,4,2,1,1,1


In [16]:
oe = OrdinalEncoder(dtype=int)
oe.fit(X_toy);
X_toy_ord = oe.transform(X_toy)
X_toy_ord

encoding_view = X_toy.assign(language_enc=X_toy_ord)
encoding_view

Unnamed: 0,language,language_enc
0,English,0
1,Vietnamese,5
2,English,0
3,Mandarin,3
4,English,0
5,English,0
6,Mandarin,3
7,English,0
8,Vietnamese,5
9,Mandarin,3


Ordinal Encoding is only appropriate when you have some natural ordering for the column (not for languages for examples).

### One-Hot Encoding

In [17]:
X_toy

Unnamed: 0,language
0,English
1,Vietnamese
2,English
3,Mandarin
4,English
5,English
6,Mandarin
7,English
8,Vietnamese
9,Mandarin


In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
ohe = OneHotEncoder(sparse=False, dtype='int')
ohe.fit(X_toy);
X_toy_ohe = ohe.transform(X_toy)
X_toy_ohe



array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0]])

In [22]:
pd.DataFrame(
    data = X_toy_ohe,
    #columns=ohe.get_feature_names(['language']),
    columns = ['English', 'French', 'Hindi', 'Mandarin', 'Spanish', 'Vietnamese'],
    index=X_toy.index)

Unnamed: 0,English,French,Hindi,Mandarin,Spanish,Vietnamese
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,1,0,0,0,0,0
3,0,0,0,1,0,0
4,1,0,0,0,0,0
5,1,0,0,0,0,0
6,0,0,0,1,0,0
7,1,0,0,0,0,0
8,0,0,0,0,0,1
9,0,0,0,1,0,0


In [28]:
from sklearn.model_selection import train_test_split

housing_df = pd.read_csv("data/housing.csv")
train_df, test_df = train_test_split(housing_df, test_size=0.1, random_state=123)

X_train = train_df.drop(columns=["median_house_value"])
y_train = train_df["median_house_value"]

X_test = test_df.drop(columns=["median_house_value"])
y_test = test_df["median_house_value"]

In [29]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6051,-117.75,34.04,22.0,2948.0,636.0,2600.0,602.0,3.125,INLAND
20113,-119.57,37.94,17.0,346.0,130.0,51.0,20.0,3.4861,INLAND
14289,-117.13,32.74,46.0,3355.0,768.0,1457.0,708.0,2.6604,NEAR OCEAN
13665,-117.31,34.02,18.0,1634.0,274.0,899.0,285.0,5.2139,INLAND
14471,-117.23,32.88,18.0,5566.0,1465.0,6303.0,1458.0,1.858,NEAR OCEAN


In [30]:
X_train['ocean_proximity'].unique()

array(['INLAND', 'NEAR OCEAN', '<1H OCEAN', 'NEAR BAY', 'ISLAND'],
      dtype=object)

In [31]:
ohe = OneHotEncoder(sparse=False, dtype='int')
ohe.fit(X_train[["ocean_proximity"]])



In [32]:
X_imp_ohe_train = ohe.transform(X_train[["ocean_proximity"]])
X_imp_ohe_train

array([[0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0]])

In [33]:
transformed_ohe = pd.DataFrame(
    data = X_imp_ohe_train,
    columns = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
    index=X_train.index)

transformed_ohe.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
6051,0,1,0,0,0
20113,0,1,0,0,0
14289,0,0,0,0,1
13665,0,1,0,0,0
14471,0,0,0,0,1


### Column Transformer

Before we fit our model, we want to apply different transformations on different columns.

**Numeric columns:**
* imputation
* scaling

**Categorical columns:**
* imputation
* one-hot encoding

In [34]:
from sklearn.compose import ColumnTransformer

In [35]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6051,-117.75,34.04,22.0,2948.0,636.0,2600.0,602.0,3.125,INLAND
20113,-119.57,37.94,17.0,346.0,130.0,51.0,20.0,3.4861,INLAND
14289,-117.13,32.74,46.0,3355.0,768.0,1457.0,708.0,2.6604,NEAR OCEAN
13665,-117.31,34.02,18.0,1634.0,274.0,899.0,285.0,5.2139,INLAND
14471,-117.23,32.88,18.0,5566.0,1465.0,6303.0,1458.0,1.858,NEAR OCEAN


In [36]:
X_train.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
ocean_proximity        object
dtype: object

In [51]:
numeric_features = ["longitude", 
                    "latitude",
                    "housing_median_age",
                    "households",
                    "median_income",
                    "total_rooms",
                    "total_bedrooms",
                    "population"]

categorical_features = ["ocean_proximity"]

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate

In [52]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           ("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
           ("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [53]:
col_transformer = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

If we do not specify remainder="passthrough" as an argument in ColumnTransformer, the columns not being transformed will be dropped .

In [54]:
col_transformer.fit(X_train)

In [56]:
x = list(X_train.columns.values)
del x[5]
X_train_pp = col_transformer.transform(X_train)
#pd.DataFrame(X_train_pp, columns=(x + list(col_transformer.named_transformers_["categorical"].named_steps["onehot"].get_feature_names

In [60]:
#onehot_cols = col_transformer.named_transformers_["categorical"].named_steps["onehot"].get_feature_names(categorical_features)
#columns = numeric_features + list(onehot_cols)

In [62]:
main_pipe = Pipeline(
    steps = [
        ("preprocessor", col_transformer),
        ("reg", KNeighborsRegressor())])

In [65]:
with_categorical_scores = cross_validate(main_pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(with_categorical_scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.14699,0.158743,0.722058,0.811359
1,0.042035,0.149229,0.724976,0.810671
2,0.041575,0.141985,0.726172,0.807567
3,0.04303,0.14142,0.699779,0.815426
4,0.044731,0.184805,0.698182,0.813712


In [66]:
pipe = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("reg", KNeighborsRegressor())])

pipe.fit(X_train.drop(columns=['ocean_proximity']), y_train);

In [67]:
no_categorical_scores = cross_validate(pipe, X_train.drop(columns=['ocean_proximity']), y_train, return_train_score=True)
pd.DataFrame(no_categorical_scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.123818,0.228444,0.715453,0.805028
1,0.04007,0.131685,0.706362,0.806376
2,0.027077,0.105783,0.71371,0.804266
3,0.026655,0.085673,0.702028,0.80703
4,0.027041,0.087088,0.696488,0.80814


In [68]:
pd.DataFrame(no_categorical_scores).mean()

fit_time       0.048932
score_time     0.127735
test_score     0.706808
train_score    0.806168
dtype: float64

In [69]:
pd.DataFrame(with_categorical_scores).mean()

fit_time       0.063672
score_time     0.155236
test_score     0.714234
train_score    0.811747
dtype: float64

### Make - Pipelines & Column Transformers

In [70]:
adult = pd.read_csv("data/adult.csv")

In [72]:
train_df, test_df = train_test_split(adult, test_size=0.2, random_state=42)
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
5514,26,Private,256263,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,25,United-States,<=50K
19777,24,Private,170277,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,35,United-States,<=50K
10781,36,Private,75826,Bachelors,13,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32240,22,State-gov,24395,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,<=50K
9876,31,Local-gov,356689,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K


In [73]:
X_train = train_df.drop(columns=['income'])
y_train = train_df['income']

X_test = test_df.drop(columns=['income'])
y_test = test_df['income']

In [108]:
numeric_features = [
    "age",
    "fnlwgt",
    "education.num",
    "capital.gain",
    "capital.loss",
    "house.per.week"
]

categorical_features = [
    "workclass",
    "education",
    "marital.status",
    "occupation",
    "relationship",
    "sex",
    "native.country"
]

In [75]:
from sklearn.svm import SVC

In [109]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           ("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
           ("onehot", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", SVC())])

In [110]:
model_pipeline = Pipeline(
    steps=[
        ("scaling", StandardScaler()),
        ("clf", SVC())])

In [111]:
from sklearn.pipeline import make_pipeline

model_pipeline = make_pipeline(
    StandardScaler(), SVC())

In [112]:
model_pipeline

In [113]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

categorical_transformer = make_pipeline(
    SimpleImputer(strategy = "constant", fill_value="missing"),
    OneHotEncoder()
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

pipe = make_pipeline(preprocessor, SVC())

**make_column_transformer**

In [114]:
from sklearn.compose import make_column_transformer

In [115]:
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

Running cross_validate results in error. To fix, use handle_unknown="ignore".

In [116]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

categorical_transformer = make_pipeline(
    SimpleImputer(strategy = "constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

pipe = make_pipeline(preprocessor, SVC())

In [120]:
#scores = cross_validate(pipe,X_train,y_train, cv=5, return_train_score=True)
#pd.DataFrame(scores).mean()

#### Cases where it's OK to break golden rule

* If it's some fixed number of categories

In [90]:
all_countries = adult["native.country"].unique()
all_countries

array(['United-States', '?', 'Mexico', 'Greece', 'Vietnam', 'China',
       'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada',
       'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran',
       'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba',
       'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic',
       'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala',
       'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland',
       'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'],
      dtype=object)

In [91]:
ohe_cat = OneHotEncoder(categories=all_countries)

### Handeling Categorical Features: Binary, Ordinal and More

In [92]:
train_df["education"].unique()

array(['HS-grad', 'Bachelors', 'Some-college', '11th', '5th-6th',
       'Assoc-voc', 'Masters', '9th', 'Doctorate', 'Prof-school',
       '7th-8th', '10th', '12th', '1st-4th', 'Assoc-acdm', 'Preschool'],
      dtype=object)

In [94]:
oe = OrdinalEncoder(dtype=int)
oe.fit(X_train[["education"]]);
ed_transformed = oe.transform(X_train[["education"]])
ed_transformed = pd.DataFrame(data=ed_transformed, columns=["education_enc"], index=X_train.index)
ed_transformed.head()

Unnamed: 0,education_enc
5514,11
19777,11
10781,9
32240,15
9876,9


In [95]:
ed_transformed['education_enc'].unique()

array([11,  9, 15,  1,  4,  8, 12,  6, 10, 14,  5,  0,  2,  3,  7, 13])

In [96]:
oe.categories_[-1]

array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
       'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
       'Masters', 'Preschool', 'Prof-school', 'Some-college'],
      dtype=object)

In [98]:
import numpy as np

In [99]:
pd.DataFrame(data=np.arange(len(oe.categories_[0])), columns=["transformed"], index=oe.categories_[0]).head(10)

Unnamed: 0,transformed
10th,0
11th,1
12th,2
1st-4th,3
5th-6th,4
7th-8th,5
9th,6
Assoc-acdm,7
Assoc-voc,8
Bachelors,9


In [100]:
train_df["education"].unique()

array(['HS-grad', 'Bachelors', 'Some-college', '11th', '5th-6th',
       'Assoc-voc', 'Masters', '9th', 'Doctorate', 'Prof-school',
       '7th-8th', '10th', '12th', '1st-4th', 'Assoc-acdm', 'Preschool'],
      dtype=object)

In [101]:
education_levels = ['Preschool', '1st-4th', '5th-6th', '7th-8th',
                    '9th', '10th', '11th', '12th', 'HS-grad',
                    'Prof-school', 'Assoc-voc', 'Assoc-acdm',
                    'Some-college', 'Bachelors', 'Masters', 'Doctorate']

In [102]:
assert set(education_levels) == set(train_df["education"].unique())

In [103]:
oe = OrdinalEncoder(categories=[education_levels], dtype=int)
oe.fit(X_train[["education"]]);
ed_transformed = oe.transform(X_train[["education"]])
ed_transformed = pd.DataFrame(data=ed_transformed, columns=["education_enc"], index=X_train.index)
oe.categories_

[array(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th',
        '11th', '12th', 'HS-grad', 'Prof-school', 'Assoc-voc',
        'Assoc-acdm', 'Some-college', 'Bachelors', 'Masters', 'Doctorate'],
       dtype=object)]

In [104]:
pd.DataFrame(data=np.arange(len(oe.categories_[0])), columns=["transformed"], index=oe.categories_[0]).head(10)

Unnamed: 0,transformed
Preschool,0
1st-4th,1
5th-6th,2
7th-8th,3
9th,4
10th,5
11th,6
12th,7
HS-grad,8
Prof-school,9


In [106]:
numeric_features = ['age',
 'fnlwgt',
 'capital.gain',
 'capital.loss',
 'house.per.week']

In [119]:
categorical_features = ['workclass',
 'marital.status',
 'occupation',
 'relationship',
 'sex',
 'native.country']

ordinal_features = ["education"]
target_column = 'income'

In [122]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

categorical_transformer = make_pipeline(
    SimpleImputer(strategy = "constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

ordinal_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder(categories=[education_levels], dtype=int))

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    (ordinal_transformer, ordinal_features))

pipe = make_pipeline(preprocessor, SVC())

In [124]:
#scores = cross_validate(pipe, X_train, y_train, return_train_score=True)

In [125]:
X_train['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [126]:
ohe = OneHotEncoder(sparse=False, dtype=int)
ohe.fit(X_train[["sex"]])



In [128]:
#ohe_df = pd.DataFrame(data=ohe.transform(X_train[["sex"]]), columns=ohe.get_feature_names(["sex"]), index=X_train.index)
ohe_df = pd.DataFrame(data=ohe.transform(X_train[["sex"]]), columns=['female', 'male'], index=X_train.index)
ohe_df

Unnamed: 0,female,male
5514,0,1
19777,1,0
10781,1,0
32240,1,0
9876,0,1
...,...,...
29802,0,1
5390,0,1
860,0,1
15795,0,1


In [129]:
ohe = OneHotEncoder(sparse=False, dtype=int, drop="if_binary")
ohe.fit(X_train[["sex"]])



In [130]:
ohe_df = pd.DataFrame(data=ohe.transform(X_train[["sex"]]), columns=['male'], index=X_train.index)
ohe_df

Unnamed: 0,male
5514,1
19777,0
10781,0
32240,0
9876,1
...,...
29802,1
5390,1
860,1
15795,1


In [131]:
numeric_features = ['age',
 'fnlwgt',
 'capital.gain',
 'capital.loss',
 'house.per.week']

categorical_features = ['workclass',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'native.country']

ordinal_features = ["education"]
binary_features = ['sex']
target_column = 'income'

In [133]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

categorical_transformer = make_pipeline(
    SimpleImputer(strategy = "constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

ordinal_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder(categories=[education_levels], dtype=int))

binary_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(drop="if_binary", dtype=int))    

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    (ordinal_transformer, ordinal_features),
    (binary_transformer, binary_features))

pipe = make_pipeline(preprocessor, SVC())

In [135]:
#scores = cross_validate(pipe,X_train,y_train,return_train_score=True)

In [136]:
X_train["native.country"].value_counts()

United-States                 23315
Mexico                          512
?                               474
Philippines                     165
Germany                         115
Canada                           97
El-Salvador                      92
Puerto-Rico                      85
India                            82
England                          74
Cuba                             70
South                            63
China                            63
Jamaica                          62
Italy                            60
Dominican-Republic               56
Columbia                         53
Vietnam                          53
Guatemala                        52
Japan                            44
Poland                           44
Taiwan                           41
Haiti                            36
Iran                             34
Portugal                         31
Nicaragua                        29
Greece                           28
Peru                        

### Text Data

In [137]:
X = [
    "URGENT! As a valued network customer you have been selected to receive a prize!",
    "Lol you are always so convincing.",
    "Nah I don't think he goes to usf",
    "URGENT! You have won a 1 week FREE membership Jackpot!",
    "Had your mobile 11 months or more? U R entitles to Update to latest color mobiles for Free!",
    "As per your request, 'Melle Melle' has been set as your callertune for all Callers"]

In [138]:
y = ["spam", "non spam", "non spam", "spam", "spam", "non spam"]

In [139]:
from sklearn.feature_extraction.text import CountVectorizer

In [141]:
vec = CountVectorizer()
X_counts = vec.fit_transform(X);
bow_df = pd.DataFrame(X_counts.toarray(), columns=sorted(vec.vocabulary_), index=X)
bow_df

Unnamed: 0,11,all,always,are,as,been,callers,callertune,color,convincing,...,think,to,update,urgent,usf,valued,week,won,you,your
URGENT! As a valued network customer you have been selected to receive a prize!,0,0,0,0,1,1,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
Lol you are always so convincing.,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
Nah I don't think he goes to usf,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
URGENT! You have won a 1 week FREE membership Jackpot!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,0
Had your mobile 11 months or more? U R entitles to Update to latest color mobiles for Free!,1,0,0,0,0,0,0,0,1,0,...,0,2,1,0,0,0,0,0,0,1
"As per your request, 'Melle Melle' has been set as your callertune for all Callers",0,1,0,0,2,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2


**Important hyperparameters of CountVectorizer**:

* `binary`: whether to use absence/presence feature values or counts
* `max_features`: only considers top `max_features` ordered by frequency in the corpus
* `max_df`: when building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
* `min_df`: when building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
* `ngram_range`: consider word sequences in the given range

In [143]:
param_grid = {"countvectorizer__max_features": range(1,1000)}

In [144]:
pipe = make_pipeline(CountVectorizer(), SVC())
pipe.fit(X,y);

In [145]:
pipe.predict(X)

array(['spam', 'non spam', 'non spam', 'spam', 'spam', 'non spam'],
      dtype='<U8')

In [146]:
pipe.score(X,y)

1.0

In [149]:
X_new = [
    "Congratulation! You have been awarded $1000!",
    "Mom, can you pick me up?",
    "I'm trying to bake a cake and forgot to put sugar in it.",
    "URGENT: please pick up your car at 2pm today",
    "Call 234950323 for a FREE consultation. It's your lucky day!"]
y_new = ["spam", "non spam", "non spam", "non spam", "spam"]

In [150]:
pipe.score(X_new,y_new)

0.8