# Classification of text messages

## Test combining feature vectors with hstack to make continuous predictions

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import normalize, minmax_scale, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

### Load labeled SMS dataset

In [3]:
# Read the headerless, tab-separated SMS file, then name its two columns.
df_in = pd.read_csv(
    'data/sms.tsv.zip',
    sep='\t',
    header=None,
)
df_in.columns = ['label', 'text']

### Make a small test dataset and add continuous and multi-category columns

In [4]:
# Take the first five messages as an independent copy: the following cells add
# columns to dfa, and assigning into a bare slice of df_in triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
dfa = df_in[:5].copy()

In [5]:
# Attach a sequential identifier (1 through 5) to each sample row.
dfa['id'] = list(range(1, 6))

In [6]:
# Hand-made continuous column; used later as the regression target.
dfa['rating'] = [
    2.1,
    3.4,
    6.6,
    1.1,
    3.4,
]

In [7]:
# Hand-made continuous column; normalized further down and used as a feature.
dfa['value'] = [
    5.34,
    6.23,
    4.6,
    2.3,
    5.4,
]

In [8]:
# One tuple of category labels per row; rows may carry one or several labels.
dfa['categories'] = [
    ('a', 'z', 'l'),
    ('b', 'g'),
    ('c',),
    ('d', 'b'),
    ('e', 'c'),
]

In [9]:
# Preview the sample frame with the added id, rating, value and categories columns.
dfa.head()

Unnamed: 0,label,text,id,rating,value,categories
0,ham,"Go until jurong point, crazy.. Available only ...",1,2.1,5.34,"(a, z, l)"
1,ham,Ok lar... Joking wif u oni...,2,3.4,6.23,"(b, g)"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,3,6.6,4.6,"(c,)"
3,ham,U dun say so early hor... U c already then say...,4,1.1,2.3,"(d, b)"
4,ham,"Nah I don't think he goes to usf, he lives aro...",5,3.4,5.4,"(e, c)"


In [10]:
# Dimensions of the sample frame as (rows, columns).
dfa.shape

(5, 6)

### Vectorize text

In [11]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
vect = TfidfVectorizer(stop_words="english")

In [12]:
# Fit the vectorizer on the message texts and transform them in one step.
X_train_text = vect.fit_transform(raw_documents=dfa['text'])

In [13]:
# Dimensions of the vectorized text matrix.
X_train_text.shape

(5, 48)

In [14]:
# Wrap the dense TF-IDF matrix in a labeled frame: one column per vocabulary term,
# rows aligned with dfa.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back only on old versions.
if hasattr(vect, 'get_feature_names_out'):
    _text_feature_names = vect.get_feature_names_out()
else:
    _text_feature_names = vect.get_feature_names()

dfb = pd.DataFrame(
    X_train_text.toarray(),
    columns=_text_feature_names,
    index=dfa.index,
)

In [15]:
# Dimensions of the labeled text-feature frame.
dfb.shape

(5, 48)

In [16]:
# Preview the text features, one column per vocabulary term.
dfb.head()

Unnamed: 0,08452810075over18,2005,21st,87121,amore,apply,available,buffet,bugis,cine,...,text,think,tkts,txt,usf,wat,wif,win,wkly,world
0,0.0,0.0,0.0,0.0,0.27735,0.0,0.27735,0.27735,0.27735,0.27735,...,0.0,0.0,0.0,0.0,0.0,0.27735,0.0,0.0,0.0,0.27735
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0
2,0.196116,0.196116,0.196116,0.196116,0.0,0.196116,0.0,0.0,0.0,0.0,...,0.196116,0.0,0.196116,0.196116,0.0,0.0,0.0,0.196116,0.196116,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.408248,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0


### Vectorize categories

In [17]:
# Binarizer that turns each row's collection of labels into indicator columns.
mlb = MultiLabelBinarizer()

In [18]:
# Fit on the category tuples and cast the resulting indicator matrix to floats.
X_train_cat = mlb.fit_transform(dfa['categories']).astype(np.float64)

In [19]:
# Dimensions of the category indicator matrix.
X_train_cat.shape

(5, 8)

In [20]:
# Labeled view of the category indicators: one column per class found by the binarizer.
dfc = pd.DataFrame(
    data=X_train_cat,
    index=dfa.index,
    columns=mlb.classes_,
)

In [21]:
# Dimensions of the labeled category frame.
dfc.shape

(5, 8)

In [22]:
# Preview the category indicator columns.
dfc.head()

Unnamed: 0,a,b,c,d,e,g,l,z
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


### Normalized Value

In [23]:
# Unit-norm scaler; 'l2' is scikit-learn's default norm, spelled out here for clarity.
norm = Normalizer(norm='l2')

In [24]:
# Normalizer rescales each *row* to unit norm, so the whole 'value' column is passed
# as a single row (every value is divided by the column's L2 norm) and the result is
# turned back into a column vector.
# reshape(-1, 1) infers the row count instead of hard-coding 5, so this cell keeps
# working when the sample size changes.
X_train_value = norm.fit_transform([dfa['value']]).reshape(-1, 1)

In [25]:
# Labeled single-column frame holding the normalized value, aligned with dfa's index.
dfd = pd.DataFrame(
    data=X_train_value,
    index=dfa.index,
    columns=['norm_value'],
)

In [26]:
# Dimensions of the normalized value array after reshaping into a column.
X_train_value.shape

(5, 1)

In [27]:
# Preview the normalized value column.
dfd.head()

Unnamed: 0,norm_value
0,0.481612
1,0.561881
2,0.414872
3,0.207436
4,0.487023


### Create the model input with hstack

In [28]:
# Stack all feature groups side by side into one dense matrix.
# Order is text -> categories -> value, the same order in which dfe concatenates
# dfb, dfc and dfd, so dfe.columns correctly labels the columns of this matrix.
X_train_hstack = np.hstack((X_train_text.toarray(), X_train_cat, X_train_value))

In [29]:
# Labeled counterpart of the stacked features: text, category and value frames joined column-wise.
dfe = pd.concat([dfb, dfc, dfd], axis='columns')

In [30]:
# Dimensions of the combined labeled frame.
dfe.shape

(5, 57)

In [31]:
# Dimensions of the stacked feature matrix; compare with dfe.shape in the previous cell.
X_train_hstack.shape

(5, 57)

In [32]:
# Preview the combined feature frame.
dfe.head()

Unnamed: 0,08452810075over18,2005,21st,87121,amore,apply,available,buffet,bugis,cine,...,world,a,b,c,d,e,g,l,z,norm_value
0,0.0,0.0,0.0,0.0,0.27735,0.0,0.27735,0.27735,0.27735,0.27735,...,0.27735,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.481612
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.561881
2,0.196116,0.196116,0.196116,0.196116,0.0,0.196116,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.414872
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.207436
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.487023


### Train a Stochastic Gradient Descent Model on Hstack

In [33]:
# SGD shuffles the training data between epochs; fixing random_state makes the
# fitted coefficients reproducible across Restart & Run All.
sgd_regr = SGDRegressor(random_state=42)

In [34]:
%time sgd_regr.fit(X_train_hstack, dfa['rating'])

CPU times: user 1.33 ms, sys: 1.7 ms, total: 3.03 ms
Wall time: 2.22 ms


SGDRegressor()

### Train a Random Forest Model on Hstack

In [35]:
# Random forests draw bootstrap samples and random feature subsets; fixing
# random_state makes the fitted forest reproducible across Restart & Run All.
rf_regr = RandomForestRegressor(random_state=42)

In [36]:
%time rf_regr.fit(X_train_hstack, dfa['rating'])

CPU times: user 117 ms, sys: 2.58 ms, total: 120 ms
Wall time: 118 ms


RandomForestRegressor()

In [37]:
# TODO check out all kinds of regressors
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble