# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,['come']
1,"['im', 'home', 'please', 'call']"
2,"['yup', 'ì', 'comin', '']"
3,"['k', 'sent']"
4,"['eh', 'ur', 'laptop', 'got', 'stock', 'lei', ..."


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_


{'come': 2005,
 'im': 3795,
 'home': 3637,
 'please': 5560,
 'call': 1680,
 'yup': 8198,
 'comin': 2012,
 'sent': 6347,
 'eh': 2674,
 'ur': 7571,
 'laptop': 4233,
 'got': 3340,
 'stock': 6832,
 'lei': 4295,
 'say': 6254,
 'mon': 4832,
 'muz': 4935,
 'take': 7045,
 'look': 4423,
 'pls': 5567,
 'give': 3276,
 'food': 3055,
 'preferably': 5689,
 'pap': 5373,
 'slowly': 6572,
 'loads': 4392,
 'sugar': 6935,
 'hour': 3683,
 'water': 7786,
 'wanna': 7760,
 'gym': 3432,
 'harri': 3492,
 'hi': 3580,
 'test': 7137,
 'ltgt': 4487,
 'rd': 5907,
 'wont': 7973,
 'anything': 1066,
 'de': 2298,
 'send': 6337,
 'naughty': 4979,
 'pix': 5532,
 'aight': 945,
 'fuck': 3157,
 'ill': 3792,
 'get': 3248,
 'later': 4248,
 'dude': 2607,
 'ive': 3961,
 'seeing': 6312,
 'lotta': 4449,
 'corvettes': 2122,
 'lately': 4246,
 'congrats': 2069,
 'year': 8125,
 'special': 6700,
 'cinema': 1919,
 'pass': 5408,
 '09061209465': 184,
 'suprman': 6976,
 'matrix3': 4634,
 'starwars3': 6799,
 'etc': 2770,
 'free': 3108,
 'b

In [4]:
# How are these vectors stored?
X_test_vect[0]

<1x8226 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [8]:
# Use the trained model to make predictions on the test data
y_pred=rf_model.predict(X_test_vect)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.796 / Accuracy: 0.972
