# Machine Learning with Text

# Problem: Use the title and description of a talk to predict whether it might be selected.
## Starting with the imports...

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score

In [None]:
# Read only the three columns this notebook uses from the tab-separated
# dataset, then replace every missing value with an empty string.
df = pd.read_table(
    'data/preprocessed.tsv',
    usecols=['title', 'description', 'selected'],
).fillna(value="")

In [None]:
# Target vector: the 'selected' column cast to integers, as a NumPy array.
y = np.asarray(df['selected'].astype(int))

# The Training & Prediction pipeline
![Diagram of the training and prediction pipeline](pipeline.png)

## Let's use the 'title' column as the corpus

In [None]:
# Use the talk titles as the text corpus.
corpus = df.loc[:, 'title']

## Text Vectorization & the Term-Document (TD) Matrix

In [None]:
# Learn the vocabulary from the corpus and build the TF-IDF matrix.
# English stop words are dropped and term frequency is scaled
# sublinearly (sublinear_tf=True).
vect = TfidfVectorizer(sublinear_tf=True, stop_words='english')
X = vect.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Preview the first five rows. Only the displayed slice is densified,
# not the whole sparse matrix.
pd.DataFrame(X[:5].toarray(), columns=feature_names)

## Dimensionality Reduction

In [None]:
# Project the TF-IDF features onto 250 SVD components. The seed is fixed
# because TruncatedSVD's default solver is randomized; without it the
# reduced matrix (and anything trained on it) changes from run to run.
svd = TruncatedSVD(n_components=250, random_state=0)
# X is rebound from the sparse TF-IDF matrix to the reduced matrix.
X = svd.fit_transform(X)
pd.DataFrame(X).head()

## Training the Classifier

In [None]:
# Fit a Gaussian Naive Bayes model on the reduced features and labels.
gnb = GaussianNB()
gnb.fit(X=X, y=y)

## Testing the Classifier (on the training data)

In [None]:
# Predict on the same rows the classifier was fitted on, so the figure
# printed is training accuracy, not an estimate on unseen data.
predictions = gnb.predict(X)
# accuracy_score normalizes by the actual number of samples, so the
# result stays correct if the dataset size changes.
print(accuracy_score(y, predictions))

# Exercise 1: Use the 'description' column of the dataset as a corpus for the predictions

In [None]:
# Retrieve the corpus from the dataset

In [None]:
# Obtain the TD Matrix

In [None]:
# Reduce the dimensionality of the TD matrix to 250 components

In [None]:
# Train the classifier

In [None]:
# Test the classifier

# Exercise 2: Use a combination of 'title' and 'description' corpora for the training & predictions