# Text Analytics | BAIS:6100
# Module 8: Text Classification - Exercises

Instructor: Kang-Pyo Lee 

## Loading the Same Dataset into a Pandas Dataframe

In [None]:
import pandas as pd
# Show up to 150 characters per column so long text values stay readable.
pd.options.display.max_colwidth = 150

# Read the email dataset into a dataframe.
df = pd.read_csv("classdata/emails.csv")

In [None]:
# Display the loaded dataframe as the cell output.
df

## Cleaning the Data

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.loc[~df.duplicated(keep="first")]

## Preparing Data for Modeling

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with L2-normalized rows; English stop words are removed and
# terms whose document frequency exceeds 0.7 are ignored.
vectorizer = TfidfVectorizer(
    use_idf=True,
    norm="l2",
    stop_words="english",
    max_df=0.7,
)
X = vectorizer.fit_transform(df["text"])  # features built from the text column
y = df["spam"]  # target labels

Let's try a different `test_size` value, say 0.25, and a different `random_state` value, say 1, to make different train and test sets. 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## Modeling with k-Nearest Neighbors (k-NNs)

### Step 1. Choose a classification algorithm to try

### Step 2. Initialize a model object with initial parameters

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

# Start with k = 1, i.e. a single neighbor is considered per prediction.
knn = KNeighborsClassifier(n_neighbors=1)

### Step 3. Fit the model using the training data

In [None]:
# Fit the k-NN model on the training split only.
knn.fit(X_train, y_train)

### Step 4. Check the performance of the model

In [None]:
# Score the fitted model on the training data and on the test data, shown as a pair.
knn.score(X_train, y_train), knn.score(X_test, y_test)

In [None]:
# Predicted labels for the test split, reused by the report cells that follow.
pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Print the classification report comparing true test labels with predictions.
print(classification_report(y_test, pred))

In [None]:
# Print the confusion matrix of true test labels versus predicted labels.
print(confusion_matrix(y_test, pred))

### Step 5. Perform cross-validation and choose the best parameters if there are parameters to optimize

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation on the training split with k = 1.
knn = KNeighborsClassifier(n_neighbors=1)
scores_k1 = cross_val_score(knn, X_train, y_train, cv=5)
# Display the per-fold scores.
scores_k1

In [None]:
# Mean and standard deviation of the k = 1 fold scores.
scores_k1.mean(), scores_k1.std()

In [None]:
# 5-fold cross-validation on the training split with k = 3.
knn = KNeighborsClassifier(n_neighbors=3)
scores_k3 = cross_val_score(knn, X_train, y_train, cv=5)
# Display the per-fold scores.
scores_k3

In [None]:
# Mean and standard deviation of the k = 3 fold scores.
scores_k3.mean(), scores_k3.std()

In [None]:
# Search over k using 5-fold cross-validation on the training split.
# Start from -inf / None so that `param_best` is always bound afterwards and
# is None (rather than undefined or left over from an earlier run) when no
# candidate yields a comparable, non-NaN mean score.
score_max = float("-inf")
param_best = None
for param in [1, 3, 10, 30]:
    model = KNeighborsClassifier(n_neighbors=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    score_mean = scores.mean()  # computed once, reused for printing and comparison
    print("k = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, score_mean, scores.std()))

    # Strict '>' keeps the earliest candidate when mean scores tie.
    if score_mean > score_max:
        score_max = score_mean
        param_best = param

print("Highest score : {:.3f} when k = {}".format(score_max, param_best))

### Step 6. Build the final model with the best parameter(s)

In [None]:
def train_test(X_train, X_test, y_train, y_test, classifier):
    """Fit a classifier, print train/test diagnostics, and return it.

    Args:
        X_train: Features used to fit the classifier.
        X_test: Features used for evaluation.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``.
        classifier: Estimator exposing ``fit``, ``predict`` and ``score``.

    Returns:
        The same ``classifier`` object after fitting on the training data.
    """
    # Local import keeps this cell self-contained.
    from sklearn.metrics import accuracy_score

    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)

    print("Train score: {:.2f}".format(classifier.score(X_train, y_train)))
    # Reuse `pred` instead of calling classifier.score(X_test, y_test), which
    # would run a second prediction pass over the test set. For scikit-learn
    # classifiers, score() is the accuracy of predict(), so the value is the same.
    print("Test score: {:.2f}\n".format(accuracy_score(y_test, pred)))
    # zero_division=0 reports 0 for undefined precision/recall instead of warning.
    print("Classification report:\n{}".format(classification_report(y_test, pred, zero_division=0)))
    print(confusion_matrix(y_test, pred))

    return classifier

In [None]:
# Build the final k-NN model with the selected k and evaluate it.
print("k =", param_best)
knn = train_test(
    X_train,
    X_test,
    y_train,
    y_test,
    KNeighborsClassifier(n_neighbors=param_best),
)

In [None]:
# Start the model-comparison table with the k-NN test score (3 decimals).
summary = {"k-NNs": round(knn.score(X_test, y_test), 3)}

### Step 7. Make predictions on new unseen data

Fill the five string variables below with your own email texts. 

In [None]:
# Placeholder: replace the empty string with the text of your first email.
text1 = ""

In [None]:
# Placeholder: replace the empty string with the text of your second email.
text2 = ""

In [None]:
# Placeholder: replace the empty string with the text of your third email.
text3 = ""

In [None]:
# Placeholder: replace the empty string with the text of your fourth email.
text4 = ""

In [None]:
# Placeholder: replace the empty string with the text of your fifth email.
text5 = ""

In [None]:
# Collect the five texts and encode them with the already-fitted vectorizer
# (transform, not fit_transform, so the vocabulary stays that of the dataset).
new_texts = [text1, text2, text3, text4, text5]
X_new = vectorizer.transform(new_texts)

In [None]:
# Predicted labels from the k-NN model for the new texts.
knn.predict(X_new)

## Modeling with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default hyperparameters.
lr = LogisticRegression()

In [None]:
# 5-fold cross-validation on the training split: per-fold scores first,
# then their mean and standard deviation on the next line.
scores = cross_val_score(lr, X_train, y_train, cv=5)
print(scores)
print("{:.3f}, {:.3f}".format(scores.mean(), scores.std()))

In [None]:
# Fit on the training split and print train/test diagnostics.
lr = train_test(X_train, X_test, y_train, y_test, lr)

In [None]:
# Record the test score, rounded to 3 decimals, for the final comparison.
summary["Logistic Regression"] = round(lr.score(X_test, y_test), 3)

In [None]:
# Predicted labels from the logistic regression model for the new texts.
lr.predict(X_new)

## Modeling with Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with the library's default hyperparameters.
mnb = MultinomialNB()

In [None]:
# 5-fold cross-validation on the training split: per-fold scores first,
# then their mean and standard deviation on the next line.
scores = cross_val_score(mnb, X_train, y_train, cv=5)
print(scores)
print("{:.3f}, {:.3f}".format(scores.mean(), scores.std()))

In [None]:
# Fit on the training split and print train/test diagnostics.
mnb = train_test(X_train, X_test, y_train, y_test, mnb)

In [None]:
# Record the test score, rounded to 3 decimals, for the final comparison.
summary["Multinomial Naive Bayes"] = round(mnb.score(X_test, y_test), 3)

In [None]:
# Predicted labels from the naive Bayes model for the new texts.
mnb.predict(X_new)

## Modeling with Linear Support Vector Machines (SVMs)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with C = 1 as the starting value; C is tuned in a later cell.
svm = LinearSVC(C=1)

In [None]:
# Fit on the training split and print train/test diagnostics.
svm = train_test(X_train, X_test, y_train, y_test, svm)

In [None]:
# Search over C for LinearSVC using 5-fold cross-validation on the training split.
# Start from -inf / None so that a `param_best` left over from an earlier
# search can never be reported (and reused as C) when no candidate here
# yields a comparable, non-NaN mean score.
score_max = float("-inf")
param_best = None
for param in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    model = LinearSVC(C=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    score_mean = scores.mean()  # computed once, reused for printing and comparison
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, score_mean, scores.std()))

    # Strict '>' keeps the earliest (smallest) C when mean scores tie.
    if score_mean > score_max:
        score_max = score_mean
        param_best = param

print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

In [None]:
# Build the final linear SVM with the selected C and evaluate it.
print("C =", param_best)
svm = train_test(
    X_train,
    X_test,
    y_train,
    y_test,
    LinearSVC(C=param_best),
)

In [None]:
# Record the test score, rounded to 3 decimals, for the final comparison.
summary["Linear SVMs"] = round(svm.score(X_test, y_test), 3)

In [None]:
# Predicted labels from the linear SVM for the new texts.
svm.predict(X_new)

## Modeling with Kernelized Support Vector Machines (KSVMs)

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM with C = 1 as the starting value; C is tuned in a later cell.
ksvm = SVC(C=1, kernel="rbf", gamma="scale")

In [None]:
# Fit on the training split and print train/test diagnostics.
ksvm = train_test(X_train, X_test, y_train, y_test, ksvm)

In [None]:
# Search over C for the RBF-kernel SVC using 5-fold cross-validation on the
# training split.
# Start from -inf / None so that a `param_best` left over from an earlier
# search can never be reported (and reused as C) when no candidate here
# yields a comparable, non-NaN mean score.
score_max = float("-inf")
param_best = None
for param in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    model = SVC(C=param, kernel="rbf", gamma="scale")
    scores = cross_val_score(model, X_train, y_train, cv=5)
    score_mean = scores.mean()  # computed once, reused for printing and comparison
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, score_mean, scores.std()))

    # Strict '>' keeps the earliest (smallest) C when mean scores tie.
    if score_mean > score_max:
        score_max = score_mean
        param_best = param

print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

In [None]:
# Build the final kernelized SVM with the selected C and evaluate it.
# kernel and gamma are spelled out so the final model is configured exactly
# like the models scored during the C search, instead of depending on the
# installed library's defaults.
print("C = {}".format(param_best))
ksvm = SVC(C=param_best, kernel="rbf", gamma="scale")
ksvm = train_test(X_train, X_test, y_train, y_test, ksvm)

In [None]:
# Record the test score, rounded to 3 decimals, for the final comparison.
summary["Kernelized SVMs"] = round(ksvm.score(X_test, y_test), 3)

In [None]:
# Predicted labels from the kernelized SVM for the new texts.
ksvm.predict(X_new)

## Modeling with Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 10 units with ReLU activation; random_state=0 fixes the
# seed so repeated runs are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(10, ), activation="relu", random_state=0)

In [None]:
# Fit on the training split and print train/test diagnostics.
mlp = train_test(X_train, X_test, y_train, y_test, mlp)

In [None]:
# This cell takes a long time to run, so it is commented out by default.

# score_max = 0
# for param in [10, 30, 100]:
#     model = MLPClassifier(hidden_layer_sizes=(param, ), activation="relu", random_state=0)
#     scores = cross_val_score(model, X_train, y_train, cv=5)
#     print("hidden_layer_size = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
#     if scores.mean() > score_max:
#         score_max = scores.mean()
#         param_best = param
        
# print("Highest score : {:.3f} when hidden_layer_sizes = {}".format(score_max, param_best))

In [None]:
# print("hidden_layer_size = {}".format(param_best))
# mlp = MLPClassifier(hidden_layer_sizes=(param_best, ), random_state=0)
# mlp = train_test(X_train, X_test, y_train, y_test, mlp)

In [None]:
# Record the test score, rounded to 3 decimals, for the final comparison.
summary["Neural Networks"] = round(mlp.score(X_test, y_test), 3)

In [None]:
# Predicted labels from the neural network for the new texts.
mlp.predict(X_new)

## Choose the algorithm that performs best

In [None]:
# Display the test score recorded for each model, keyed by model name.
summary