# 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
import pandas as pd, numpy as np, json, re, pickle, keras


def read_data(file):
    
    data = []

    with open(file) as f:
        for line in f:
            data.append(json.loads(line))
        
    # convert to data frame
    
    return pd.DataFrame(data)

# read category data

df = read_data('categorized-comments.jsonl')

# check size, structure and categories

print('Size: ', len(df), '\n',
      'Shape: ', df.info(), '\n',
      'Categories: ', df.cat.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606475 entries, 0 to 606474
Data columns (total 2 columns):
cat    606475 non-null object
txt    606475 non-null object
dtypes: object(2)
memory usage: 9.3+ MB
Size:  606475 
 Shape:  None 
 Categories:  ['sports' 'science_and_technology' 'video_games']


In [42]:
size = 10000    # sample size
replace = True  # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

category = df.groupby('cat', as_index=False).apply(fn)

# free up memory

del df

category['txt'] = category['txt'].apply(lambda x:clean_text(x))
category.reset_index(drop=True, inplace=True)

category.head()



Unnamed: 0,cat,txt
0,science_and_technology,k
1,science_and_technology,currently in different group chats one with ...
2,science_and_technology,i know this post is old but i just today start...
3,science_and_technology,the was waterproof resistant
4,science_and_technology,crashes here as well


In [17]:
# create the feature matrix

cv = CountVectorizer(stop_words=stop_words)

# create target and sample

X = cv.fit_transform(category['txt'])
Y = category['cat']

# create train test split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1)

mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), max_iter=75)
mlp.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=75,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [18]:
predictions = mlp.predict(X_test)
print('Confusion Matrix: ',confusion_matrix(y_test,predictions))  
print('Classification Report:',classification_report(y_test,predictions)) 
print('Accuracy: ',accuracy_score(y_test,predictions))

Confusion Matrix:  [[1897  225  424]
 [ 195 1693  610]
 [ 272  493 1691]]
Classification Report:                         precision    recall  f1-score   support

science_and_technology       0.80      0.75      0.77      2546
                sports       0.70      0.68      0.69      2498
           video_games       0.62      0.69      0.65      2456

              accuracy                           0.70      7500
             macro avg       0.71      0.70      0.71      7500
          weighted avg       0.71      0.70      0.71      7500

Accuracy:  0.7041333333333334


# 2. Neural Network Classifier with Keras

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [19]:
# check the unique categories

#category["cat"].unique()
category.groupby(["cat"]).size()

cat
science_and_technology    10000
sports                    10000
video_games               10000
dtype: int64

In [20]:
encoder = LabelEncoder()

cat = category["cat"]
category["cat"]=encoder.fit_transform(cat)
category.groupby("cat").count()

Unnamed: 0_level_0,txt
cat,Unnamed: 1_level_1
0,10000
1,10000
2,10000


In [23]:

# set the features and classes

N_FEATURES = 5000
N_CLASSES = 1
N_UNITS = 2500

# create the feature matrix

cv = CountVectorizer(analyzer='word',
                     stop_words=stop_words, 
                     max_features = N_FEATURES,
                     max_df = 0.5,
                     min_df = 3)

# create target and sample

X = cv.fit_transform(category['txt'])
y = category['cat']

# create train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

print(str(X_train.shape))
print(str(X_test.shape))
print(str(y_train.shape))
print(str(y_test.shape))

(22500, 5000)
(7500, 5000)
(22500,)
(7500,)


In [25]:
# initialize

classifier_seq = Sequential()

classifier_seq.add(Dense(units=500,activation="relu",input_shape=(N_FEATURES,)))
classifier_seq.add(Dense(units=50, activation="relu"))
classifier_seq.add(Dense(units=4, activation="softmax"))

# compile the Artificial Neural Network (ANN)

classifier_seq.compile(optimizer="rmsprop", 
                       loss="sparse_categorical_crossentropy", 
                       metrics=["accuracy"])

# fit ANN to the training set

classifier_seq.fit(X_train, y_train, batch_size=200, epochs=5, verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a285d3b70>

In [26]:
classifier_seq.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 500)               2500500   
_________________________________________________________________
dense_5 (Dense)              (None, 50)                25050     
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 204       
Total params: 2,525,754
Trainable params: 2,525,754
Non-trainable params: 0
_________________________________________________________________


In [27]:
# loss and accuracy

loss, accuracy = classifier_seq.evaluate(X_test, y_test, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))

Training Accuracy: 0.7385


In [28]:
# create prediction

y_pred = classifier_seq.predict_classes(X_test)

# calculate model matrix

print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
print("Classification Report: ", classification_report(y_test,y_pred))
print("Accuracy: ", accuracy_score(y_test,y_pred))

Confusion Matrix:  [[2020  222  304]
 [ 191 1844  463]
 [ 254  527 1675]]
Classification Report:                precision    recall  f1-score   support

           0       0.82      0.79      0.81      2546
           1       0.71      0.74      0.72      2498
           2       0.69      0.68      0.68      2456

    accuracy                           0.74      7500
   macro avg       0.74      0.74      0.74      7500
weighted avg       0.74      0.74      0.74      7500

Accuracy:  0.7385333333333334


# 3. Classifying Images

In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.

In [30]:
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

# Set that the color channel value will be first
K.set_image_data_format("channels_first")

# Set seed
np.random.seed(0)

# Set that the color channel value will be first
K.set_image_data_format("channels_first")

# Set seed
np.random.seed(0)

# Set image information
channels = 1
height = 28
width = 28

# Load data and target from MNIST data
(data_train, target_train), (data_test, target_test) = mnist.load_data()

# Reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], channels, height, width)

# Reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], channels, height, width)

# Rescale pixel intensity to between 0 and 1
features_train = data_train / 255
features_test = data_test / 255

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


In [31]:
# One-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]

# Start neural network
network = Sequential()

# Add convolutional layer with 64 filters, a 5x5 window, and ReLU activation function
network.add(Conv2D(filters=64,
                   kernel_size=(5, 5),
                   input_shape=(channels, width, height),
                   activation='relu'))

# Add max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

# Add dropout layer
network.add(Dropout(0.5))

# Add layer to flatten input
network.add(Flatten())

# # Add fully connected layer of 128 units with a ReLU activation function
network.add(Dense(128, activation="relu"))

# Add dropout layer
network.add(Dropout(0.5))

# Add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

# Train neural network
network.fit(features_train, # Features
            target_train, # Target
            epochs=2, # Number of epochs
            verbose=0, # Don't print description after each epoch
            batch_size=1000, # Number of observations per batch
            validation_data=(features_test, target_test)) # Data for evaluation

<keras.callbacks.callbacks.History at 0x1a288f3ef0>

In [32]:
# since the model took long time to train, I am saving it for future use
network.save('image')
del network

In [33]:
# loading the saved model
from keras.models import load_model
network = load_model('image')

# evaluate and print
eval_model=network.evaluate(features_train, target_train)
eval_model



[0.09607982429750264, 0.9710999727249146]