# Deep Neural Networks & TF-IDF

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Training split of the 20 Newsgroups corpus; exposes .data, .target and .target_names (all used below).
newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
# Held-out test split of the same corpus, used only for evaluation at the end of the notebook.
newsgroups_test = fetch_20newsgroups(subset='test')

In [4]:
# Raw training posts; the sample printed below shows the mail headers are still part of the text.
x_train = newsgroups_train.data

In [5]:
# Raw test posts, preprocessed later with the same preprocessing() function as the training posts.
x_test = newsgroups_test.data

In [6]:
# Training labels; each value is used below as an index into newsgroups_train.target_names.
y_train = newsgroups_train.target

In [7]:
# Test labels, compared against the model's predicted classes in the evaluation cells.
y_test = newsgroups_test.target

In [8]:
# Show the label vocabulary, then one raw post together with its label id and label name.
category_names = newsgroups_train.target_names
first_label = y_train[0]
for item in (
    "List of all 20 categories:",
    category_names,
    "\n",
    "Sample Email:",
    x_train[0],
    "Sample Target Category:",
    first_label,
    category_names[first_label],
):
    print(item)

List of all 20 categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Sample Email:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is

----------------------------------------------------

In [9]:
import nltk

In [10]:
from nltk.corpus import stopwords

In [11]:
from nltk.stem import WordNetLemmatizer

In [12]:
import string

In [13]:
import pandas as pd

In [14]:
from nltk import pos_tag

In [15]:
from nltk.stem import PorterStemmer

In [16]:
def preprocessing(text):
    """Normalise one raw document into a space-separated string of lemmatised tokens.

    Steps, in order: punctuation is replaced by spaces, the text is tokenised
    and lowercased, English stop words and tokens shorter than three characters
    are dropped, the survivors are Porter-stemmed, POS-tagged and finally
    WordNet-lemmatised (verb tags as 'v', every other tag as 'n').

    Args:
        text: Raw document string.

    Returns:
        The processed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Replace each punctuation character with a space, then collapse whitespace runs.
    no_punct = "".join(" " if ch in string.punctuation else ch for ch in text)
    text2 = " ".join(no_punct.split())

    tokens = [word.lower()
              for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]

    # A set gives constant-time membership tests instead of scanning a list per token.
    stopwds = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwds and len(token) >= 3]

    stemmer = PorterStemmer()

    def safe_stem(word):
        # Best effort per token: if the stemmer fails on one word, keep that
        # word unchanged rather than leaving the whole document unstemmed.
        try:
            return stemmer.stem(word)
        except Exception:
            return word

    tokens = [safe_stem(word) for word in tokens]

    if not tokens:
        return ""

    Verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Only verb tags are lemmatised as verbs; nouns and all other tags use 'n'.
        return lemmatizer.lemmatize(token, 'v' if tag in Verb_tags else 'n')

    return " ".join(prat_lemmatize(token, tag) for token, tag in pos_tag(tokens))

In [17]:
# Holds the cleaned training documents, one preprocessed string per post.
x_train_preprocessed  = []

In [18]:
# Rebuild the list in one expression so that re-running this cell cannot
# append a second copy of every document.
x_train_preprocessed = [preprocessing(doc) for doc in x_train]

In [19]:
# Holds the cleaned test documents, one preprocessed string per post.
x_test_preprocessed = []

In [20]:
# Rebuild the list in one expression so that re-running this cell cannot
# append a second copy of every document.
x_test_preprocessed = [preprocessing(doc) for doc in x_test]

Building TFIDF vectorizer

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# TF-IDF over unigrams and bigrams, capped at 10000 features; this cap must
# match the input width of the first Dense layer defined further down.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
    strip_accents='unicode',
    norm='l2',
)

In [24]:
# Densifying the TF-IDF matrix at the vectorizer's default float64 ran out of
# memory (see the recorded MemoryError). Casting the sparse matrix to float32
# first halves the dense footprint, and toarray() yields a plain ndarray
# instead of the np.matrix that todense() returns.
x_train_2 = vectorizer.fit_transform(x_train_preprocessed).astype('float32').toarray()

MemoryError: 

In [None]:
# Reuse the vocabulary fitted on the training set (transform, not fit_transform).
# Cast to float32 before densifying to halve memory use, and use toarray() so
# the result is a plain ndarray rather than an np.matrix.
x_test_2 = vectorizer.transform(x_test_preprocessed).astype('float32').toarray()

Deep Learning modules

In [None]:
import numpy as np

In [None]:
from keras.models import Sequential

In [None]:
from keras.layers.core import Dense, Dropout, Activation

In [None]:
from keras.optimizers import Adadelta,Adam,RMSprop

In [None]:
from keras.utils import np_utils

Defining hyperparameters

In [None]:
# Training configuration used by the model-definition and fit cells.
nb_classes = 20    # width of the softmax output layer and of the one-hot labels
nb_epochs = 20     # passes over the training set
batch_size = 64    # samples per gradient update and per prediction batch

# Seed NumPy's global random number generator.
np.random.seed(1337)

In [None]:
# Convert the training labels into the categorical form expected by the
# 'categorical_crossentropy' loss, with nb_classes columns.
Y_train = np_utils.to_categorical(y_train, nb_classes)

In [None]:
# Start an empty layer stack; the following cells append layers to it in order.
model = Sequential()

In [None]:
# First hidden block: 10000 input features -> 1000 units, ReLU, 50% dropout.
for layer in (Dense(1000, input_shape=(10000,)), Activation('relu'), Dropout(0.5)):
    model.add(layer)

In [None]:
# Second hidden block: 500 units, ReLU, 50% dropout.
for layer in (Dense(500), Activation('relu'), Dropout(0.5)):
    model.add(layer)

In [None]:
# Third hidden block: 50 units, ReLU, 50% dropout.
for layer in (Dense(50), Activation('relu'), Dropout(0.5)):
    model.add(layer)

In [None]:
# Output block: one unit per class, followed by a softmax activation.
for layer in (Dense(nb_classes), Activation('softmax')):
    model.add(layer)

In [None]:
# Categorical cross-entropy pairs with the one-hot labels in Y_train.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model Training

In [None]:
# Train on the dense TF-IDF features against the one-hot labels.
model.fit(
    x_train_2,
    Y_train,
    batch_size=batch_size,
    epochs=nb_epochs,
    verbose=1,
)

Model Prediction

In [None]:
# Sequential.predict_classes is not available in newer Keras releases. Taking
# the argmax over the softmax probabilities gives the same class ids.
y_train_predclass = model.predict(x_train_2, batch_size=batch_size).argmax(axis=-1)

In [None]:
# Sequential.predict_classes is not available in newer Keras releases. Taking
# the argmax over the softmax probabilities gives the same class ids.
y_test_predclass = model.predict(x_test_2, batch_size=batch_size).argmax(axis=-1)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

In [None]:
# The old form `print (...),(value)` only prints the label under Python 3 and
# builds a throwaway tuple. Format the value into the printed line instead.
train_accuracy = round(accuracy_score(y_train, y_train_predclass), 3)
print("\n\nDeep Neural Network  - Train accuracy: %s" % train_accuracy)

In [None]:
# The old form `print (...),(value)` only prints the label under Python 3 and
# builds a throwaway tuple. Format the value into the printed line instead.
test_accuracy = round(accuracy_score(y_test, y_test_predclass), 3)
print("\nDeep Neural Network  - Test accuracy: %s" % test_accuracy)

In [None]:
# Heading for the per-class metrics on the training set.
train_report_heading = "\nDeep Neural Network  - Train Classification Report"
print(train_report_heading)

In [None]:
# Per-class precision/recall/F1 on the training set.
train_report = classification_report(y_train, y_train_predclass)
print(train_report)

In [None]:
# Heading for the per-class metrics on the test set.
test_report_heading = "\nDeep Neural Network  - Test Classification Report"
print(test_report_heading)

In [None]:
# Per-class precision/recall/F1 on the held-out test set.
test_report = classification_report(y_test, y_test_predclass)
print(test_report)