In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import re


In [2]:
# Load US_Crime_Data.csv (path is relative to the notebook's working directory)
# and preview 10 randomly chosen rows; no random_state is set, so the preview
# differs from run to run.
data = pd.read_csv('./US_Crime_Data.csv')
data.sample(10)

Unnamed: 0,Date,Title,Organization,City,State,URL,Keyword,Summary
1965,5/29/17 12:09,White Supremacists Are Much Bigger Threat To T...,Carbonated.tv,San Ramon,CA,http://www.carbonated.tv/news/white-supremacis...,attack christian collins john killed meche mus...,
481,3/12/17 7:51,Greenwich Selectmen Comment on Uptick in Relig...,Greenwich Free Press,Greenwich,CT,http://greenwichfreepress.com/greenwich-select...,,
2336,6/7/17 18:52,Chicago Tribune: Pepe's apologizes for harassm...,cairchicago.org,Arlington Heights,IL,http://www.cairchicago.org/blog/2017/06/chicag...,,
1152,4/20/17 14:22,San Rafael: Latino-hating arsonist gets 11-yea...,Marin Independent Journal,Novato,CA,http://www.marinij.com/article/NO/20170420/NEW...,arson canal court crime fire hara hate indepen...,Richard O'Hara attends his sentencing in Marin...
6223,11/14/17 14:39,Hate graffiti' painted on Cinnaminson-area sid...,6abc.com,Philadelphia,PA,http://6abc.com/hate-graffiti-painted-on-nj-si...,action calabrese charles cinnaminson graffiti ...,Hate graffiti' painted on Cinnaminson-area sid...
3224,7/7/17 13:03,Man Charged With Hate Crime After Threats Sent...,Vermont Public Radio,Colchester,VT,http://digital.vpr.net/post/man-charged-hate-c...,burlington chairman democratic emails faisal g...,By Peter Hirschfeld • 1 minute ago\tFaisal Gil...
4195,8/23/17 12:58,Frisco men plead guilty to hate crimes in McKi...,Star Local Media,Plano,Texas,http://starlocalmedia.com/friscoenterprise/new...,ajiduah alcohol attorney aubrey bureau crimes ...,"Nigel Garrett, 21, and Cameron Ajiduah, 18, pl..."
6490,12/4/17 15:48,State senator to introduce hate crime bill,WISH-TV,Indianapolis,IN,http://wishtv.com/2017/12/04/state-senator-to-...,alliance amin bill crime crimes democratic har...,"By David Williams Published: December 4, 2017,..."
4894,9/25/17 18:30,CMPD investigating hate crime after Jewish fam...,TWC News,Syracuse,NY,http://www.twcnews.com/nc/charlotte/news/2017/...,charlotte citywide cmpd creek crime drive gale...,By Spectrum News Staff\tCHARLOTTE -- Ronald Ga...
3716,8/15/17 3:30,Charlottesville victim Heather Heyer was a cha...,AppsforPCdaily,,,http://appsforpcdaily.com/2017/08/charlottesvi...,bandcamp bernie clinton daughter democratic di...,She was always passionate about the beliefs sh...


In [3]:
# Count the missing values in each column.
data.isna().sum()

Date               0
Title              1
Organization       0
City            1167
State           1245
URL                0
Keyword         1176
Summary         2256
dtype: int64

## For this model we only need the headlines (the `Title` column)

In [4]:
# Keep only the headline column, discard rows without a title, and renumber
# the rows from 0 so positional lookups line up with the index.
df = (
    data[['Title']]
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Title
0,Pizza Hut driver who killed co-worker with sho...
1,Residents of NJ township receiving KKK promoti...
2,House OKs bill to expand Kentucky's hate crime...
3,"Amid Protests, 'Blue Lives Matter' Bill Passes..."
4,Lafourche inmates charged with hate crimes in ...


In [5]:
# Spot-check a single headline (row label 123 of the Title column).
df.loc[123, 'Title']

'St. Louis suburb victimized by cemetery vandalism mulling hate crimes registry'

In [6]:
# (rows, columns) of the headline-only frame after dropping missing titles.
df.shape

(6782, 1)

In [8]:
def clean_text(df):
    """Return the headlines lower-cased with ASCII punctuation removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Title`` column. The frame is not modified.

    Returns
    -------
    pandas.Series
        One cleaned title per row, sharing ``df``'s index.
    """
    # re.escape keeps every punctuation character literal inside the character
    # class (unescaped, the backslash would only escape the following `]`).
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently strip nothing.
    return df['Title'].str.lower().str.replace(punctuation_pattern, '', regex=True)

In [9]:
# Cleaned headlines, one string per row (whole titles, not yet split into words).
tokens = clean_text(df)

  tokens = df['Title'].str.replace('[{}]'.format(string.punctuation), '')


In [10]:
# Number of distinct cleaned headlines (duplicates collapse in the set).
len(set(tokens))

6563

In [11]:
# Build the word -> integer index vocabulary from the cleaned headlines, then
# encode every headline as a list of those word indices.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tokens)
seq = tokenizer.texts_to_sequences(tokens)

In [12]:
# Preview the first 10 encoded headlines.
seq[:10]

[[3973, 3974, 450, 47, 88, 3975, 12, 2211, 103, 1370, 2803],
 [830, 4, 423, 1558, 2212, 470, 3976, 672],
 [257, 1846, 75, 5, 593, 3977, 1, 6, 36],
 [721, 898, 480, 274, 301, 75, 481, 634, 257],
 [3978, 2804, 16, 12, 1, 6, 3, 424, 19],
 [13, 722, 1, 2, 48, 594, 3979, 5, 401],
 [3980, 480, 274, 301, 75, 21, 1371, 2805, 5, 214, 3981, 173],
 [194, 320, 18, 402, 2806, 239, 15, 98, 38],
 [194,
  320,
  18,
  402,
  2806,
  239,
  15,
  98,
  38,
  635,
  174,
  556,
  557,
  14,
  498,
  558,
  275,
  42],
 [33, 174, 673, 302, 2, 1372, 2807, 3, 2213]]

In [13]:
# Turn every encoded headline into (prefix, next word) training pairs:
# for [a, b, c] this yields ([a], b) and ([a, b], c).
x, y = [], []
total_words_drop = 0
for encoded in seq:
    n_tokens = len(encoded)
    if n_tokens <= 1:
        # A headline with fewer than two tokens has no next word to predict.
        total_words_drop += 1
        continue
    for cut in range(1, n_tokens):
        x.append(encoded[:cut])
        y.append(encoded[cut])
print('Total Words Dropped : {}'.format(total_words_drop))

Total Words Dropped : 12


In [14]:
# First 10 target word indices (the next words of the first headline's prefixes).
y[: 10]

[3974, 450, 47, 88, 3975, 12, 2211, 103, 1370, 2803]

# Padding sequences

In [15]:
# Pad the variable-length prefixes into one 2-D integer array. With the default
# arguments Keras pads with zeros at the front, up to the longest prefix length.
x = tf.keras.preprocessing.sequence.pad_sequences(x)

In [16]:
# (number of training samples, padded prefix length)
x.shape

(64701, 49)

# One-hot encoding y (one column per word index)

In [17]:
# One-hot encode the target word indices. num_classes is pinned to the
# vocabulary size (tokenizer indices start at 1, index 0 is left for padding)
# so the width always matches the model's softmax layer; by default Keras
# infers it from max(y) + 1, which is smaller whenever the highest-indexed
# word never occurs as a target.
y = tf.keras.utils.to_categorical(y, num_classes = len(tokenizer.word_index) + 1)

In [18]:
# (number of training samples, number of one-hot classes)
y.shape

(64701, 7569)

## Vocab Size : total no. of unique words + 1 (index 0 is reserved for padding)

In [19]:
# +1 because the tokenizer's word indices start at 1; index 0 is left for padding,
# so embedding/output layers need len(word_index) + 1 slots.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7569

In [20]:
# Next-word model: embedding -> two stacked LSTMs -> dense hidden layer ->
# softmax over the vocabulary.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 49))
# return_sequences=True hands the full output sequence to the second LSTM.
model.add(tf.keras.layers.LSTM(100, return_sequences=True))
model.add(tf.keras.layers.LSTM(100))
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 49)          370881    
                                                                 
 lstm (LSTM)                 (None, None, 100)         60000     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 7569)              764469    
                                                                 
Total params: 1,285,850
Trainable params: 1,285,850
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Categorical cross-entropy matches the one-hot encoded targets in y.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
# Train with early stopping on validation loss. validation_split holds out the
# last 10% of the samples so that `val_loss` actually exists; without any
# validation data EarlyStopping has nothing to monitor, never triggers and
# never restores the best weights.
history = model.fit(x, y,
                    epochs = 100,
                    batch_size = 256,
                    validation_split = 0.1,
                    callbacks = [tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                                  patience = 5,
                                                                  restore_best_weights = True)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

KeyboardInterrupt: 

## Saving model

In [None]:
# Persist the model to the working directory; the .h5 suffix selects Keras'
# HDF5 format. The fitted tokenizer is not saved here.
model.save('crime_title_next_word_model.h5')