# Crime Prediction using Tweets and KDE

In [15]:
%matplotlib inline

import matplotlib.pylab as plt
import numpy as np
import pandas as pd

![ANLP project pipeline](./ANLP-Project-Pipeline.png)

## Data Sources

### Chicago Crimes Incidents

### Tweets

## Data Preprocessing

### Tokenizer

### Sub-grouping by Geo-Location

## Feature Extraction

### KDE

### Sentiment Analysis

### LDA

In [1]:
# Toy corpus used to exercise the LDA code below: a list of documents, where
# each document is a list of raw tweet strings.
td = [
    [
        "this is not a tweet",
        "tweeter soks :)",
        "I do not want to party :/",
    ],
    [
        "#creizi yeee",
        "shooting outside!",
    ],
    [
        "where am i?",
        "this is not a drill, please do NOT drink milk",
    ],
]

In [22]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def coalesce(token):
    """Cap every run of an identical character in ``token`` at two.

    Runs of length one or two are left alone; longer runs are shortened,
    e.g. "paaaarty" -> "paarty". Returns the resulting string.
    """
    kept = []
    for ch in token:
        # Drop the character only when the two most recently kept
        # characters are both the same as it (it would start a triple).
        if kept[-2:] == [ch, ch]:
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess(text):
    """Lowercase one tweet string and split it into cleaned tokens.

    The text is lowercased and split on whitespace; each piece is then
    handled as follows:

    * pieces starting with "@" or "#" (mentions / hashtags) are dropped
    * pieces starting with "http://" or "https://" are replaced by "url"
    * any other piece is passed through ``coalesce`` so that repeated
      characters are capped at two (e.g. paaaarty -> paarty)

    Returns the list of resulting tokens.
    """
    cleaned = []
    for word in text.lower().split():
        if word.startswith(("@", "#")):
            continue
        if word.startswith(("https://", "http://")):
            cleaned.append(u"url")
            continue
        cleaned.append(coalesce(word))
    return cleaned

def process_documents_lda(docs, verbose=True):
    """Turn each document (an iterable of tweets) into one string.

    Parameters
    ----------
    docs : iterable of iterables of str
        Each element is one document; its tweets are joined with a single
        space.
    verbose : bool, default True
        When true, print every joined document, one per line.

    Returns
    -------
    list of str
        One whitespace-joined string per document, in input order.
    """
    # Join in a single pass. Iterating ``docs`` once for printing and again
    # for the result would leave a one-shot iterable (generator, iterator)
    # exhausted by the time the result is built, yielding an empty list.
    joined = [" ".join(d) for d in docs]
    if verbose:
        for text in joined:
            print(text)
    return joined

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words for every topic of ``model``.

    For each row of ``model.components_`` one line is printed: the topic
    index followed by the ``n_top_words`` entries of ``feature_names`` whose
    weights in that row are largest, in descending weight order. A blank
    line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:n_top_words]
        header = "Topic #%d: " % topic_idx
        words = " ".join(feature_names[idx] for idx in top_indices)
        print(header + words)
    print()
    
def learn_lda(tweet_docs, preprocess=None, tokenizer=None,
              n_components=5, max_iter=5, n_top_words=2, random_state=42):
    """Fit an LDA topic model on documents made of tweets and print its topics.

    Parameters
    ----------
    tweet_docs : iterable of iterables of str
        Documents, each a collection of tweets; every document is joined
        into one string by ``process_documents_lda`` before vectorizing.
    preprocess : callable or None
        Passed to ``CountVectorizer`` as ``preprocessor``. Note that inside
        this function the name shadows the module-level ``preprocess``.
    tokenizer : callable or None
        Passed to ``CountVectorizer`` as ``tokenizer``.
    n_components : int, default 5
        Number of topics for ``LatentDirichletAllocation``.
    max_iter : int, default 5
        ``max_iter`` for ``LatentDirichletAllocation``.
    n_top_words : int, default 2
        How many words per topic ``print_top_words`` prints.
    random_state : int, default 42
        Seed passed to ``LatentDirichletAllocation``.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model.
    """
    docs = process_documents_lda(tweet_docs)

    vectorizer = CountVectorizer(stop_words="english", preprocessor=preprocess, tokenizer=tokenizer)
    X = vectorizer.fit_transform(docs)

    lda = LatentDirichletAllocation(n_components=n_components, max_iter=max_iter, learning_method='online', learning_offset=50., random_state=random_state)
    lda.fit(X)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    print_top_words(lda, feature_names, n_top_words)

    return lda

In [23]:
# preprocess() already returns a list of tokens, so the tokenizer only needs
# to hand that list through unchanged.
lda = learn_lda(
    td,
    preprocess=preprocess,
    tokenizer=lambda tokens: tokens,
)


this is not a tweet tweeter soks :) I do not want to party :/
#creizi yeee shooting outside!
where am i? this is not a drill, please do NOT drink milk
Topic #0: i? milk
Topic #1: outside! tweet
Topic #2: :/ party
Topic #3: drink yee
Topic #4: yee shooting



## Model Training & Evaluation

### Training

### Evaluation