# Introduction

What are the values, interests, personalities, opinions, and language of Chicago and the rest of Illinois? And do they differ?

In this notebook, using a dataset of 1.6 million tweets labeled with either a positive or negative sentiment, I train a neural network to predict the sentiment of a tweet with about 78% accuracy. I then run the neural network over 70,000 tweets from Illinois and examine how users from Chicago and the rest of Illinois feel about a given topic. Finally, I export the word vectors and predicted sentiment and load them into a Dash application.

# Libraries

In [5]:
# Basic packages
import pandas as pd 
import numpy as np
import random
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path
import string
import nltk
import keras
np.random.seed(0)


# Machine Learning:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Packages for data preparation
from process_dataframe import process_dataframe
%load_ext autoreload
%autoreload 2
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from nltk.corpus import stopwords
nltk.download('stopwords')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.preprocessing import text, sequence
from nltk import word_tokenize
from nltk.collocations import *
from nltk import FreqDist



# Packages for modeling
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers
from gensim.models import Word2Vec
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

# Turn off warnings
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /Users/joel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /Users/joel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Functions

In [224]:
def clean(text, split = True):
    """Normalize a raw tweet for downstream tokenization.

    Steps, in order: drop every non-ASCII character (which also removes
    emoji), lower-case the text, expand the ``realdonaldtrump`` handle into
    ``donald trump``, remove stop words with ``remove_stops``, remove tokens
    containing ``http`` with ``remove_http``, then strip digits and
    punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text.
    split : bool, default True
        When true, return the cleaned text as a list of whitespace-separated
        tokens; otherwise return it as a single string.

    Returns
    -------
    list of str or str
        The cleaned tokens (``split=True``) or the cleaned string.
    """
    digits = string.digits
    # The curly apostrophe is non-ASCII, so the ASCII filter below already
    # removes it; it stays in the punctuation set as a harmless safeguard.
    punctuation = string.punctuation + '’'

    # Dropping everything outside ASCII is what strips emoji.
    ascii_only = text.encode('ascii', 'ignore').decode('ascii')

    # NOTE(review): non-ASCII characters are gone by this point, so the stop
    # words in remove_stops spelled with a curly apostrophe (e.g. 'i’m') can
    # never match here -- confirm whether stop-word removal was meant to run
    # before the ASCII filter.

    # Lower-case before the handle replacement so mixed-case mentions such as
    # '@realDonaldTrump' are expanded too. remove_stops lower-cases every
    # token anyway, so this does not otherwise change the result.
    lowered = ascii_only.lower()
    d_trump = lowered.replace('realdonaldtrump', 'donald trump')
    no_stops = remove_stops(d_trump)
    no_http = remove_http(no_stops)
    no_digits = no_http.translate(str.maketrans('', '', digits))
    cleaned = no_digits.translate(str.maketrans('', '', punctuation))

    if split:
        return cleaned.split()
    return cleaned
    
def remove_http(text):
    """Remove every whitespace-delimited token that contains ``http``.

    Each matching token is deleted as a whole. The previous implementation
    used ``str.replace`` with the token as a substring, so a short token such
    as ``http`` would also be cut out of longer URLs and leave fragments
    (``https://x.co`` became ``s://x.co``) behind.

    Surrounding whitespace is left untouched, so a removed token leaves the
    spaces that were around it.

    Parameters
    ----------
    text : str
        Text to strip links from.

    Returns
    -------
    str
        ``text`` without any token containing ``http``.
    """
    return re.sub(r'\S*http\S*', '', text)

    
# Project-specific stop words layered on top of NLTK's English list.
_EXTRA_STOP_WORDS = (
    'i’m', 'it’s',
    'don’t', 'can’t',
    'that’s', '2',
    'lol', 'i',
    'get', 'the', '-',
    "i'm", 'it', 'a', 'u', 'amp', '•', 'i’ve',
    'chicago', 'tictac', 'flippen', 'click', 'link', 'bio', 'job',
    '_', 'apply', 'jobs', 'hiring', 'openings', 'opening', 'could', 'might',
    'via',
)

# Combined stop-word set; filled in on the first remove_stops call.
_STOP_WORD_CACHE = None


def _stop_word_set():
    """Return the combined stop-word set, building it on first use."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        _STOP_WORD_CACHE = (
            frozenset(stopwords.words('english')) | frozenset(_EXTRA_STOP_WORDS)
        )
    return _STOP_WORD_CACHE


def remove_stops(text):
    """Lower-case ``text`` and drop every stop word from it.

    The stop-word set is NLTK's English list plus the project-specific
    entries in ``_EXTRA_STOP_WORDS``. It is built once and cached as a
    frozenset, so the NLTK corpus is not reloaded and scanned linearly on
    every call; the returned text is the same as before.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined with single spaces.
    """
    stops = _stop_word_set()
    lowered = (token.lower() for token in text.split())
    return ' '.join(token for token in lowered if token not in stops)


def tokenize(self, vocab_size = 20000, maxlen = 50):
    """Convert ``self.stemmed`` into a padded matrix of word indices.

    A new Keras ``Tokenizer`` limited to ``vocab_size`` words is fit on
    ``self.stemmed``; the same texts are then mapped to integer sequences
    and passed through ``pad_sequences`` with the given ``maxlen``.

    Parameters
    ----------
    self : object
        Any object exposing a ``stemmed`` attribute holding the texts.
    vocab_size : int, default 20000
        Passed to ``Tokenizer`` as ``num_words``.
    maxlen : int, default 50
        Passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The array produced by ``pad_sequences``.
    """
    fitted = Tokenizer(num_words=vocab_size)
    fitted.fit_on_texts(self.stemmed)
    return pad_sequences(fitted.texts_to_sequences(self.stemmed), maxlen=maxlen)


def bigrams(bag, min_freq = 10):
    """Score the bigram collocations found in a sequence of tokens.

    Parameters
    ----------
    bag : iterable of str
        Tokens, in order.
    min_freq : int, default 10
        Bigrams occurring fewer than this many times are discarded before
        scoring. Pass ``0`` or ``None`` to keep every bigram.

    Returns
    -------
    list
        The ``(bigram, score)`` pairs returned by ``score_ngrams`` using the
        likelihood-ratio association measure.
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # Referenced through the module so the function does not depend on the
    # notebook's `from nltk.collocations import *`.
    finder = nltk.collocations.BigramCollocationFinder.from_words(bag)
    if min_freq:
        finder.apply_freq_filter(min_freq)

    return finder.score_ngrams(bigram_measures.likelihood_ratio)

def trigrams(bag):
    """Score the trigram collocations found in a sequence of tokens.

    No frequency filter is applied: every trigram in ``bag`` is scored with
    the likelihood-ratio association measure.

    Parameters
    ----------
    bag : iterable of str
        Tokens, in order.

    Returns
    -------
    list
        The ``(trigram, score)`` pairs returned by ``score_ngrams``.
    """
    measures = nltk.collocations.TrigramAssocMeasures()
    return TrigramCollocationFinder.from_words(bag).score_ngrams(
        measures.likelihood_ratio
    )

# Sentiment Prediction

1. Using a [twitter sentiment dataset](https://www.kaggle.com/kazanova/sentiment140) from kaggle, I will try to create a model that can predict the sentiment of tweets.
2. Then I will run the model over my twitter data from Illinois
3. Compare the sentiments of Chicago and Illinois.

## Clean Sentiment Data: 

Now let's see if I can create a model to predict the sentiment of tweets. For simplicity's sake, I will run the new dataset through a class that applies the same cleaning process used above.

*Note:* For simplicity's sake, I will be running this dataset through a class called process_dataframe that replicates the cleaning process done above.

In [444]:
# Column names are supplied explicitly for the Sentiment140 CSV.
sentiment_columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin',
    names=sentiment_columns,
)

# Clean the tweets with the shared pipeline and store the processed text
# back on the frame.
sentiment_process = process_dataframe(df, 'text')
df.text = sentiment_process.data

In [445]:
# Recode the sentiment labels to 0/1: 4 becomes 1, 0 stays 0.
change = {4:1,
         0:0}
# replace() leaves values outside the mapping untouched, so re-running this
# cell is a no-op. map() would turn the already-recoded 1s into NaN on a
# second run.
df.target = df.target.replace(change)

In [8]:
df.text[:10]

0    [switchfoot, awww, thats, bummer, shoulda, dav...
1    [upset, cant, update, facebook, texting, it, c...
2    [kenichan, dived, many, times, ball, managed, ...
3                     [whole, body, feels, itchy, fir]
4    [nationwideclass, no, behaving, all, mad, here...
5                              [kwesidei, whole, crew]
6                                          [need, hug]
7    [loltrish, hey, long, time, see, yes, rains, b...
8                                      [tatianak, nop]
9                                    [twittera, muera]
Name: text, dtype: object

First I'll tokenize the words so they can be fed to the neural network.

In [446]:
# Cap the vocabulary by word frequency (passed to Keras as num_words).
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
# Learn the word index from the cleaned training tweets.
tokenizer.fit_on_texts(df.text)
# Map every tweet to its sequence of word indices.
sequences_1 = tokenizer.texts_to_sequences(df.text)
# Bring every sequence to a fixed length of 100 so the result is one 2-D array.
data = pad_sequences(sequences_1, maxlen=100)

Uncomment the cell below to save the tokenizer.

In [41]:
# import pickle

# # saving
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Now, let's print out the shape of the matrix so we know how to calibrate the neural network.

In [18]:
data.shape

(1600000, 100)

I will now feed the text data into a Gated Recurrent Unit Neural Network

## GRU



In [459]:
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, GRU
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical

In the next cell: 
1. Create a train/test split for evaluating the accuracy of the neural network.
2. Compile the layers of the neural network.

In [461]:
# Hold out 1% of the tweets for the final test evaluation.
xtrain, xtest, ytrain_, ytest_ = train_test_split(data, df.target, train_size = .99)

# One-hot encode the 0/1 labels to match the two-unit output layer.
ytrain = to_categorical(ytrain_)
ytest = to_categorical(ytest_)

gru_model = Sequential()
# The embedding's input dimension has to match the tokenizer's vocabulary
# cap, so reuse `vocabulary_size` instead of repeating the literal 20000.
gru_model.add(Embedding(vocabulary_size, 128))
# return_sequences=True keeps the per-timestep outputs for the pooling layer.
gru_model.add(GRU(50, return_sequences=True))
gru_model.add(GlobalMaxPool1D())
gru_model.add(Dropout(0.5))
gru_model.add(Dense(50, activation='relu'))
gru_model.add(Dropout(0.5))
# NOTE(review): two sigmoid units with binary_crossentropy on one-hot labels
# trains two independent outputs; softmax + categorical_crossentropy (or a
# single sigmoid unit on the raw 0/1 labels) is the conventional setup.
# Left unchanged so the recorded results below still apply.
gru_model.add(Dense(2, activation='sigmoid'))

gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Coolio. Let's take a look at the layers and make sure everything looks ok.

In [462]:
gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_45 (Embedding)     (None, None, 128)         2560000   
_________________________________________________________________
gru_1 (GRU)                  (None, None, 50)          26850     
_________________________________________________________________
global_max_pooling1d_39 (Glo (None, 50)                0         
_________________________________________________________________
dropout_77 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_235 (Dense)            (None, 50)                2550      
_________________________________________________________________
dropout_78 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_236 (Dense)            (None, 2)                 102       
Total para

**FIT TIME**

In [463]:
# Train for two epochs in batches of 32; validation_split=0.1 sets aside 10%
# of xtrain as validation data.
gru_model.fit(xtrain, ytrain, epochs=2, batch_size=32, validation_split=0.1)

Train on 1425600 samples, validate on 158400 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1b43d5bc18>

**THREE HOURS LATER:** Let's take a look at how accurately the neural network can predict sentiment.

In [464]:
# In-sample check: [loss, accuracy] on the training split.
gru_model.evaluate(xtrain, ytrain)



[0.4286012996438176, 0.8019842171717172]

In [466]:
# Out-of-sample check: [loss, accuracy] on the held-out 1% test split.
gru_model.evaluate(xtest, ytest)



[0.4539098492860794, 0.78325]

Ok! 78% accuracy. This obviously is not perfect, but sentiment is an abstract thing to measure. Let's see what this looks like.

# Sentiment Analysis
For sentiment analysis I will:
1. Run the GRU sentiment model over the Illinois data and create a sentiment column for the predicted values.
2. Create separate dataframes for Chicago and Illinois data.
3. Create a Word2Vec model for Chicago and Illinois to find words that are related to a given topic.

The dataframe containing predicted sentiment and the word2vec models in this section are the primary pieces of data at work in a Dash app made to visualize the differences between the two groups.




from process_dataframe import process_dataframe
%load_ext autoreload
%autoreload 2

In [468]:
from process_dataframe import process_dataframe
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import data:

In [482]:
# Load the raw Illinois tweets, keep only rows that have both a `target`
# value and tweet `text`, and renumber the remaining rows from zero.
illinois = (
    pd.read_csv('final_uncleaned.csv')
    .dropna(subset=['target', 'text'])
    .reset_index(drop=True)
)

Process data:

In [492]:
# Run the Illinois tweets through the same process_dataframe pipeline that
# was applied to the sentiment training data.
lincoln = process_dataframe(illinois, 'text')

# NOTE(review): this overwrites the column the pipeline was pointed at, so
# re-running the cell presumably feeds already-processed text back in.
illinois['text'] = lincoln.data

Tokenize the data:

In [493]:
# Reuse the tokenizer fitted on the sentiment training text so the word
# indices line up with what the model was trained on. Despite its name,
# `pred_tokenizer` holds the integer sequences, not a tokenizer.
pred_tokenizer = tokenizer.texts_to_sequences(lincoln.data)
# Same maxlen as the training matrix.
pred_data = sequence.pad_sequences(pred_tokenizer, maxlen=100)

Run the GRU neural network over the tweets from Illinois and create predicted sentiment of each tweet.

In [496]:
# Sequential.predict_classes was removed from later Keras/TensorFlow
# releases. Taking the argmax over the two output units is what it computed
# for this model and works on old and new versions alike.
preds = np.argmax(gru_model.predict(pred_data), axis=-1)

illinois['sentiment'] = preds

illinois.sentiment.value_counts()

1    46451
0    23548
Name: sentiment, dtype: int64

In this project, 1 represents a positive sentiment and 0 a negative sentiment.

Aw what a bunch of positive folks :)

In [499]:
# Drop every token of 15 or more characters from each tweet's token list.
column = [
    [token for token in tokens if len(token) < 15]
    for tokens in illinois.text
]

illinois.text = column

The text data is currently a list type. Let's change it to a string type so it can be vectorized.

In [222]:
# Keep the token lists in `text`: the Word2Vec cells below iterate over that
# column and need one list of tokens per tweet (a plain string would be read
# character by character). The space-joined string goes in its own column.
illinois['text_joined'] = illinois.text.apply(' '.join)

# Create Word2Vec:

For this, I will create three word2vec models.
1. All illinois data––*What are topical relationships of the state as a whole?*
2. Chicago data––*What is Chicago talking about?*
3. All of Illinois excluding Chicago––*What differences can be found between IL and Chicago?*

**Then** I will save each of the word embedding models, and visualize them with a Dash app.

In [504]:
# Split the tweets on the `target` flag.
# NOTE(review): judging by the variable names, target == 1 marks Chicago
# tweets and target == 0 the rest of Illinois -- confirm against the data.
il_x = illinois[illinois.target == 0]
chi = illinois[illinois.target == 1]

## 1. Illinois –– All

In [549]:
# Passing the corpus to the constructor builds the vocabulary and already
# runs an initial training pass (gensim behaviour).
w2v = Word2Vec(illinois.text, size=100, window=5, min_count=1, workers=4)

# Continue training on the same corpus for 10 more epochs.
w2v.train(illinois.text, total_examples=w2v.corpus_count, epochs=10)

(4863291, 4905810)

Let's look at the most similar words! (I love this part)

In [597]:
# Query through the KeyedVectors (`.wv`), as the Chicago and non-Chicago
# cells do; the model-level most_similar is deprecated in gensim.
w2v.wv.most_similar(positive = ['illinois'])

[('airport', 0.9049680233001709),
 ('east', 0.8951677083969116),
 ('kishwaukee', 0.8946200013160706),
 ('central', 0.8909432888031006),
 ('north', 0.873832643032074),
 ('daytripper', 0.8731734752655029),
 ('loop', 0.8699049353599548),
 ('southern', 0.8686106204986572),
 ('state', 0.8685200214385986),
 ('training', 0.8682582378387451)]

Alright let's repeat the process for Chicago and Illinois excluding Chicago

## Chicago

In [551]:
# NOTE(review): the constructor receives the full Illinois corpus, so this
# model starts with the state-wide vocabulary AND an initial training pass
# over every tweet before it is trained on Chicago -- confirm that this
# shared starting point is intended.
w2v_chi = Word2Vec(illinois.text, size=100, window=5, min_count=1, workers=4)

# total_examples must be the number of sentences actually passed to train();
# it previously borrowed the corpus size of the separate all-Illinois model.
w2v_chi.train(chi.text, total_examples=len(chi.text), epochs=10)

(2382453, 2403220)

In [552]:
w2v_chi.wv.most_similar(positive = ['government'])

[('address', 0.9894412755966187),
 ('allow', 0.9891008138656616),
 ('politics', 0.9889135956764221),
 ('issue', 0.9869572520256042),
 ('responsible', 0.9869493842124939),
 ('lack', 0.9862771034240723),
 ('candidates', 0.9845634698867798),
 ('totally', 0.9839571714401245),
 ('destroy', 0.9835915565490723),
 ('prove', 0.9832010269165039)]

## Illinois excluding Chicago

In [553]:
# NOTE(review): the constructor receives the full Illinois corpus, so this
# model starts with the state-wide vocabulary AND an initial training pass
# over every tweet (Chicago included) before it is trained on the
# non-Chicago subset -- confirm that this shared starting point is intended.
w2v_il_x = Word2Vec(illinois.text, size=100, window=5, min_count=1, workers=4)

# total_examples must be the number of sentences actually passed to train();
# it previously borrowed the corpus size of the separate all-Illinois model.
w2v_il_x.train(il_x.text, total_examples=len(il_x.text), epochs=10)

(2481015, 2502590)

In [554]:
w2v_il_x.wv.most_similar(positive = ['government'])

[('politicians', 0.9770750999450684),
 ('decency', 0.9761868715286255),
 ('lies', 0.9746777415275574),
 ('rights', 0.9728584289550781),
 ('protect', 0.9712259769439697),
 ('privatesector', 0.9706145524978638),
 ('americans', 0.9696345329284668),
 ('allow', 0.9693350791931152),
 ('healthcare', 0.9692049622535706),
 ('recession', 0.9686658382415771)]

Uncomment the cell below to save the word vectors

In [560]:
#w2v.save('illinois_vectors.bin')
#w2v_chi.save('chicago_vectors.bin')
#w2v_il_x.save('il_x_vectors.bin')