In [35]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## NLTK Tweet Data Preprocessing
Rename the columns as well as change positive tweets from target = 4 to target = 1

In [36]:
import json 
# Read the json tweet files in
with open('../model-files/positive_tweets.json', 'r') as f:
    positive_tweets_data = [json.loads(line) for line in f]
with open('../model-files/negative_tweets.json', 'r') as f:
    negative_tweets_data = [json.loads(line) for line in f]

nltk_positive = pd.DataFrame(positive_tweets_data)
nltk_negative = pd.DataFrame(negative_tweets_data)
nltk_positive['target'] = 1
nltk_negative['target'] = 0

nltk_positive = nltk_positive.loc[:, ['target', 'text']]
nltk_negative = nltk_negative.loc[:, ['target', 'text']]
nltk_positive

Unnamed: 0,target,text
0,1,#FollowFriday @France_Inte @PKuchly57 @Milipol...
1,1,@Lamb2ja Hey James! How odd :/ Please call our...
2,1,@DespiteOfficial we had a listen last night :)...
3,1,@97sides CONGRATS :)
4,1,yeaaaah yippppy!!! my accnt verified rqst has...
...,...,...
4995,1,"@chriswiggin3 Chris, that's great to hear :) D..."
4996,1,@RachelLiskeard Thanks for the shout-out :) It...
4997,1,@side556 Hey! :) Long time no talk...
4998,1,@staybubbly69 as Matt would say. WELCOME TO AD...


In [38]:
"""Import the stop words to be removed"""
with open("../model-files/stop-words.txt") as f:
    stopwords = [line.strip() for line in f]
stopwords = set(stopwords)

def preprocess_tweet(tweet):
    # Convert to lowercase
    tweet = tweet.lower()
    # Remove URLs. Match words that begin with http, http\S+, www, etc... and \S+ which is just one or more non-whitespace characters
    tweet = re.sub(r'\shttp\S+|\swww\S+|\shttps\S+', ' URL ', tweet, flags=re.MULTILINE)

    # Do they same as above but match with anything that ends with .com, .net, or website endings
    tweet = re.sub(r'\S+.com\s|\S+.net\s|\S+.org\s|\S+.co\s|\S+.us\s|\S+.edu\s|\S+.me\s|\S+.cn\s|\S+.uk\s|\S+.cn\s', ' URL ', tweet, flags=re.MULTILINE)
    # Get rid of @ mentions from the tweet dataset
    tweet = re.sub(r'@\S+', '', tweet, flags=re.MULTILINE)

    # Remove punctuation
    tweet = re.sub(r'\W', ' ', tweet)
    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)

    # Tokenize the tweet (just store each word into a list)
    tokens = word_tokenize(tweet)

    tokens = [token for token in tokens if token not in stopwords] # Keep the ones that aren't a stop word
    # Lemmatize tokens, converting it back to its base form
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]


    # If a word has more than 3 consecutive characters, make it 2 characters instead
    for i in range(len(tokens)):
        chars_to_delete = list()
        for j in range(len(tokens[i]) - 2):
            if tokens[i][j] == tokens[i][j+1] and tokens[i][j] == tokens[i][j+2]:
                chars_to_delete.append(tokens[i][j])
        tokens[i] = list(tokens[i]) # Turn it into a list to delete specified values
        for ch in chars_to_delete: # Delete based off of the characters stored in chars_to_delete
            tokens[i].remove(ch)
        tokens[i] = "".join(tokens[i]) # Convert back into a string

    tokens = [token for token in tokens if len(token) > 1] # Get rid of random letters out and about from punctuation removal

    # Join tokens back into a string
    cleaned_tweet = ' '.join(tokens)
    return cleaned_tweet

In [40]:
nltk_positive['clean'] = nltk_positive['text'].apply(preprocess_tweet) # Make a column for the cleaned data
nltk_negative['clean'] = nltk_negative['text'].apply(preprocess_tweet) 

In [41]:
positive_text = list(nltk_positive['clean']) # Dataset arranged in a way such that the first half is negative and second half is positive
positive_text = [str(item) for item in positive_text]
negative_text = list(nltk_negative['clean']) # Dataset arranged in a way such that the first half is negative and second half is positive
negative_text = [str(item) for item in negative_text]

positive_sentiment = list(nltk_positive['target'])
negative_sentiment = list(nltk_negative['target'])

negative_text[0].split()

['hopeless', 'tmr']

## Use One-Hot-Encoding

We will be considering the top X frequent Tweets for both categories and doing one-hot-encoding with them. Basically, it will be a matrix where each row represents a message and each column represents each X most frequent words. If that particular word is present, mark it as 1. Otherwise, words that aren't in that message are marked with 0

## DONT RUN AGAIN AFTER WE PICKLED THE VECTORS

#### Count the frequencies up

In [42]:
negative_word_count = 0 # Total number of words for each class
positive_word_count = 0
positive_words = {}  # Word frequencies for each class
negative_words = {}

In [43]:
for i in range(len(positive_text)):
    splitted = positive_text[i].split()
    for j in range(len(splitted)):
        if splitted[j] not in positive_words: # Keep track of 
            positive_words[splitted[j]] = 0
        positive_words[splitted[j]] += 1
        positive_word_count += 1

for i in range(len(negative_text)):
    splitted = negative_text[i].split()
    for j in range(len(splitted)):
        if splitted[j] not in negative_words:
            negative_words[splitted[j]] = 0
        negative_words[splitted[j]] += 1
        negative_word_count += 1

#### Grab the top 500 most frequent words with significant meaning

In [44]:
most_freq_pos = list()
for k, v in positive_words.items():
    most_freq_pos.append((v, k))
most_freq_neg = list()
for k, v in negative_words.items():
    most_freq_neg.append((v, k))

most_freq_pos.sort()
most_freq_neg.sort() 
# We're gonna take top 500 unique words, but if the lists have the same word at the same index, don't use it
print(most_freq_pos[-10:])
print(most_freq_neg[-10:])

[(174, 'amp'), (197, 'happy'), (219, 'like'), (239, 'good'), (248, 'thank'), (249, 'day'), (288, 'follow'), (323, 'love'), (394, 'thanks'), (2252, 'URL')]
[(150, 'day'), (150, 'one'), (150, 'sorry'), (175, 'na'), (187, 'get'), (208, 'like'), (224, 'want'), (243, 'miss'), (272, 'please'), (1823, 'URL')]


In [45]:
pos_index = len(most_freq_pos) - 1
res = set()
for neg_index in range(len(most_freq_neg)-1, -1, -1):
    if most_freq_pos[pos_index][1] != most_freq_neg[neg_index][1]:
        if len(res) == 499: # Only add one if we are getting close, positive has more words so use that
            res.add(most_freq_pos[pos_index][1])
        else:
            res.add(most_freq_pos[pos_index][1])
            res.add(most_freq_neg[neg_index][1])
    pos_index -= 1
    if len(res) == 500:
        break
print(len(res))
print(res)

with open("../model-files/words.txt", 'w') as file:
    for word in res:
        file.write(word + '\n')

500
{'mention', 'get', 'definitely', 'started', 'actually', 'buy', 'damn', 'dm', 'job', 'today', 'pain', 'made', 'idk', 'hate', 'fav', 'black', 'city', 'day', 'morning', 'tomorrow', 'im', 'saying', 'final', 'show', 'finally', 'dude', 'dont', 'sure', 'hour', 'rn', 'set', 'touch', 'mine', 'lot', 'know', 'favorite', 'concert', 'story', 'youth', 'enjoyed', 'playing', 'everything', 'design', 'didnt', 'plan', 'perfect', 'idea', 'fucked', 'baby', 'much', 'wait', 'back', 'community', 'full', 'gt', 'http', 'hope', 'infinite', 'poor', 'live', 'bro', 'kind', 'think', 'say', 'first', 'stop', 'bam', 'cry', 'since', 'link', 'cream', 'leaving', 'co', 'anyway', 'nothing', 'everyone', 'great', 'worry', 'wake', 'heart', 'tired', 'anyone', 'hoping', 'mean', 'hopefully', 'agree', 'person', 'ugh', 'mum', 'watch', 'lucky', 'ever', 'pretty', 'getting', 'favourite', 'mind', 'talking', 'invite', 'gutted', 'followfriday', 'thanks', 'order', 'eye', 'already', 'read', 'wow', 'flipkartfashionfriday', 'song', 'nice

In [50]:
df = nltk_positive.append(nltk_negative)
text, sentiment = list(df['clean']), list(df['target']) # Turn these both into lists
text = [str(item) for item in text]
df

  df = nltk_positive.append(nltk_negative)


Unnamed: 0,target,text,clean
0,1,#FollowFriday @France_Inte @PKuchly57 @Milipol...,followfriday top engaged member community week
1,1,@Lamb2ja Hey James! How odd :/ Please call our...,hey james odd please call contact centre able ...
2,1,@DespiteOfficial we had a listen last night :)...,listen last night bleed amazing track scotland
3,1,@97sides CONGRATS :),congrats
4,1,yeaaaah yippppy!!! my accnt verified rqst has...,yeaah yippy accnt verified rqst succeed got bl...
...,...,...,...
4995,0,I wanna change my avi but uSanele :(,wan na change avi usanele
4996,0,MY PUPPY BROKE HER FOOT :(,puppy broke foot
4997,0,where's all the jaebum baby pictures :((,jaebum baby picture
4998,0,But but Mr Ahmad Maslan cooks too :( https://t...,mr ahmad maslan cook URL


#### Make a new array with the hot encodings for each message and sentiment

In [52]:
word_list = []
with open("../model-files/words.txt", 'r') as file:
    for line in file:
        word_list.append(line.strip())# Order doesn't matter, but needs to stay consistent so convert the set to a list
word_list_dict = dict()
for i in range(len(word_list)): # Map words to their index to make the next part O(1) instead of O(N)
    word_list_dict[word_list[i]] = i

hot_encode = dict()
vectors = list()
for i in range(len(text)):
    message = text[i]
    tokens = message.split()
    vector = [0] * len(word_list)
    for token in tokens:
        if token in word_list_dict:
            vector[word_list_dict[token]] = 1 # We have this token so for this vector, mark it accordingly
    vectors.append((vector, sentiment[i])) # Add the whole vector, AND the sentiment

# So now each index of vectors's output is just the corresponding index in the list called "sentiment"

In [53]:
import pickle
with open('../model-files/training_vectors.pkl', 'wb') as f:
    pickle.dump(vectors, f)

## Train Neural Network

In [54]:
from NeuralNet import NeuralNetwork
import random
import pickle

In [62]:
with open('../model-files/training_vectors.pkl', 'rb') as file:
    # Load the pickled object (in this case, an array)
    data_vectors = pickle.load(file)
random.shuffle(data_vectors)
for i in range(len(data_vectors)):
    curr_in, curr_out = data_vectors[i]
    input = np.array(curr_in) / 255 # Turn the input into a numpy array
    input = np.reshape(input, (-1, 1)) # Need to turn it from 1x784 to 784x1, -1 used by numpy to automatically infer the number of rows based on size of original, 1 represents I only want 1 column
    output = np.array([curr_out])
    data_vectors[i] = (input, output)
train_data, test_data = data_vectors[:8500], data_vectors[8500:]
print(train_data[0][0].shape)
print(test_data[0][0].shape) # These should be the size of the input vectors

(500, 1)
(500, 1)


In [64]:
neuralnet = NeuralNetwork([500, 300, 100, 1])
neuralnet.fit(train_data, test_data)

KeyboardInterrupt: 