<a href="https://colab.research.google.com/github/kaspergroenbek98/first-first-year-project/blob/master/chaosBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! git clone https://github.com/kaspergroenbek98/first-first-year-project.git

fatal: destination path 'first-first-year-project' already exists and is not an empty directory.


In [0]:
### Functions and import calls

import numpy as np
import nltk
import csv
import nltk
import re
from collections import Counter

### START OF VOCABULARY ###
def generate_vocabulary(data, fCol, vocabType):
    '''
    Returns a list/vocabulary of len <= "size" based on the vocabType and the featureColumn specified
    '''
    size = 2000
    # Only get large groups to get representative data
    major_features = np.array(nltk.FreqDist(data[:,fCol]).most_common(5))
    major_masks = np.array([data[:,fCol] == f for (f, cnt) in major_features])
    fqs = [tweet_word_distribution(data[mask,:]) for mask in major_masks]
    return list(vocabType(data, fCol, major_features, major_masks, fqs, size))

def vocab_feature_most_common(data, fCol, major_features, major_masks, fqs, size):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises the most common words for each feature
    '''
    vocabulary = set()
    # get the most common words in each freq dist. zip(*...) removes the counts from fd, and updates vocabulary ONLY with the words
    for fd in fqs:
        vocabulary.update(list(zip(*fd.most_common(size//len(major_features))))[0])
    return vocabulary

def vocab_feature_unique(data, fCol, major_features, major_masks, fqs, size):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises words which are uncommon in other features, but common in one feature
    '''
    major_mask = np.array(major_masks.sum(axis=0), dtype='bool')
    fq = tweet_word_distribution(data[major_mask,:]) # Get a fq over the words used by all in the major categories
    words = [word for (word, cnt) in fq.items() if cnt >= 20] # removes rarely mentioned words which probably arent indicative of a significant trend
    priorityArray = []
    for i, word in enumerate(words):
        priorityArray.append([word])
        #divide frequency of word in that state by the tweetcount from that state, and by how often that word is used in total by all states
        score = max(fqs[fID][word]/(int(major_features[fID][1])*fq[word]) for fID in range(len(major_masks)))
        priorityArray[i].append(score)
    priorityArray.sort(key = lambda x: x[1], reverse=True) # Sort them based on their best score
    vocabulary = zip(*priorityArray[:size])[0] # Removes their scores
    return vocabulary
### END OF VOCABULARY ###
    
def identify_hashtags(data):
    col = np.zeros((data.shape[0],1), 'str')
    data = np.append(data, col, axis=1)
    for i, text in enumerate(data[:,6]):
        results = re.findall(r"#\w+", text) # Finds matches and returns them as an iterable
        if results:
            data[i,15] = ' '.join(results)
        else:
            data[i,15] = ''

def tweet_features(tweet, word_features):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in tweet_words)
    return features

def divide_featureset(feature):
    '''
    Divides numpy featureset of (featureVector, classification) into a 80:10:10 train:dev:test set
    '''
    testSize = int(len(feature)*0.8)
    train, rest = feature[:testSize], feature[testSize:]
    restSize = len(rest)//2
    dev, test = rest[:restSize], rest[restSize:]
    return train, dev, test

def clean(data):
    porter = nltk.PorterStemmer()

    # Remove all stopwords, non-alphabet words (except spaces), and stem the words
    for i, row in enumerate(data[:,6]):
        row = row.lower()
        row = ''.join(char for char in row if char.isalpha() or char == ' ')
        row = ' '.join(porter.stem(word) for word in row.split() if word not in stopwords)
        data[i,6] = row

def tweet_word_distribution(data):
    # Split each sentence into tokens, and create a frequency distribution
    tokens = [token for sentence in data[:,6] for token in sentence.split()]
    fd = nltk.FreqDist(tokens)
    return fd

def generate_features(data, vocabulary, fCol):
    """ 
    Creates tuples with a vector containing boolean values depending on whether
    or not the word is in the tweet - along with the label of the tweet.
    """
    features = [(tweet_features(d.split(), vocabulary), c) for (d,c) in zip(data[:,6], data[:,fCol])] # column 6 is text data, column 2 is gender data
    return features

In [0]:
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Read the data into a header and a data np.array - the array is then shuffled
with open('first-first-year-project/data.csv', encoding='latin1') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    data = np.array([np.array(line) for line in csv_reader])
header, data = data[0,:], data[1:,:]
np.random.shuffle(data)
print(header)

# Identify hashtags and insert them in the 15th column
identify_hashtags(data)
print(data.shape) # Note that shape starts at 1 and ends at 16, so the interval is [0:15]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['other_topic' 'resolution_topics' 'gender' 'name' 'Resolution_Category'
 'retweet_count' 'text' 'tweet_coord' 'tweet_created' 'tweet_date'
 'tweet_id' 'tweet_location' 'tweet_state' 'user_timezone' 'tweet_region']
(5011, 15)


In [0]:
clean(data)
fCol = {'gender':2, 'Resolution_Category':4, 'tweet_state':12, 'hashtags':15} # Dont think hashtags work right now
vocabType = [vocab_feature_most_common, vocab_feature_unique][0] #Pick one by replacing the number in [int]
vocabulary =         generate_vocabulary(data, fCol['tweet_state'], vocabType)
features = generate_features(data, vocabulary, fCol['tweet_state'])
train, dev, test = divide_featureset(features)

In [5]:
classifier = nltk.NaiveBayesClassifier.train(train)
print(nltk.classify.accuracy(classifier, dev))
classifier.show_most_informative_features(10)

0.13373253493013973
Most Informative Features
          contains(date) = True               MT : CA     =    149.5 : 1.0
        contains(hahaha) = True               MT : CA     =    149.5 : 1.0
       contains(correct) = True               MT : NY     =    110.8 : 1.0
          contains(hous) = True               MT : NY     =    110.8 : 1.0
      contains(mcdonald) = True               MT : CA     =     89.7 : 1.0
          contains(wast) = True               WY : CA     =     85.4 : 1.0
         contains(faith) = True               WY : CA     =     85.4 : 1.0
      contains(cariloha) = True               DE : CA     =     85.4 : 1.0
        contains(chocol) = True               DE : CA     =     85.4 : 1.0
          contains(babi) = True               ND : TX     =     81.8 : 1.0


In [6]:
bigrm = list(nltk.ngrams(tokens, 2))
fdbigrm = nltk.FreqDist(bigrm)


NameError: ignored

In [0]:
word_features = list(fdbigrm.keys())[:2000]


size = 1000
featuresets = [(tweet_features(nltk.ngrams(d.split(), 2)), c) for (d,c) in zip(data[:size,6], data[:size,2])] # column 6 is text data, column 2 is gender data
train_set, test_set = featuresets[size//2:], featuresets[:size//2]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [0]:
print(train[:1])