<a href="https://colab.research.google.com/github/kaspergroenbek98/first-first-year-project/blob/master/Copy_of_chaosBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! git clone https://github.com/kaspergroenbek98/first-first-year-project.git

fatal: destination path 'first-first-year-project' already exists and is not an empty directory.


In [0]:
import numpy as np
import nltk
import csv
import nltk
import re

from collections import Counter

def vocab_state_common(data):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises the most common words in a state
    '''
    size = 2000//5
    big_states = Counter(data[:,12]).most_common(5)
    vocabulary = []
    for state in big_states:
        mask = (data[:,12] == state)
        vocabulary.update( Counter(data[mask,6]).most_common(size) )
    return vocabulary

def vocab_state_unique(data):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises words which are uncommon in other states, but common in one state
    '''
    size = 2000//5
    big_states = np.array(Counter(data[:,12]).most_common(5))
    state_mask = (data[:,12] == big_states)
    state_data = data[state_mask,:]

    word_priority = np.zeros((state_data.shape[0], 1))
    _, fq = token_distr(state_data)
    

    vocabulary = set()


    words = Counter()


    for state in big_states:
        mask = (state_data[:,12] == state)
        vocabulary.update( Counter(stateData[:,6]).most_common(size) )
    return vocabulary


def identify_hashtags(data):
    col = np.zeros((data.shape[0],1), 'str')
    data = np.append(data, col, axis=1)
    for i, text in enumerate(data[:,6]):
        results = re.findall(r"#\w+", text) # Finds matches and returns them as an iterable
        if results:
            data[i,15] = ' '.join(results)
        else:
            data[i,15] = ''

def tweet_features(tweet, word_features):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in tweet_words)
    return features

def divide_featureset(feature):
    '''
    Divides numpy featureset of (featureVector, classification) into a 80:10:10 train:dev:test set
    '''
    testSize = int(len(feature)*0.8)
    train, rest = feature[:testSize], feature[testSize:]
    restSize = len(rest)//2
    dev, test = rest[:restSize], rest[restSize:]
    return train, dev, test


def clean(data):
    porter = nltk.PorterStemmer()

    # Remove all stopwords, non-alphabet words (except spaces), and stem the words
    for i, row in enumerate(data[:,6]):
        row = row.lower()
        row = ''.join(char for char in row if char.isalpha() or char == ' ')
        row = ' '.join(porter.stem(word) for word in row.split() if word not in stopwords)
        data[i,6] = row

def token_distr(data):
    # Create a long string of ALL words, then token them, and create a frequency distribution
    actualWords = ' '.join(token for sentence in data[:,6] for token in sentence.split())
    tokens = actualWords.split()
    fd = nltk.FreqDist(tokens)
    return tokens, fd

def generate_features(data, fd):
    word_features = [word for (word, count) in fd.most_common(2000)]
    """ 
    Creates tuples with a vector containing boolean values depending on whether
    or not the word is in the tweet - along with the label of the tweet.
    """
    features = [(tweet_features(d.split(), word_features), c) for (d,c) in zip(data[:,6], data[:,2])] # column 6 is text data, column 2 is gender data
    return features

nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Read the data into a header and a data np.array - the array is then shuffled
with open('first-first-year-project/data.csv', encoding='latin1') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    data = np.array([np.array(line) for line in csv_reader])
header, data = data[0,:], data[1:,:]

print(header)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['other_topic' 'resolution_topics' 'gender' 'name' 'Resolution_Category'
 'retweet_count' 'text' 'tweet_coord' 'tweet_created' 'tweet_date'
 'tweet_id' 'tweet_location' 'tweet_state' 'user_timezone' 'tweet_region']


In [0]:
# Identify hashtags and insert them in the 15th column
identify_hashtags(data)
print(data.shape) # Note that shape starts at 1 and ends at 16, so the interval is [0:15]

(5011, 15)


In [0]:
clean(data)
tokens, fd = token_distr(data)

featureDict = {'gender':2, 'resolution_Category':4, 'tweet_state':12, 'hashtags':15}
featureIdx = featureDict['tweet_state']
vocabulary = vocab_state_common(data)

features = generate_features(data, fd)
train, dev, test = divide_featureset(features)

In [0]:
classifier = nltk.NaiveBayesClassifier.train(train)
print(nltk.classify.accuracy(classifier, dev))
classifier.show_most_informative_features(10)

0.5528942115768463
Most Informative Features
          contains(join) = True             male : female =     11.9 : 1.0
          contains(pray) = True           female : male   =      7.3 : 1.0
          contains(name) = True             male : female =      6.8 : 1.0
         contains(catch) = True             male : female =      6.1 : 1.0
       contains(tixwish) = True           female : male   =      5.9 : 1.0
        contains(bigger) = True             male : female =      5.5 : 1.0
       contains(basebal) = True             male : female =      5.5 : 1.0
        contains(youtub) = True             male : female =      5.5 : 1.0
 contains(mlbpaclubhous) = True             male : female =      5.3 : 1.0
         contains(chanc) = True           female : male   =      5.2 : 1.0


In [0]:
bigrm = list(nltk.ngrams(tokens, 2))
fdbigrm = nltk.FreqDist(bigrm)


In [0]:
word_features = list(fdbigrm.keys())[:2000]


size = 1000
featuresets = [(tweet_features(nltk.ngrams(d.split(), 2)), c) for (d,c) in zip(data[:size,6], data[:size,2])] # column 6 is text data, column 2 is gender data
train_set, test_set = featuresets[size//2:], featuresets[:size//2]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [0]:
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)

In [0]:
'newyearsresolution' in stopwords