<a href="https://colab.research.google.com/github/kaspergroenbek98/first-first-year-project/blob/master/chaosBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! git clone https://github.com/kaspergroenbek98/first-first-year-project.git

fatal: destination path 'first-first-year-project' already exists and is not an empty directory.


In [0]:
### Functions and import calls

import numpy as np
import nltk
import csv
import nltk
import re
from collections import Counter

### START OF VOCABULARY ###
def generate_vocabulary(data, fCol, vocabType):
    '''
    Returns a list/vocabulary of len <= "size" based on the vocabType and the featureColumn specified
    '''
    size = 2000
    # Only get large groups to get representative data
    major_features = np.array(nltk.FreqDist(data[:,fCol]).most_common(5))
    major_masks = np.array([data[:,fCol] == f for (f, cnt) in major_features])
    fqs = [tweet_word_distribution(data[mask,:]) for mask in major_masks]
    return list(vocabType(data, fCol, major_features, major_masks, fqs, size))

def vocab_feature_most_common(data, fCol, major_features, major_masks, fqs, size):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises the most common words for each feature
    '''
    vocabulary = set()
    # get the most common words in each freq dist. zip(*...) removes the counts from fd, and updates vocabulary ONLY with the words
    for fd in fqs:
        vocabulary.update(list(zip(*fd.most_common(size//len(major_features))))[0])
    return vocabulary

def vocab_feature_unique(data, fCol, major_features, major_masks, fqs, size):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises words which are uncommon in other features, but common in one feature
    '''
    major_mask = np.array(major_masks.sum(axis=0), dtype='bool')
    fq = tweet_word_distribution(data[major_mask,:]) # Get a fq over the words used by all in the major categories
    words = [word for (word, cnt) in fq.items() if cnt >= 20] # removes rarely mentioned words which probably arent indicative of a significant trend
    priorityArray = []
    for i, word in enumerate(words):
        priorityArray.append([word])
        #divide frequency of word in that state by the tweetcount from that state, and by how often that word is used in total by all states
        score = max(fqs[fID][word]/(int(major_features[fID][1])*fq[word]) for fID in range(len(major_masks)))
        priorityArray[i].append(score)
    priorityArray.sort(key = lambda x: x[1], reverse=True) # Sort them based on their best score
    vocabulary = zip(*priorityArray[:size])[0] # Removes their scores
    return vocabulary
### END OF VOCABULARY ###
    
def identify_hashtags(data):
    col = np.zeros((data.shape[0],1), 'str')
    data = np.append(data, col, axis=1)
    for i, text in enumerate(data[:,6]):
        results = re.findall(r"#\w+", text) # Finds matches and returns them as an iterable
        if results:
            data[i,15] = ' '.join(results)
        else:
            data[i,15] = ''

def tweet_features(tweet, word_features):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in tweet_words)
    return features

def divide_featureset(feature):
    '''
    Divides numpy featureset of (featureVector, classification) into a 80:10:10 train:dev:test set
    '''
    testSize = int(len(feature)*0.8)
    train, rest = feature[:testSize], feature[testSize:]
    restSize = len(rest)//2
    dev, test = rest[:restSize], rest[restSize:]
    return train, dev, test

def clean(data):
    porter = nltk.PorterStemmer()

    # Remove all stopwords, non-alphabet words (except spaces), and stem the words
    for i, row in enumerate(data[:,6]):
        row = row.lower()
        row = ''.join(char for char in row if char.isalpha() or char == ' ')
        row = ' '.join(porter.stem(word) for word in row.split() if word not in stopwords)
        data[i,6] = row

def tweet_word_distribution(data):
    # Split each sentence into tokens, and create a frequency distribution
    tokens = [token for sentence in data[:,6] for token in sentence.split()]
    fd = nltk.FreqDist(tokens)
    return fd

def generate_features(data, vocabulary, fCol):
    """ 
    Creates tuples with a vector containing boolean values depending on whether
    or not the word is in the tweet - along with the label of the tweet.
    """
    features = [(tweet_features(d.split(), vocabulary), c) for (d,c) in zip(data[:,6], data[:,fCol])] # column 6 is text data, column 2 is gender data
    return features

In [0]:
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Read the data into a header and a data np.array - the array is then shuffled
with open('first-first-year-project/data.csv', encoding='latin1') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    data = np.array([np.array(line) for line in csv_reader])
header, data = data[0,:], data[1:,:]
np.random.shuffle(data)
print(header)

# Identify hashtags and insert them in the 15th column
identify_hashtags(data)
print(data.shape) # Note that shape starts at 1 and ends at 16, so the interval is [0:15]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['other_topic' 'resolution_topics' 'gender' 'name' 'Resolution_Category'
 'retweet_count' 'text' 'tweet_coord' 'tweet_created' 'tweet_date'
 'tweet_id' 'tweet_location' 'tweet_state' 'user_timezone' 'tweet_region']
(5011, 15)


In [0]:
clean(data)
fCol = {'gender':2, 'Resolution_Category':4, 'tweet_state':12, 'hashtags':15} # Dont think hashtags work right now
vocabType = [vocab_feature_most_common, vocab_feature_unique][0] #Pick one by replacing the number in [int]
vocabulary =         generate_vocabulary(data, fCol['tweet_state'], vocabType)
features = generate_features(data, vocabulary, fCol['tweet_state'])
train, dev, test = divide_featureset(features)

In [0]:
classifier = nltk.NaiveBayesClassifier.train(train)
print(nltk.classify.accuracy(classifier, dev))
classifier.show_most_informative_features(10)

0.1536926147704591
Most Informative Features
          contains(quit) = True               ND : CA     =    150.0 : 1.0
          contains(easi) = True               DE : CA     =     85.7 : 1.0
          contains(date) = True               MT : CA     =     85.7 : 1.0
        contains(chocol) = True               DE : CA     =     85.7 : 1.0
        contains(hahaha) = True               MT : CA     =     85.7 : 1.0
          contains(hous) = True               MT : CA     =     85.7 : 1.0
        contains(colleg) = True               NH : CA     =     83.3 : 1.0
          contains(babi) = True               ND : TX     =     80.8 : 1.0
 contains(mlbpaclubhous) = True               ND : TX     =     80.8 : 1.0
          contains(noth) = True               WY : CA     =     75.0 : 1.0


In [0]:
bigrm = list(nltk.ngrams(tokens, 2))
fdbigrm = nltk.FreqDist(bigrm)


NameError: ignored

In [0]:
word_features = list(fdbigrm.keys())[:2000]


size = 1000
featuresets = [(tweet_features(nltk.ngrams(d.split(), 2)), c) for (d,c) in zip(data[:size,6], data[:size,2])] # column 6 is text data, column 2 is gender data
train_set, test_set = featuresets[size//2:], featuresets[:size//2]
classifier = nltk.NaiveBayesClassifier.train(train_set)

NameError: ignored

In [0]:
print(train[:1])

[({'contains(crochet)': False, 'contains(seek)': False, 'contains(duck)': False, 'contains(ûïlaurenmabra)': False, 'contains(face)': False, 'contains(song)': False, 'contains(girlfriend)': False, 'contains(k)': False, 'contains(ûïtumbierpost)': False, 'contains(tattoo)': False, 'contains(limit)': False, 'contains(pot)': False, 'contains(someon)': False, 'contains(û)': False, 'contains(health)': False, 'contains(show)': False, 'contains(woman)': False, 'contains(content)': False, 'contains(even)': False, 'contains(whenev)': False, 'contains(blog)': False, 'contains(emot)': False, 'contains(tonight)': False, 'contains(find)': False, 'contains(helpoth)': False, 'contains(grace)': False, 'contains(justinbieb)': False, 'contains(mû)': False, 'contains(ween)': False, 'contains(plank)': False, 'contains(hair)': False, 'contains(shape)': False, 'contains(earli)': False, 'contains(bring)': False, 'contains(ass)': False, 'contains(supper)': False, 'contains(bjork)': False, 'contains(import)': Fa

# Bad Word Implementation

In [0]:
import pandas as pd
import itertools
clean(data)
np.random.shuffle(data)

In [0]:
bad_words_txt = pd.read_csv('first-first-year-project/bad-words.txt', sep='\n',)
bad_words_array = bad_words_txt.to_numpy()
bad_words_list = bad_words_array.tolist()
bad_words = list(itertools.chain(*bad_words_list))

In [0]:
tweets_created = list(data[:,8])
dates = [i.split(' ',1)[0] for i in tweets_created]
times = [i.split(' ',1)[-1] for i in tweets_created]

In [0]:
fCol = {'gender':2, 'Resolution_Category':4, 'tweet_created':8, 'tweet_state':12, 'tweet_region':14, 'hashtags':15}

## Bad words and Gender

In [0]:
features2 = generate_features(data, bad_words, fCol['gender'])
train2, dev2, test2 = divide_featureset(features2)

In [0]:
classifier2 = nltk.NaiveBayesClassifier.train(train2)
print(nltk.classify.accuracy(classifier2, dev2))
classifier2.show_most_informative_features(15)

0.4870259481037924
Most Informative Features
        contains(bigger) = True             male : female =      6.4 : 1.0
          contains(crap) = True             male : female =      3.1 : 1.0
        contains(toilet) = True             male : female =      3.1 : 1.0
         contains(shoot) = True             male : female =      3.1 : 1.0
          contains(poop) = True             male : female =      3.1 : 1.0
          contains(butt) = True             male : female =      2.6 : 1.0
         contains(fight) = True             male : female =      2.5 : 1.0
           contains(god) = True             male : female =      2.5 : 1.0
           contains(fat) = True             male : female =      2.4 : 1.0
          contains(blow) = True             male : female =      2.4 : 1.0
        contains(heroin) = True             male : female =      2.4 : 1.0
          contains(cunt) = True             male : female =      2.4 : 1.0
          contains(hell) = True           female : male

## Bad words and Categories

In [0]:
features3 = generate_features(data, bad_words, fCol['Resolution_Category'])
train3, dev3, test3 = divide_featureset(features3)
classifier3 = nltk.NaiveBayesClassifier.train(train3)
print(nltk.classify.accuracy(classifier3, dev3))
classifier3.show_most_informative_features(10)

0.3852295409181637
Most Informative Features
        contains(german) = True           Educat : Person =     18.4 : 1.0
         contains(nigga) = True           Career : Person =     13.7 : 1.0
          contains(damn) = True           Philan : Person =     13.0 : 1.0
           contains(kid) = True           Philan : Health =     10.6 : 1.0
          contains(blow) = True           Financ : Person =      9.5 : 1.0
         contains(fight) = True           Philan : Person =      9.3 : 1.0
           contains(god) = True           Person : Humor  =      9.1 : 1.0
         contains(death) = True           Career : Humor  =      8.3 : 1.0
        contains(fucker) = True           Career : Humor  =      8.3 : 1.0
        contains(harder) = True           Career : Humor  =      8.3 : 1.0


## Bad words and Regions

In [0]:
features4 = generate_features(data, bad_words, fCol['tweet_region'])
train4, dev4, test4 = divide_featureset(features4)
classifier4 = nltk.NaiveBayesClassifier.train(train4)
print(nltk.classify.accuracy(classifier4, dev4))
classifier4.show_most_informative_features(10)

0.2954091816367265
Most Informative Features
        contains(bigger) = True             West : South  =      4.7 : 1.0
          contains(sick) = True           Midwes : South  =      3.8 : 1.0
         contains(black) = True           Northe : South  =      3.6 : 1.0
           contains(kid) = True             West : South  =      3.1 : 1.0
         contains(shoot) = True             West : South  =      3.0 : 1.0
          contains(poop) = True             West : South  =      3.0 : 1.0
          contains(fear) = True           Midwes : West   =      3.0 : 1.0
          contains(suck) = True           Midwes : West   =      3.0 : 1.0
         contains(fight) = True           Northe : Midwes =      2.8 : 1.0
        contains(harder) = True           Midwes : West   =      2.8 : 1.0


## Bad words and Dates

In [0]:
#In progress: comparison of bad words and dates
features5 = generate_features(data, bad_words, dates)
train5, dev5, test5 = divide_featureset(features5)
classifier5 = nltk.NaiveBayesClassifier.train(train5)
print(nltk.classify.accuracy(classifier5, dev5))
classifier5.show_most_informative_features(10)

IndexError: ignored

In [0]:
#In progress: creation of bad word bigrams 
tokens = [token for sentence in data[:,6] for token in sentence.split()]
bigrams2 = list(nltk.ngrams(tokens, 2))
fdbigrams2 = nltk.FreqDist(bigrams2)

#**K-Fold implementation** (WORKING!)

In [0]:
# Data preperations, set all the relevant labels here as done earlier
np.random.shuffle(data)
kf_fCol = {'gender':2, 'Resolution_Category':4, 'tweet_state':12, 'hashtags':15}
kf_vocabType = [vocab_feature_most_common, vocab_feature_unique][0] #Pick one by replacing the number in [int]
kf_vocabulary = generate_vocabulary(data, kf_fCol['gender'], kf_vocabType)
kf_features = generate_features(data, vocabulary, fCol['gender'])


#kf_features = kf_features[:500]  # Define the amount of data to use

In [65]:
# Import KFold model and define the amount of folds
from sklearn.model_selection import KFold  # import model
k = 5  # amount of folds

# Store an object of the KFold class in a variable with shuffle=True
kf = KFold(n_splits=k, shuffle=True)

""" 
Loop for the models and their designated folds (1 to k)
and calculate the average of each of the 'k' models' accuracy
using the NB classifier and k-fold model class.
"""
sum = 0 # sum to calculate average of model accuracies
iteration = 1
for train, test in kf.split(kf_features): # Loop over K chunk of data splits
    train_data = np.array(kf_features)[train] # New chunk of train data
    test_data = np.array(kf_features)[test] # New chunk of test data

    # Define the NB classifier to the train data
    kd_classifier = nltk.NaiveBayesClassifier.train(train_data)

    print("Iteration no.", iteration, "\n")
    print("Accuracy of iteration number", iteration, ":", nltk.classify.accuracy(kd_classifier, test_data))
    kd_classifier.show_most_informative_features(10)
    print("\n\n")

    sum += nltk.classify.accuracy(kd_classifier, test_data) # Add current sum to total

    iteration += 1 # Increment variable to display next iteration

# Average calculated by the sum divided by the number of folds
average = sum/k
print("The accuracy average of the K-fold models:", average)

Iteration no. 1 

Accuracy of iteration number 1 : 0.5776892430278885
Most Informative Features
      contains(cariloha) = True           female : male   =     12.7 : 1.0
contains(hashtagoftheweek) = True           female : male   =     10.6 : 1.0
        contains(youtub) = True             male : female =      6.3 : 1.0
        contains(chocol) = True           female : male   =      5.7 : 1.0
        contains(bigger) = True             male : female =      5.6 : 1.0
          contains(dick) = True           female : male   =      5.0 : 1.0
       contains(tixwish) = True           female : male   =      5.0 : 1.0
          contains(hurt) = True           female : male   =      5.0 : 1.0
          contains(weed) = True             male : female =      5.0 : 1.0
          contains(wife) = True             male : female =      5.0 : 1.0



Iteration no. 2 

Accuracy of iteration number 2 : 0.5568862275449101
Most Informative Features
      contains(cariloha) = True           female : ma