In [1]:
! git clone https://github.com/kaspergroenbek98/first-first-year-project.git

Cloning into 'first-first-year-project'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects:   1% (1/73)[Kremote: Counting objects:   2% (2/73)[Kremote: Counting objects:   4% (3/73)[Kremote: Counting objects:   5% (4/73)[Kremote: Counting objects:   6% (5/73)[Kremote: Counting objects:   8% (6/73)[Kremote: Counting objects:   9% (7/73)[Kremote: Counting objects:  10% (8/73)[Kremote: Counting objects:  12% (9/73)[Kremote: Counting objects:  13% (10/73)[Kremote: Counting objects:  15% (11/73)[Kremote: Counting objects:  16% (12/73)[Kremote: Counting objects:  17% (13/73)[Kremote: Counting objects:  19% (14/73)[Kremote: Counting objects:  20% (15/73)[Kremote: Counting objects:  21% (16/73)[Kremote: Counting objects:  23% (17/73)[Kremote: Counting objects:  24% (18/73)[Kremote: Counting objects:  26% (19/73)[Kremote: Counting objects:  27% (20/73)[Kremote: Counting objects:  28% (21/73)[Kremote: Counting objects:  30% (22/73)[K

In [0]:
### Functions and import calls

import numpy as np
import nltk
import csv
import nltk
import re
from collections import Counter

### START OF VOCABULARY ###
def generate_vocabulary(data, fCol, vocabType):
    '''
    Returns a list/vocabulary of len <= "size" based on the vocabType and the featureColumn specified
    '''
    size = 2000
    # Only get large groups to get representative data
    major_features = np.array(nltk.FreqDist(data[:,fCol]).most_common(5))
    major_masks = np.array([data[:,fCol] == f for (f, cnt) in major_features])
    fqs = [tweet_word_distribution(data[mask,:]) for mask in major_masks]
    return list(vocabType(data, fCol, major_features, major_masks, fqs, size))

def vocab_feature_most_common(data, fCol, major_features, major_masks, fqs, size):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises the most common words for each feature
    '''
    vocabulary = set()
    # get the most common words in each freq dist. zip(*...) removes the counts from fd, and updates vocabulary ONLY with the words
    for fd in fqs:
        vocabulary.update(list(zip(*fd.most_common(size//len(major_features))))[0])
    return vocabulary

def vocab_feature_unique(data, fCol, major_features, major_masks, fqs, size):
    '''
    Returns a vocabulary checklist for each tweet to check off (True/False).
    Prioritises words which are uncommon in other features, but common in one feature
    '''
    major_mask = np.array(major_masks.sum(axis=0), dtype='bool')
    fq = tweet_word_distribution(data[major_mask,:]) # Get a fq over the words used by all in the major categories
    words = [word for (word, cnt) in fq.items() if cnt >= 20] # removes rarely mentioned words which probably arent indicative of a significant trend
    priorityArray = []
    for i, word in enumerate(words):
        priorityArray.append([word])
        #divide frequency of word in that state by the tweetcount from that state, and by how often that word is used in total by all states
        score = max(fqs[fID][word]/(int(major_features[fID][1])*fq[word]) for fID in range(len(major_masks)))
        priorityArray[i].append(score)
    priorityArray.sort(key = lambda x: x[1], reverse=True) # Sort them based on their best score
    vocabulary = zip(*priorityArray[:size])[0] # Removes their scores
    return vocabulary
### END OF VOCABULARY ###
    
def identify_hashtags(data):
    col = np.zeros((data.shape[0],1), 'str')
    data = np.append(data, col, axis=1)
    for i, text in enumerate(data[:,6]):
        results = re.findall(r"#\w+", text) # Finds matches and returns them as an iterable
        if results:
            data[i,15] = ' '.join(results)
        else:
            data[i,15] = ''

def tweet_features(tweet, word_features):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in tweet_words)
    return features

def divide_featureset(feature):
    '''
    Divides numpy featureset of (featureVector, classification) into a 80:10:10 train:dev:test set
    '''
    testSize = int(len(feature)*0.8)
    train, rest = feature[:testSize], feature[testSize:]
    restSize = len(rest)//2
    dev, test = rest[:restSize], rest[restSize:]
    return train, dev, test

def clean(data):
    porter = nltk.PorterStemmer()

    # Remove all stopwords, non-alphabet words (except spaces), and stem the words
    for i, row in enumerate(data[:,6]):
        row = row.lower()
        row = ''.join(char for char in row if char.isalpha() or char == ' ')
        row = ' '.join(porter.stem(word) for word in row.split() if word not in stopwords)
        data[i,6] = row

def tweet_word_distribution(data):
    # Split each sentence into tokens, and create a frequency distribution
    tokens = [token for sentence in data[:,6] for token in sentence.split()]
    fd = nltk.FreqDist(tokens)
    return fd

def generate_features(data, vocabulary, fCol):
    """ 
    Creates tuples with a vector containing boolean values depending on whether
    or not the word is in the tweet - along with the label of the tweet.
    """
    features = [(tweet_features(d.split(), vocabulary), c) for (d,c) in zip(data[:,6], data[:,fCol])] # column 6 is text data, column 2 is gender data
    return features

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Read the data into a header and a data np.array - the array is then shuffled
with open('first-first-year-project/data.csv', encoding='latin1') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    data = np.array([np.array(line) for line in csv_reader])
header, data = data[0,:], data[1:,:]
np.random.shuffle(data)
print(header)

# Identify hashtags and insert them in the 15th column
identify_hashtags(data)
print(data.shape) # Note that shape starts at 1 and ends at 16, so the interval is [0:15]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['other_topic' 'resolution_topics' 'gender' 'name' 'Resolution_Category'
 'retweet_count' 'text' 'tweet_coord' 'tweet_created' 'tweet_date'
 'tweet_id' 'tweet_location' 'tweet_state' 'user_timezone' 'tweet_region']
(5011, 15)


In [0]:
clean(data)
fCol = {'gender':2, 'Resolution_Category':4, 'tweet_state':12, 'hashtags':15} # Dont think hashtags work right now
vocabType = [vocab_feature_most_common, vocab_feature_unique][0] #Pick one by replacing the number in [int]
vocabulary =         generate_vocabulary(data, fCol['tweet_state'], vocabType)
features = generate_features(data, vocabulary, fCol['tweet_state'])
train, dev, test = divide_featureset(features)

In [5]:
classifier = nltk.NaiveBayesClassifier.train(train)
print(nltk.classify.accuracy(classifier, dev))
classifier.show_most_informative_features(10)

0.14570858283433133
Most Informative Features
          contains(quit) = True               ND : CA     =    193.3 : 1.0
          contains(step) = True               DE : CA     =    116.0 : 1.0
       contains(complet) = True               DE : CA     =    116.0 : 1.0
      contains(cariloha) = True               DE : CA     =    116.0 : 1.0
          contains(babi) = True               ND : TX     =    106.3 : 1.0
          contains(easi) = True               DE : NY     =     88.0 : 1.0
          contains(hous) = True               MT : CA     =     82.9 : 1.0
         contains(faith) = True               WY : CA     =     72.5 : 1.0
        contains(chocol) = True               WY : CA     =     72.5 : 1.0
         contains(small) = True               DE : CA     =     69.6 : 1.0


In [6]:
bigrm = list(nltk.ngrams(tokens, 2))
fdbigrm = nltk.FreqDist(bigrm)


NameError: ignored

In [7]:
word_features = list(fdbigrm.keys())[:2000]


size = 1000
featuresets = [(tweet_features(nltk.ngrams(d.split(), 2)), c) for (d,c) in zip(data[:size,6], data[:size,2])] # column 6 is text data, column 2 is gender data
train_set, test_set = featuresets[size//2:], featuresets[:size//2]
classifier = nltk.NaiveBayesClassifier.train(train_set)

NameError: ignored

In [8]:
print(train[:1])

[({'contains(goodby)': False, 'contains(realli)': False, 'contains(direct)': False, 'contains(fewer)': False, 'contains(tiburonnapl)': False, 'contains(fruit)': False, 'contains(anyth)': False, 'contains(angel)': False, 'contains(faith)': False, 'contains(help)': False, 'contains(great)': False, 'contains(ïü)': False, 'contains(exercis)': False, 'contains(daili)': False, 'contains(note)': False, 'contains(pic)': False, 'contains(la)': False, 'contains(heartach)': False, 'contains(happynewyear)': False, 'contains(everyon)': False, 'contains(effort)': False, 'contains(youtub)': False, 'contains(come)': False, 'contains(guilt)': False, 'contains(rachelclarkgiveaway)': False, 'contains(motiv)': False, 'contains(ùâùâùâ)': False, 'contains(bi)': False, 'contains(career)': False, 'contains(httptcogdwkslj)': False, 'contains(trust)': False, 'contains(got)': False, 'contains(overr)': False, 'contains(bae)': False, 'contains(blog)': False, 'contains(stick)': False, 'contains(punish)': False, 'co

# Bad Word Implementation

In [0]:
import pandas as pd
import itertools
clean(data)
np.random.shuffle(data)

In [0]:
bad_words_txt = pd.read_csv('first-first-year-project/bad-words.txt', sep='\n',)
bad_words_array = bad_words_txt.to_numpy()
bad_words_list = bad_words_array.tolist()
bad_words = list(itertools.chain(*bad_words_list))

In [0]:
tweets_created = list(data[:,8])
dates = [i.split(' ',1)[0] for i in tweets_created]
times = [i.split(' ',1)[-1] for i in tweets_created]

In [0]:
fCol = {'gender':2, 'Resolution_Category':4, 'tweet_created':8, 'tweet_state':12, 'tweet_region':14, 'hashtags':15}

## Bad words and Gender

In [0]:
features2 = generate_features(data, bad_words, fCol['gender'])
train2, dev2, test2 = divide_featureset(features2)

In [18]:
classifier2 = nltk.NaiveBayesClassifier.train(train2)
print(nltk.classify.accuracy(classifier2, dev2))
classifier2.show_most_informative_features(15)

0.47105788423153694
Most Informative Features
          contains(fear) = True           female : male   =      3.7 : 1.0
         contains(adult) = True           female : male   =      3.7 : 1.0
        contains(bigger) = True             male : female =      3.6 : 1.0
          contains(sick) = True           female : male   =      3.0 : 1.0
        contains(toilet) = True             male : female =      3.0 : 1.0
           contains(kid) = True           female : male   =      2.7 : 1.0
          contains(chin) = True           female : male   =      2.4 : 1.0
         contains(shoot) = True             male : female =      2.3 : 1.0
          contains(crap) = True             male : female =      2.3 : 1.0
          contains(poop) = True             male : female =      2.3 : 1.0
          contains(suck) = True           female : male   =      2.2 : 1.0
         contains(faith) = True           female : male   =      2.2 : 1.0
          contains(burn) = True           female : mal

## Bad words and Categories

In [19]:
features3 = generate_features(data, bad_words, fCol['Resolution_Category'])
train3, dev3, test3 = divide_featureset(features3)
classifier3 = nltk.NaiveBayesClassifier.train(train3)
print(nltk.classify.accuracy(classifier3, dev3))
classifier3.show_most_informative_features(10)

0.37524950099800397
Most Informative Features
         contains(color) = True           Time M : Person =     12.3 : 1.0
       contains(destroy) = True           Philan : Humor  =     11.6 : 1.0
        contains(german) = True           Educat : Humor  =     11.1 : 1.0
          contains(blow) = True           Financ : Person =     10.0 : 1.0
           contains(god) = True           Person : Humor  =      9.9 : 1.0
         contains(fight) = True           Philan : Health =      9.6 : 1.0
           contains(kid) = True           Philan : Health =      9.6 : 1.0
          contains(damn) = True           Philan : Health =      9.6 : 1.0
           contains(ass) = True           Educat : Person =      8.5 : 1.0
         contains(shoot) = True           Career : Person =      8.4 : 1.0


## Bad words and Regions

In [20]:
features4 = generate_features(data, bad_words, fCol['tweet_region'])
train4, dev4, test4 = divide_featureset(features4)
classifier4 = nltk.NaiveBayesClassifier.train(train4)
print(nltk.classify.accuracy(classifier4, dev4))
classifier4.show_most_informative_features(10)

0.27944111776447106
Most Informative Features
         contains(drunk) = True           Northe : South  =      5.5 : 1.0
           contains(die) = True           Midwes : South  =      4.1 : 1.0
         contains(faith) = True           Midwes : South  =      4.1 : 1.0
          contains(fear) = True           Midwes : South  =      3.7 : 1.0
          contains(sick) = True           Midwes : South  =      3.7 : 1.0
           contains(ass) = True            South : Northe =      3.3 : 1.0
        contains(bigger) = True             West : South  =      3.0 : 1.0
         contains(nigga) = True             West : South  =      3.0 : 1.0
         contains(adult) = True             West : South  =      3.0 : 1.0
          contains(suck) = True           Midwes : West   =      2.9 : 1.0


## Bad words and Dates

In [23]:
#In progress: comparison of bad words and dates
features5 = generate_features(data, bad_words, dates)
train5, dev5, test5 = divide_featureset(features5)
classifier5 = nltk.NaiveBayesClassifier.train(train5)
print(nltk.classify.accuracy(classifier5, dev5))
classifier5.show_most_informative_features(10)

IndexError: ignored

In [1]:
#In progress: creation of bad word bigrams 
tokens = [token for sentence in data[:,6] for token in sentence.split()]
bigrams2 = list(nltk.ngrams(tokens, 2))
fdbigrams2 = nltk.FreqDist(bigrams2)

NameError: ignored