# task1. preprocessing tweets
### load datasets

In [1]:
import json

# Load the tweet -> country mapping. The context manager closes the file handle,
# and the explicit encoding keeps non-ASCII tweets (emoji etc.) readable
# regardless of the platform's default locale encoding.
with open("data.json", encoding="utf-8") as data_file:
    data = json.load(data_file)

# Dict insertion order is preserved, so x[i] and y[i] always belong together.
x = list(data.keys())      # tweet texts
y = list(data.values())    # country labels

print("Number of tweets =", len(x))
print("Number of labels =", len(y))
print("\nSamples of data:")
for i in range(10):
    print("Country =", y[i], "\tTweet =", x[i])

# Sanity check: the dataset is expected to hold exactly 943 labelled tweets.
assert(len(x) == 943)
assert(len(y) == 943)


Number of tweets = 943
Number of labels = 943

Samples of data:
Country = us 	Tweet = @Addictd2Success thx u for following
Country = us 	Tweet = Let's just say, if I were to ever switch teams, Khalesi would be top of the list. #girlcrush
Country = ph 	Tweet = Taemin jonghyun!!! Your birits make me go~ http://t.co/le8z3dntlA
Country = id 	Tweet = depart.senior 👻 rapat perdana (with Nyayu, Anita, and 8 others at Ruang Aescullap FK Unsri Madang) — https://t.co/swRALlNkrQ
Country = ph 	Tweet = Done with internship with this pretty little lady!  (@ Metropolitan Medical Center w/ 3 others) [pic]: http://t.co/1qH61R1t5r
Country = gb 	Tweet = Wow just Boruc's clanger! Haha Sunday League stuff that, Giroud couldn't believe his luck! #clown
Country = my 	Tweet = I'm at Sushi Zanmai (Petaling Jaya, Selangor) w/ 5 others http://t.co/bcNobykZ
Country = us 	Tweet = Mega Fest!!!! Its going down🙏🙌  @BishopJakes
Country = gb 	Tweet = @EllexxxPharrell wow love the pic babe xx
Country = us 	Tweet = You 

### Preprocessing for tweets 

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re

# Tweet-aware tokenizer instance used by preprocess_data.
tt = TweetTokenizer()
# NOTE(review): this rebinds the imported `stopwords` corpus name to a plain set,
# so the corpus reader is no longer reachable under that name after this line.
stopwords = set(stopwords.words('english')) #note: stopwords are all in lowercase

def preprocess_data(data, labels):
    """Turn raw tweets into bag-of-words dictionaries.

    Each tweet is tokenized with the module-level ``tt`` tokenizer and every
    token is lowercased. A token is kept only if it contains at least one
    letter a-z and is not in the module-level ``stopwords`` set.

    Args:
        data: iterable of tweet strings.
        labels: iterable of labels, paired with ``data`` by position.

    Returns:
        A tuple ``(cleaned_tokens, cleaned_labels)``: a list of
        ``{token: count}`` dicts and the list of matching labels. A tweet
        with no surviving token is dropped together with its label.
    """
    cleaned_tokens, cleaned_labels = [], []

    for tweet, label in zip(data, labels):
        counts = {}
        for raw_token in tt.tokenize(tweet):
            token = raw_token.lower()
            # Skip tokens without any English letter (numbers, punctuation, emoji).
            if re.search("[a-z]+", token) is None:
                continue
            # Skip stopwords.
            if token in stopwords:
                continue
            counts[token] = counts.get(token, 0) + 1

        # Keep the tweet only when at least one token survived the filters.
        if counts:
            cleaned_tokens.append(counts)
            cleaned_labels.append(label)

    return cleaned_tokens, cleaned_labels

# Preprocess the full dataset; tweets left without any token are dropped,
# so the result can be shorter than the raw input.
x_processed, y_processed = preprocess_data(x, y)

print("Number of preprocessed tweets =", len(x_processed))
print("Number of preprocessed labels =", len(y_processed))
print("\nSamples of preprocessed data:")
# Show the first ten token-count dictionaries next to their labels.
for i in range(10):
    print("Country =", y_processed[i], "\tTweet =", x_processed[i])

Number of preprocessed tweets = 943
Number of preprocessed labels = 943

Samples of preprocessed data:
Country = us 	Tweet = {'@addictd2success': 1, 'thx': 1, 'u': 1, 'following': 1}
Country = us 	Tweet = {"let's": 1, 'say': 1, 'ever': 1, 'switch': 1, 'teams': 1, 'khalesi': 1, 'would': 1, 'top': 1, 'list': 1, '#girlcrush': 1}
Country = ph 	Tweet = {'taemin': 1, 'jonghyun': 1, 'birits': 1, 'make': 1, 'go': 1, 'http://t.co/le8z3dntla': 1}
Country = id 	Tweet = {'depart.senior': 1, 'rapat': 1, 'perdana': 1, 'nyayu': 1, 'anita': 1, 'others': 1, 'ruang': 1, 'aescullap': 1, 'fk': 1, 'unsri': 1, 'madang': 1, 'https://t.co/swrallnkrq': 1}
Country = ph 	Tweet = {'done': 1, 'internship': 1, 'pretty': 1, 'little': 1, 'lady': 1, 'metropolitan': 1, 'medical': 1, 'center': 1, 'w': 1, 'others': 1, 'pic': 1, 'http://t.co/1qh61r1t5r': 1}
Country = gb 	Tweet = {'wow': 1, "boruc's": 1, 'clanger': 1, 'haha': 1, 'sunday': 1, 'league': 1, 'stuff': 1, 'giroud': 1, 'believe': 1, 'luck': 1, '#clown': 1}
Countr

In [3]:
# Test block: features and labels must stay aligned, and preprocessing must
# keep more than 800 tweets.
assert(len(x_processed) == len(y_processed))
assert(len(x_processed) > 800)

# Task2. text Classification

### split datasets

In [10]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Split the data 70:15:15 (train:dev:test). Both splits are stratified so each
# part keeps the same label distribution, and seeded so they are reproducible.
x_train, x_temp, y_train, y_temp = train_test_split(
    x_processed,
    y_processed,
    test_size=0.3,
    stratify=y_processed,
    random_state=1,
)
x_dev, x_test, y_dev, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=1,
)

# Vectorize the token-count dicts: the vocabulary is learned from the training
# split only and then reused unchanged for the dev and test splits.
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_dev, x_test = (vectorizer.transform(part) for part in (x_dev, x_test))

#### Naive Bayes and Logistic Regression

In [11]:
import numpy as np   # required by np.logspace below; do not rely on a later cell having imported it
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Tune the smoothing hyper-parameter of MultinomialNB on the dev split.
best_acc_NB = 0
best_clfNB = None
alphas = np.logspace(-3, 2, num = 6)   # 0.001, 0.01, ..., 100

print("===========================\nMultinomialNB")
for a in alphas:
    clfNB = MultinomialNB(alpha = a)
    clfNB.fit(x_train,y_train)
    prediction_NB = clfNB.predict(x_dev)
    acc_NB = accuracy_score(y_dev,prediction_NB)
    print("alpha", a, "w/ accuracy", acc_NB)
    # ">=" means that on a tie the candidate tried later replaces the earlier one.
    if acc_NB >= best_acc_NB:
        best_acc_NB = acc_NB
        best_clfNB = clfNB

# Tune the inverse regularisation strength of LogisticRegression on the dev split.
best_acc_LR = 0
best_clfLR = None
# solver = ['newton-cg', 'liblinear','lbfgs']
# penalty = ['none', 'l1', 'l2']
C = [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100]

print("===========================\nLogisticRegression")
for c in C:
    clfLR = LogisticRegression(C = c)
    clfLR.fit(x_train,y_train)
    prediction_LR = clfLR.predict(x_dev)
    acc_LR = accuracy_score(y_dev,prediction_LR)
    print("C", c, "w/ accuracy", acc_LR)
    # Same tie-breaking rule as above: a later candidate wins a tie.
    if acc_LR >= best_acc_LR:
        best_acc_LR = acc_LR
        best_clfLR = clfLR

# Print the best classifier of each family together with its dev accuracy.
print("\n\nbest classifier of mutinomial naive bayes model : %s with accuracy (%.3f)"
      % (best_clfNB,best_acc_NB))
print("best classifier of logistic regression model : %s with accuracy (%.3f)"
      % (best_clfLR, best_acc_LR))

MultinomialNB
alpha 0.001 w/ accuracy 0.23404255319148937
alpha 0.01 w/ accuracy 0.23404255319148937
alpha 0.1 w/ accuracy 0.22695035460992907
alpha 1.0 w/ accuracy 0.2198581560283688
alpha 10.0 w/ accuracy 0.2127659574468085
alpha 100.0 w/ accuracy 0.19148936170212766
LogisticRegression
C 0.001 w/ accuracy 0.22695035460992907
C 0.01 w/ accuracy 0.22695035460992907
C 0.1 w/ accuracy 0.22695035460992907
C 0.5 w/ accuracy 0.22695035460992907
C 1 w/ accuracy 0.22695035460992907
C 5 w/ accuracy 0.23404255319148937
C 10 w/ accuracy 0.24822695035460993
C 50 w/ accuracy 0.23404255319148937
C 100 w/ accuracy 0.23404255319148937


best classifier of mutinomial naive bayes model : MultinomialNB(alpha=0.01) with accuracy (0.234)
best classifier of logistic regression model : LogisticRegression(C=10) with accuracy (0.248)


#### performance evaluation with the best parameters

In [12]:
# import library
from sklearn.metrics import f1_score, classification_report

# Fit the best Naive Bayes classifier on the training data and predict the
# labels of the test data.
best_clfNB.fit(x_train,y_train)
nb_prediction = best_clfNB.predict(x_test)

print("\n>> MultinomialNB Model Results <<\n")
print("Accuracy :", round(accuracy_score(y_test,nb_prediction), 3))
print("Macro Avg. F-score :", round(f1_score(y_test,nb_prediction, average='macro'), 3))
print(classification_report(y_test, nb_prediction))


# Fit the best Logistic Regression classifier on the training data and predict
# the labels of the test data.
best_clfLR.fit(x_train,y_train)
lr_prediction = best_clfLR.predict(x_test)

# The model is a LogisticRegression classifier, so the header must not say "Linear".
print("\n>> Logistic Regression Model Results <<\n")
print("Accuracy :", round(accuracy_score(y_test,lr_prediction), 3))
print("Macro Avg. F-score :", round(f1_score(y_test,lr_prediction, average='macro'),3))
print(classification_report(y_test, lr_prediction))

'''
According to the results, the LR model performs better than the NB model on the test dataset.
'''


>> MultinomialNB Model Results <<

Accuracy : 0.289
Macro Avg. F-score : 0.284
              precision    recall  f1-score   support

          au       0.24      0.27      0.25        15
          ca       0.12      0.13      0.13        15
          de       0.18      0.25      0.21         8
          gb       0.28      0.33      0.30        15
          id       0.60      0.20      0.30        15
          my       0.33      0.47      0.39        15
          ph       0.47      0.53      0.50        15
          sg       0.40      0.29      0.33        14
          us       0.21      0.20      0.21        15
          za       0.23      0.20      0.21        15

    accuracy                           0.29       142
   macro avg       0.31      0.29      0.28       142
weighted avg       0.31      0.29      0.29       142


>> Linear Regression Model Results <<

Accuracy : 0.331
Macro Avg. F-score : 0.323
              precision    recall  f1-score   support

          au       0.6

'\nAccording to the result, LR model performs better with test dataset\n'

#### further analysis: what are the most important features and their weights for each class for the two classifiers?

In [13]:
import numpy as np   # required by np.argsort below; do not rely on a later cell having imported it


def print_top_features(title, classes, weights, feature_names, top_n=20, fmt="%s (%.3f)"):
    """Print the highest-weighted features of every class of one classifier.

    Args:
        title: classifier name shown in the header line.
        classes: class labels, one per row of ``weights``.
        weights: 2-D array-like of per-class feature weights (row = class).
        feature_names: sequence mapping a column index to its feature name.
        top_n: how many features to print per class; fewer are printed when
            the vocabulary is smaller than ``top_n``.
        fmt: %-format applied to each ``(feature, weight)`` pair.
    """
    print("Classifier =", title)
    for country, coef in zip(classes, weights):
        print("\nCountry =", country)
        rank_indices = np.argsort(-coef)  # indices of the weights in descending order
        for index in rank_indices[:top_n]:
            print(fmt % (feature_names[index], coef[index]), end = " ")
        print("\n")


# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

# Top 20 words with the highest weight per class in the LR model.
countries = best_clfLR.classes_
coefficients = best_clfLR.coef_
print_top_features("Logistic Regression", countries, coefficients, features, fmt="%s (%.3f)")

# Top 20 words with the highest weight per class in the NB model.
# MultinomialNB.coef_ was removed in scikit-learn 1.1; for a multi-class model it
# mirrored feature_log_prob_, which is available in every version.
countries = best_clfNB.classes_
coefficients = best_clfNB.feature_log_prob_
print_top_features("Naive Bayes", countries, coefficients, features, fmt="%s (%.1f)")
    
    
'''
Only two countries, Australia (au) and Singapore (sg), have top-ranked words that are shared by the two models. 
These are words naming a geographic location, such as 'melbourne' and 'australia' for au 
and 'singapore' for sg. However, the overall trend across all countries is that the top 20 important words 
differ significantly between the two models. In particular, the differences between countries are more clearly 
visible in the LR model's results. Seemingly, NB cannot effectively differentiate the countries based on tweets, 
which can explain the relatively lower accuracy of NB. 
'''    

Classifier = Logistic Regression

Country = au
#melbourne (1.846) literally (1.828) australia (1.719) little (1.609) melbourne (1.508) instagrams (1.480) spammed (1.480) #mtvhottest (1.470) https://t.co/7rcjjptvl7 (1.460) summerpoyi's (1.460) @mattjohnbond (1.256) geelong (1.256) freo (1.256) realsies (1.256) @shell_07 (1.256) #amazing (1.256) @darrencriss (1.212) valentine's (1.212) @chriscolfer (1.212) green (1.174) 


Country = ca
@aliclarke (1.934) bed (1.882) thing (1.698) works (1.693) hate (1.691) let's (1.650) think (1.480) movies (1.439) xoxo (1.393) @cheetahbiatch (1.393) kids (1.358) next (1.310) finally (1.211) beautiful (1.185) @samanthasharris (1.174) really (1.168) found (1.131) awesome (1.129) manor (1.122) carlyle (1.122) 


Country = de
@fabiomarabini (2.023) happened (1.916) https://t.co/df7ficsci3 (1.796) roseninsel (1.796) @selenagomez (1.512) http://t.co/airtaqn48v (1.512) germany (1.462) #utopia (1.436) jessica (1.436) https://t.co/brkwmsvzrb (1.436) gauting (1.4

"\nOnly two countries, Australia(au) and Singapore(sg), have some redundant words observed in two different models. \nThey are the words describing a name of geographic location, such as 'melbourne', 'australia' in au \nand 'singapore' in sg. However, overall trends in all countries show that the top 20 important words are \nsignificantly different in different models. In particular, the difference in RL model results is more obviously \nobserved between different countries. Seemingly, NB cannot effectively differentiate the countries based on tweet. \nWhich can explain the relatively lowe accuracy of NB. \n"

### Additional blocks for hashtags 
#### preprocessing hashtags

In [4]:
def get_all_hashtags(data):
    """Collect every unique hashtag token from the preprocessed data.

    Args:
        data: iterable of ``{token: count}`` dictionaries.

    Returns:
        A set with each token that starts with ``#`` and is longer than one
        character (a bare ``#`` is not counted as a hashtag).
    """
    return {
        token
        for token_counts in data
        for token, _count in token_counts.items()
        if token.startswith("#") and len(token) > 1
    }

# Gather the unique hashtags across all preprocessed tweets and list them
# alphabetically.
hashtags = get_all_hashtags(x_processed)
print("Number of hashtags =", len(hashtags))
print(sorted(hashtags))

Number of hashtags = 425
['#100percentpay', '#1stsundayofoctober', '#1yearofalmostisneverenough', '#2011prdctn', '#2015eebritishfilmacademyawards', '#2k16', '#2littlebirds', '#365picture', '#5sosacousticatlanta', '#5sosfam', '#8thannualpubcrawl', '#affsuzukicup', '#aflpowertigers', '#ahimacon14', '#aim20', '#airasia', '#allcity', '#alliswell', '#allwedoiscurls', '#amazing', '#anferneehardaway', '#ariona', '#art', '#arte', '#artwork', '#ashes', '#asian', '#asiangirl', '#askcrawford', '#askherforfback', '#askolly', '#asksteven', '#at', '#australia', '#awesome', '#awesomepict', '#barcelona', '#bart', '#bayofislands', '#beautiful', '#bedimages', '#bell', '#beringmy', '#bettybooppose', '#bff', '#big', '#bigbertha', '#bigbreakfast', '#blackhat', '#blessedmorethanicanimagine', '#blessedsunday', '#blogtourambiente', '#bluemountains', '#bonekachika', '#boomtaob', '#booyaa', '#bored', '#boredom', '#bradersisterhood', '#breaktime', '#breedingground', '#bringithomemy', '#brooksengland', '#burgers'

#### **1. MaxMatch algorithm for hashtags** 

In [5]:
from nltk.corpus import wordnet

# WordNet lemmatizer used to normalise a candidate substring before it is
# looked up in the vocabulary.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
# Matching vocabulary: the word list provided by NLTK, lowercased for better matching.
words = {entry.lower() for entry in nltk.corpus.words.words()}

def lemmatize(word):
    """Return the lemma of ``word``, trying the verb reading before the noun reading.

    The verb lemma is returned when it differs from ``word``; otherwise the
    result of lemmatizing ``word`` as a noun is returned.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')

def tokenize_hashtags(hashtags):
    """Segment every hashtag with the MaxMatch algorithm (greedy, left to right).

    Starting at the current position, the longest substring whose lemma is in
    the module-level ``words`` vocabulary becomes the next token. When no
    substring matches, a single character is emitted instead.

    Args:
        hashtags: iterable of hashtag strings.

    Returns:
        A dict mapping each hashtag to its list of tokens, in reading order.
    """
    hashtags_dic = {}
    for hashtag in hashtags:
        pieces = []
        total = len(hashtag)
        start = 0

        while start < total:
            # Try candidate ends from the longest substring down to one character.
            for end in range(total, start, -1):
                piece = hashtag[start:end]
                if lemmatize(piece) in words:
                    break
            # After the scan `piece` is either the longest match or, when
            # nothing matched, the single character at `start`.
            pieces.append(piece)
            start += len(piece)

        hashtags_dic[hashtag] = pieces
    return hashtags_dic

# Tokenise all collected hashtags with MaxMatch.
tokenized_hashtags = tokenize_hashtags(hashtags)

# Print the last 30 hashtags (alphabetical order) with their segmentation.
for k, v in sorted(tokenized_hashtags.items())[-30:]:
    print(k, v)

#vanilla ['#', 'vanilla']
#vca ['#', 'v', 'ca']
#vegan ['#', 'vega', 'n']
#veganfood ['#', 'vega', 'n', 'food']
#vegetables ['#', 'vegetables']
#vegetarian ['#', 'vegetarian']
#video ['#', 'video']
#vma ['#', 'v', 'ma']
#voteonedirection ['#', 'vote', 'one', 'direction']
#vsco ['#', 'vs', 'c', 'o']
#vscocam ['#', 'vs', 'coca', 'm']
#walking ['#', 'walking']
#watch ['#', 'watch']
#weare90s ['#', 'wear', 'e', '9', '0', 's']
#wearesocial ['#', 'weares', 'o', 'c', 'i', 'al']
#white ['#', 'white']
#wings ['#', 'wings']
#wok ['#', 'wo', 'k']
#wood ['#', 'wood']
#work ['#', 'work']
#workmates ['#', 'work', 'mates']
#world ['#', 'world']
#worldcup2014 ['#', 'world', 'cup', '2', '0', '1', '4']
#yellow ['#', 'yellow']
#yiamas ['#', 'y', 'i', 'ama', 's']
#ynwa ['#', 'yn', 'wa']
#youtube ['#', 'you', 'tube']
#yummy ['#', 'yummy']
#yws13 ['#', 'y', 'ws', '1', '3']
#zweihandvollfarm ['#', 'z', 'wei', 'hand', 'vol', 'l', 'farm']


In [6]:
# Test block: every hashtag must have a segmentation, and a known example
# must split into the expected tokens.
assert(len(tokenized_hashtags) == len(hashtags))
assert(tokenized_hashtags["#newrecord"] == ["#", "new", "record"])


#### **2. reversed MaxMatch algorithm for hashtags**

In [7]:
def tokenize_hashtags_rev(hashtags):
    """Segment every hashtag with reversed MaxMatch (greedy, right to left).

    Working backwards from the end of the hashtag, the longest suffix of the
    remaining text whose lemma is in the module-level ``words`` vocabulary
    becomes the next token. When no suffix matches, the last single character
    is emitted instead.

    Args:
        hashtags: iterable of hashtag strings.

    Returns:
        A dict mapping each hashtag to its list of tokens, in reading order.
    """
    hashtags_rev_dic = {}       # hashtag -> tokens
    for hashtag in hashtags:
        tokens = []
        e = len(hashtag)        # exclusive end of the part still to be segmented

        # Loop on the amount of text left. Looping on the start index instead
        # only terminates by letting it overrun after the text is consumed,
        # which costs one extra lemmatizer lookup on '' per character and
        # never ends if '' is in the vocabulary.
        while e > 0:
            # Try candidate starts from the longest suffix down to one character.
            for s in range(e):
                word = hashtag[s:e]
                if lemmatize(word) in words:
                    break
            # `word` is now the longest match or, when nothing matched, the
            # single character just before index e.
            tokens.append(word)
            e -= len(word)

        # Tokens were collected right to left; restore reading order.
        hashtags_rev_dic[hashtag] = tokens[::-1]
    return hashtags_rev_dic

    
# Tokenise all collected hashtags with the reversed version of MaxMatch.
tokenized_hashtags_rev = tokenize_hashtags_rev(hashtags)

# Print the last 30 hashtags (alphabetical order) with their segmentation.
for k, v in sorted(tokenized_hashtags_rev.items())[-30:]:
    print(k, v)
    

#vanilla ['#', 'vanilla']
#vca ['#', 'v', 'ca']
#vegan ['#', 'v', 'e', 'gan']
#veganfood ['#', 'v', 'e', 'gan', 'food']
#vegetables ['#', 'vegetables']
#vegetarian ['#', 'vegetarian']
#video ['#', 'video']
#vma ['#', 'v', 'ma']
#voteonedirection ['#', 'vote', 'one', 'direction']
#vsco ['#', 'vs', 'c', 'o']
#vscocam ['#', 'vs', 'c', 'o', 'cam']
#walking ['#', 'walking']
#watch ['#', 'watch']
#weare90s ['#', 'we', 'are', '9', '0', 's']
#wearesocial ['#', 'we', 'are', 'social']
#white ['#', 'white']
#wings ['#', 'wings']
#wok ['#', 'w', 'ok']
#wood ['#', 'wood']
#work ['#', 'work']
#workmates ['#', 'work', 'mates']
#world ['#', 'world']
#worldcup2014 ['#', 'world', 'cup', '2', '0', '1', '4']
#yellow ['#', 'yellow']
#yiamas ['#', 'y', 'i', 'a', 'mas']
#ynwa ['#', 'yn', 'wa']
#youtube ['#', 'you', 'tube']
#yummy ['#', 'yummy']
#yws13 ['#', 'y', 'ws', '1', '3']
#zweihandvollfarm ['#', 'z', 'wei', 'hand', 'vol', 'l', 'farm']


In [8]:
# Test block: every hashtag must have a reversed-MaxMatch segmentation, and a
# known example must split into the expected tokens.
assert(len(tokenized_hashtags_rev) == len(hashtags))
assert(tokenized_hashtags_rev["#newrecord"] == ["#", "new", "record"])


#### 3. Comparison between MaxMatch and reversed MaxMatch

In [9]:
from collections import Counter

from nltk.corpus import brown
import numpy as np

# Get the words of the Brown corpus, lowercased.
brown_words = [word.lower() for word in brown.words()]
# Count every word once up front: calling list.count() per token would rescan
# the whole corpus for each lookup.
brown_counts = Counter(brown_words)
num_unique_words = len(brown_counts)             # the number of unique words in the corpus
total_frequency = len(brown_words)               # the total number of words in the corpus


def segmentation_log_prob(tokens):
    """Return the unigram log probability of a token sequence.

    Each token probability uses add-one smoothing over the Brown corpus
    counts. The logs are summed instead of taking the log of the product, so
    a long sequence of small probabilities cannot underflow to zero.
    """
    frequencies = np.array([brown_counts[token] for token in tokens])
    probabilities = (frequencies + 1) / (total_frequency + 1*num_unique_words)
    return np.sum(np.log(probabilities))


i = 0   # count the number of output

# Print every hashtag on which MaxMatch and reversed MaxMatch disagree, in
# alphabetical order, together with the log probability of each segmentation.
for hashtag in sorted(tokenized_hashtags_rev.keys()):
    maxMatch = tokenized_hashtags[hashtag]
    rev_maxMatch = tokenized_hashtags_rev[hashtag]
    if maxMatch == rev_maxMatch:
        continue

    i += 1
    maxMatch_p = segmentation_log_prob(maxMatch)
    rev_maxMatch_p = segmentation_log_prob(rev_maxMatch)

    print("\n%d. %s\nMaxMatch = [%s]; LogProb = %.1f \nReversed MaxMatch = [%s]; LogProb = %.1f"
          % (i, hashtag, ', '.join(map(str, maxMatch)), maxMatch_p,
             ', '.join(map(str, rev_maxMatch)), rev_maxMatch_p))

        
'''
In general, the segmentation with the higher (less negative) log probability makes more sense. 
For instance, [#, 1, st, sunday, of, october] (LogProb = -58.7) is 
better than [#, 1, st, sunday, ofo, c, tobe, r] (LogProb = -92.7). 
Likewise, [#, 1, year, of, almost, is, never, enough] (LogProb = -60.9) reads much better than 
[#, 1, year, of, al, mos, tis, never, enough] (LogProb = -86.9). 
'''


1. #1stsundayofoctober
MaxMatch = [#, 1, st, sunday, ofo, c, tobe, r]; LogProb = -92.7 
Reversed MaxMatch = [#, 1, st, sunday, of, october]; LogProb = -58.7

2. #1yearofalmostisneverenough
MaxMatch = [#, 1, year, of, almost, is, never, enough]; LogProb = -60.9 
Reversed MaxMatch = [#, 1, year, of, al, mos, tis, never, enough]; LogProb = -86.9

3. #8thannualpubcrawl
MaxMatch = [#, 8, than, nu, alp, u, b, crawl]; LogProb = -90.1 
Reversed MaxMatch = [#, 8, th, annual, pub, crawl]; LogProb = -71.6

4. #affsuzukicup
MaxMatch = [#, a, f, fs, u, z, u, k, i, cup]; LogProb = -104.6 
Reversed MaxMatch = [#, a, f, f, suz, u, k, i, cup]; LogProb = -91.4

5. #ahimacon14
MaxMatch = [#, ah, ima, con, 1, 4]; LogProb = -67.2 
Reversed MaxMatch = [#, a, hi, macon, 1, 4]; LogProb = -58.8

6. #alliswell
MaxMatch = [#, all, is, well]; LogProb = -32.0 
Reversed MaxMatch = [#, al, li, swell]; LogProb = -50.7

7. #allwedoiscurls
MaxMatch = [#, all, wed, o, is, curls]; LogProb = -61.7 
Reversed MaxMatch = [#


63. #learningcommunties
MaxMatch = [#, learning, c, om, munt, ies]; LogProb = -75.1 
Reversed MaxMatch = [#, learning, c, om, m, unties]; LogProb = -72.3

64. #lebedeintennis
MaxMatch = [#, l, e, bed, e, in, tennis]; LogProb = -70.2 
Reversed MaxMatch = [#, l, e, be, de, in, tennis]; LogProb = -65.2

65. #letmesleep
MaxMatch = [#, let, mes, leep]; LogProb = -50.1 
Reversed MaxMatch = [#, let, me, sleep]; LogProb = -38.8

66. #loadsoffun
MaxMatch = [#, loads, off, un]; LogProb = -44.5 
Reversed MaxMatch = [#, loads, of, fun]; LogProb = -39.3

67. #longranger
MaxMatch = [#, long, ranger]; LogProb = -34.3 
Reversed MaxMatch = [#, l, on, granger]; LogProb = -44.4

68. #magazinesandtvscreens
MaxMatch = [#, magazines, and, t, vs, creen, s]; LogProb = -75.4 
Reversed MaxMatch = [#, magazine, sand, t, v, screens]; LogProb = -66.9

69. #makeupfree
MaxMatch = [#, make, up, free]; LogProb = -36.2 
Reversed MaxMatch = [#, ma, keup, free]; LogProb = -47.5

70. #mamajeanneandme
MaxMatch = [#, mam, 

'\nIn general, the lower probability of tokens makes more sense. \nFor instances, [#, 1, st, sunday, of, october] (LogProb = -58.7) is \nbetter than [#, 1, st, sunday, ofo, c, tobe, r] (LogProb = -92.7). \nLikewise, [#, 1, year, of, almost, is, never, enough] (LogProb = -60.9) sounds much better than \n[#, 1, year, of, al, mos, tis, never, enough] (LogProb = -86.9). \n'