# Tweet Classification

# Overview
Provided with a collection tweets, the task of this project is to classify whether a tweet constitutes a rumour event

# 1. Preprocessing 

**download the tweet corpus**

In [2]:
import requests
import os
from pathlib import Path

fname = 'rumour-data.tgz'
data_dir = os.path.splitext(fname)[0] #'rumour-data'
'''
my_file = Path(fname)
if not my_file.is_file():
    url = "https://github.com/jhlau/jhlau.github.io/blob/master/files/rumour-data.tgz?raw=true"
    r = requests.get(url)

    #Save to the current directory
    with open(fname, 'wb') as f:
        f.write(r.content)
        
print("Done. File downloaded:", my_file)
'''

'\nmy_file = Path(fname)\nif not my_file.is_file():\n    url = "https://github.com/jhlau/jhlau.github.io/blob/master/files/rumour-data.tgz?raw=true"\n    r = requests.get(url)\n\n    #Save to the current directory\n    with open(fname, \'wb\') as f:\n        f.write(r.content)\n        \nprint("Done. File downloaded:", my_file)\n'

**extract the zip file**

In [None]:
import tarfile

#decompress rumour-data.tgz
tar = tarfile.open(fname, "r:gz")
tar.extractall()
tar.close()

#remove superfluous files (e.g. .DS_store)
extra_files = []
for r, d, f in os.walk(data_dir):
    for file in f:
        if (file.startswith(".")):
            extra_files.append(os.path.join(r, file))
for f in extra_files:
    os.remove(f)

print("Extraction done.")

### 1. Gather tweet messages from data

**Dataset description**: The corpus data is in the *rumour-data* folder. It contains 2 sub-folders: *non-rumours* and *rumours*. *rumours* contains all rumour-propagating tweets, while *non-rumours* has normal tweets. Within  *rumours* and *non-rumours*, there are sub-folders, each named with an ID. Each of these IDs constitutes an 'event', where an event is defined as consisting a **source tweet** and its **reactions**.

The folder structure is as follows:

    rumour-data
        - rumours
            - 498254340310966273
                - reactions
                    - 498254340310966273.json
                    - 498260814487642112.json
                - source-tweet
                    - 498254340310966273.json
        - non-rumours

In [3]:
import json
from timeit import default_timer as timer

def get_tweet_text_from_json(file_path):
    with open(file_path) as json_file:
        data = json.load(json_file)
        return data["text"]
    
def get_events(event_dir):
    event_list = []
    for event in sorted(os.listdir(event_dir)):
        ###
        # Your answer BEGINS HERE
        ###
        text_list = []
        sourch_path = os.path.join(event_dir, event, "source-tweet")
        reactions_path =  os.path.join(event_dir, event, "reactions")
    
        for jsonfile in os.listdir(sourch_path):
            jsonpath = os.path.join(sourch_path, jsonfile) 
            text_list.append(get_tweet_text_from_json(jsonpath))
            
        for jsonfile in os.listdir(reactions_path):
            jsonpath = os.path.join(reactions_path, jsonfile) 
            text_list.append(get_tweet_text_from_json(jsonpath))
        
        event_list.append(text_list)    
        listlen = len(event_list)
        
        if listlen%100 == 0:
            print(listlen, " events completed")
    
        ###
        # Your answer ENDS HERE
        ###
        
    return event_list

start = timer()

#a list of events, and each event is a list of tweets (source tweet + reactions)    
rumour_events = get_events(os.path.join(data_dir, "rumours"))
nonrumour_events = get_events(os.path.join(data_dir, "non-rumours"))

print("Number of rumour events =", len(rumour_events))
print("Number of non-rumour events =", len(nonrumour_events))

end = timer()
print(end - start)

100  events completed
200  events completed
300  events completed
400  events completed
500  events completed
100  events completed
200  events completed
300  events completed
400  events completed
500  events completed
600  events completed
700  events completed
800  events completed
900  events completed
1000  events completed
Number of rumour events = 500
Number of non-rumour events = 1000
104.8161205


**For your testing:**

In [6]:
assert(len(rumour_events) == 500)
assert(len(nonrumour_events) == 1000)

### 2. Preprocessing 

**steps**: (1) tokenize each tweet into individual word tokens; and (2) remove stopwords

In [7]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from collections import defaultdict

tt = TweetTokenizer()
stopwords = set(stopwords.words('english'))

def preprocess_events(events):
    ###
    # Your answer BEGINS HERE
    ###
    preprocessed_list = []
    for event in events:
        tokens = defaultdict(int)
        for sent in event:            
            tokenized_sentence = tt.tokenize(sent)
            for token in tokenized_sentence :
                if token.lower() not in stopwords:
                    tokens[token] += 1
        preprocessed_list.append(tokens)
        
    return preprocessed_list
    
    ###
    # Your answer ENDS HERE
    ###

start = timer()
preprocessed_rumour_events = preprocess_events(rumour_events)
preprocessed_nonrumour_events = preprocess_events(nonrumour_events)

print("Number of preprocessed rumour events =", len(preprocessed_rumour_events))
print("Number of preprocessed non-rumour events =", len(preprocessed_nonrumour_events))

end = timer()
print(end - start)

Number of preprocessed rumour events = 500
Number of preprocessed non-rumour events = 1000
1.3023077


**For your testing**:

In [8]:
assert(len(preprocessed_rumour_events) == 500)
assert(len(preprocessed_nonrumour_events) == 1000)

**Get hashtags**: Hashtags pose an interesting tokenisation problem because they often include multiple words written without spaces or capitalization. 

In [9]:
def get_all_hashtags(events):
    hashtags = set([])
    for event in events:
        for word, frequency in event.items():
            if word.startswith("#"):
                hashtags.add(word)
    return hashtags

hashtags = get_all_hashtags(preprocessed_rumour_events + preprocessed_nonrumour_events)
print("Number of hashtags =", len(hashtags))

Number of hashtags = 1829


### 3. Tokenization

**MaxMatch**: by a reversed version of the  algorithm, where matching begins at the end of the hashtag and progresses backwards. 
MaxMatch algorithm should match inflected forms by converting them into lemmas before matching. 
Part-of-speech tag of the word is provided when lemmatising a word. 

Example) given "#speakup", the algorithm should produce: \["#", "speak", "up"\]. 

In [None]:
from nltk.corpus import wordnet

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
words = set(nltk.corpus.words.words()) #a list of words provided by NLTK
vocab = set([word.lower() for word in words]) #lowercased word list

# I tried recursive and iterative ways. Recursive way performs slightly faster.
'''
def max_match(hashtag):
    if not hashtag:
        return ""
    parsed_list = [] 

    for i in range(0, len(hashtag)-1):
        first_word = hashtag[i:]
        remainder = hashtag[:i]
        tagged= nltk.pos_tag([first_word])
        tag = word_tag(tagged[0][1])

        word = ""
        if tag is None: 
            word = first_word.lower()
        else:
            word = lemmatizer.lemmatize(first_word.lower(), tag)
                
        if word in vocab:
            return first_word + " " + max_match(remainder)

    first_word = hashtag[-1]
    remainder = hashtag[:-1]

    return first_word + " " + max_match(remainder)    
    
def tokenize_hashtags(hashtags):  
    ###
    # Your answer BEGINS HERE
    ###
    hash_dict = {}
    for hashtag in hashtags:
        parsed_list = max_match(hashtag).split(" ")[:-1]
        parsed_list.reverse()
        hash_dict[hashtag] = parsed_list            
    #print(hash_dict)        
    return hash_dict           
'''

def tokenize_hashtags(hashtags):
    ###
    # Your answer BEGINS HERE
    ###
    hash_dict = {}
    for hashtag in hashtags:
        len_tag = len(hashtag)   
        index = len_tag
        parsed_list = []       
        while index > 0:
            match = False
            for i in range(0, index):
                word = hashtag[i: index]
                tagged= nltk.pos_tag([word])
                tag = word_tag(tagged[0][1])
                lemma = ""
                if tag is None: 
                    lemma = word.lower()
                else:
                    lemma = lemmatizer.lemmatize(word.lower(), tag)                

                if lemma in vocab:
                    match = True
                    parsed_list.append(word)
                    index = i
                    break
            if not match:
                parsed_list.append(hashtag[index-1])
                index -= 1
        parsed_list.reverse()         
        hash_dict[hashtag] = parsed_list    
        
    return hash_dict 
    ###
    # Your answer ENDS HERE
    ###

def word_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None    

list_ = ['#speakUP', '#speakingup']
print('example:' , tokenize_hashtags(list_))
print('Please wait for 9~15 secs')
print()
start = timer()
tokenized_hashtags = tokenize_hashtags(hashtags)
print(list(tokenized_hashtags.items())[:20])

end = timer()
print(end - start)  

example: {'#speakUP': ['#', 'speak', 'UP'], '#speakingup': ['#', 's', 'p', 'e', 'akin', 'gup']}
Please wait for 9~15 secs



**For your testing:**

In [9]:
assert(len(tokenized_hashtags) == len(hashtags))

### 4. Update the bag-of-words
This function looks for every hashtag in a list of preprocessed events and updates the bag-of-words dictionary with the tokenized hashtag tokens. 

In [14]:
def update_event_bow(events):
    ###
    # Your answer BEGINS HERE
    ###
    for event in events:
        for word, frequency in list(event.items()): #create a snapshot rather than a view to update original one
            if word.startswith("#"):
                hashtag_parsed = tokenized_hashtags[word]
                for item in hashtag_parsed:
                    #event[item] += 1
                    if item.lower() not in stopwords:         
                        event[item] += 1
    
    ###
    # Your answer ENDS HERE
    ###
            
update_event_bow(preprocessed_rumour_events)
update_event_bow(preprocessed_nonrumour_events)

print("Number of preprocessed rumour events =", len(preprocessed_rumour_events))
print("Number of preprocessed non-rumour events =", len(preprocessed_nonrumour_events))

Number of preprocessed rumour events = 500
Number of preprocessed non-rumour events = 1000


# Text Classification

### 5. Feature extraction 

**data partition**:  create training, development and test partitions with a 60%/20%/20% ratio.

**feature extraction**: turn the bag-of-words dictionary of each event into a feature vector, using scikit-learn `DictVectorizer`.

In [21]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

vectorizer = DictVectorizer()

###
# Your answer BEGINS HERE
###
events  = preprocessed_rumour_events + preprocessed_nonrumour_events
y = []

for i in range(0, len(preprocessed_rumour_events)):
    y.append("rumour")
    
for i in range(0, len(preprocessed_nonrumour_events)):
    y.append("nonrumour")
    
X = vectorizer.fit_transform(events)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=94) #62, 94
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, stratify=y_train, random_state=94) 
    
###
# Your answer ENDS HERE
###

print("Vocabulary size =", len(vectorizer.vocabulary_))

Vocabulary size = 39516


### 6. Hyperparameter tuning

**classifiers**: Naive Bayes and Logistic Regression. 

Hyperparameter tuning using the development set.
Cross-validation shouldn't be used.

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score, f1_score, classification_report
###
# Your answer BEGINS HERE
###
alpha_list = [x/10 for x in range(0, 11)]
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
nb = [MultinomialNB(alpha=alpha) for alpha in alpha_list]
lrcs = [LogisticRegression(C=c) for c in c_list]
models = [nb, lrcs]
best_params = {}
for clfs in models:
    best_score = 0
    best_param = None
    model_name = clfs[0].__class__.__name__
    print()
    print("=======", model_name)
    for clf in clfs:
        model = clf.fit(X_train, y_train)   
        predictions = model.predict(X_val)    
        accuracy = accuracy_score(y_val ,predictions)
        param_dict = None
        if model_name == 'MultinomialNB':      
            param_dict = {'alpha': clf.get_params()['alpha'] }
        else:     
            param_dict = {'C': clf.get_params()['C'] }
        
        print (param_dict, " || accuracy: ", accuracy)
        if accuracy > best_score:
            best_score = accuracy
            best_param = param_dict
            
    best_params[model_name] = (best_param , best_score)
    print("best score: ", best_score, ' || best param: ', best_param)  
    
print()    
print("### Best parameters in Development set: ")
for model, (param, score) in best_params.items():
    print("Model: ",  model)
    print("best parameter:", param)
    print("best score:", score)
    print()
    
###
# Your answer ENDS HERE
###


{'alpha': 0.0}  || accuracy:  0.7633333333333333
{'alpha': 0.1}  || accuracy:  0.7433333333333333
{'alpha': 0.2}  || accuracy:  0.7466666666666667
{'alpha': 0.3}  || accuracy:  0.7466666666666667
{'alpha': 0.4}  || accuracy:  0.76
{'alpha': 0.5}  || accuracy:  0.7733333333333333
{'alpha': 0.6}  || accuracy:  0.7766666666666666
{'alpha': 0.7}  || accuracy:  0.7766666666666666
{'alpha': 0.8}  || accuracy:  0.7833333333333333
{'alpha': 0.9}  || accuracy:  0.7933333333333333
{'alpha': 1.0}  || accuracy:  0.8033333333333333
best score:  0.8033333333333333  || best param:  {'alpha': 1.0}

{'C': 0.001}  || accuracy:  0.7333333333333333
{'C': 0.01}  || accuracy:  0.7833333333333333
{'C': 0.1}  || accuracy:  0.8233333333333334


  'setting alpha = %.1e' % _ALPHA_MIN)


{'C': 1}  || accuracy:  0.8133333333333334
{'C': 10}  || accuracy:  0.81
{'C': 100}  || accuracy:  0.7966666666666666
{'C': 1000}  || accuracy:  0.7966666666666666
best score:  0.8233333333333334  || best param:  {'C': 0.1}

### Best parameters in Development set: 
Model:  MultinomialNB
best parameter: {'alpha': 1.0}
best score: 0.8033333333333333

Model:  LogisticRegression
best parameter: {'C': 0.1}
best score: 0.8233333333333334



### 7. Compare performance
Using optimal hyper-parameter settings, compute test performance for Naive Bayes and Logistic Regression.

In [23]:
###
# Your answer BEGINS HERE
###
nb_param, score = best_params['MultinomialNB']
log_param, score = best_params['LogisticRegression']
models = [     
    MultinomialNB(**nb_param),
    LogisticRegression(**log_param)
]
for classifier in models:
    model_name = classifier.__class__.__name__
    #print(classifier.get_params())
    model = classifier.fit(X_train, y_train)   
    predictions = model.predict(X_test)   
    accuracy = accuracy_score(y_test ,predictions)
    f1_macro= f1_score(y_test, predictions, average='macro')
    print(model_name , ':')
    print("accuracy: ", accuracy)
    print("f1_macro: ", f1_macro)
    print(classification_report(y_test, predictions))
    
###
# Your answer ENDS HERE
###

MultinomialNB :
accuracy:  0.8466666666666667
f1_macro:  0.8322310834468003
              precision    recall  f1-score   support

   nonrumour       0.91      0.85      0.88       200
      rumour       0.74      0.83      0.78       100

    accuracy                           0.85       300
   macro avg       0.83      0.84      0.83       300
weighted avg       0.85      0.85      0.85       300

LogisticRegression :
accuracy:  0.8633333333333333
f1_macro:  0.8346218487394959
              precision    recall  f1-score   support

   nonrumour       0.85      0.96      0.90       200
      rumour       0.89      0.67      0.77       100

    accuracy                           0.86       300
   macro avg       0.87      0.81      0.83       300
weighted avg       0.87      0.86      0.86       300



