# TITLE OF PROJECT
## Contributors:
### Anik Burman, Indian Statistical Institute
### Joshua Fink, University of Michigan
### Sasha Lioutikova, Yale University
### Grace Smith, William and Mary
## Special Thanks:
### Dr. Johann Gagnon-Bartsch, Juejue Wang, and Heather Johnston

## 1. PREPROCESSING CSV FILES WITH AND WITHOUT SYNONYM HASHMAP
### Before running the analysis, clean the CSV file so that the input is well-formed. The functions below build a hash map from synonyms of the 150 most frequent words to those words, then append the matching base word to the end of every tweet that contains one of the synonyms.

In [None]:
'''
HELPER FUNCTIONS AND LIBRARIES, PREPROCESSING AND SYNONYMS
'''

import csv
import re
from collections import Counter

import ftfy
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# maybe remove other punctuation (like | …) and emojis?
## pre-processing: stem, remove URLs and stop words
def preProcessingFcn(tweet, removeWords=list(), stem=False, removeURL=True, removeStopwords=True,
                     removeNumbers=False, removeHashtags=True, removeAt=True, removePunctuation=True, lem=True,
                     removeNewline=True, ftfyCleanup=True, removeB=True, replaceApostrophe=True, removeEmoji=True):
    """Clean a single tweet and return the cleaned string.

    The tweet is lower-cased first; the enabled steps then run in the order
    the parameters are described below.

    IN:
        tweet: raw tweet text (str).
        removeB: strip a leading ``b`` that is directly followed by a quote
            character, i.e. the prefix left behind when a bytes object was
            converted with ``str()``. Letters "b" inside words are kept.
        removeURL: replace every token starting with "http" by a space.
        removeHashtags / removeAt: replace '#' / '@' by a space.
        removeNumbers: drop every digit character.
        removePunctuation: replace runs of , . ; @ # ? ! & $ : (and any
            spaces that follow them) by a single space.
        replaceApostrophe: turn the typographic apostrophe into a plain "'".
        removeStopwords: drop NLTK English stop words.
        removeWords: extra words to drop (matched after lower-casing).
        lem: lemmatize each word with WordNetLemmatizer.
        stem: stem each word with PorterStemmer (runs after lemmatizing).
        removeNewline, ftfyCleanup: accepted for compatibility; the
            corresponding steps are currently disabled (see below).
        removeEmoji: accepted for compatibility; not used by this function.
    OUT: the cleaned tweet (str).
    """
    tweet = tweet.lower()
    if removeB:
        # Only the bytes-literal prefix is removed; blanking every "b"
        # would corrupt ordinary words ("about" -> "a out").
        tweet = re.sub(r"^b(?=['\"])", "", tweet)
    if removeURL:
        tweet = re.sub(r"http\S+", " ", tweet)
    if removeHashtags:
        tweet = tweet.replace('#', ' ')
    if removeAt:
        tweet = tweet.replace('@', ' ')
    if removeNumbers:
        tweet = ''.join(ch for ch in tweet if not ch.isdigit())
    if removePunctuation:
        tweet = re.sub(r"[,.;@#?!&$:]+\ *", " ", tweet)
    if replaceApostrophe:
        tweet = re.sub(r"’", "'", tweet)
    # Disabled steps, kept for reference:
    # if removeNewline==True:
    #     tweet = re.sub('\n', ' ', tweet)
    # if ftfyCleanup==True:
    #     tweet = ftfy.fix_text(tweet)
    if removeStopwords:
        # Load the stop-word list once per call instead of once per word.
        stop_words = set(stopwords.words('english'))
        tweet = ' '.join(word for word in tweet.split() if word not in stop_words)
    if len(removeWords) > 0:
        tweet = ' '.join(word for word in tweet.split() if word not in removeWords)
    if lem:
        # Built on demand so the lemmatizer is only needed when requested.
        lemmatizer = WordNetLemmatizer()
        tweet = ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())
    if stem:
        stemmer = PorterStemmer()
        tweet = ' '.join(stemmer.stem(word) for word in tweet.split())
    return tweet


# credit @Abdul-Razak Adam on StackOverflow
def deEmojify(text):
    """Return ``text`` with every run of emoji characters removed.

    Only code points in the four ranges listed below are treated as emoji;
    all other characters are left untouched.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_run = "[" + emoji_ranges + "]+"
    return re.sub(emoji_run, r'', text, flags=re.UNICODE)


def preProcessingFcnTotal(tweets, removeWords=list(), stem=False, removeURL=True, removeStopwords=True,
                          removeNumbers=False, removeHashtags=True, removeAt=True, removePunctuation=True, lem=True,
                          removeNewline=True, ftfyCleanup=True, removeB=True, replaceApostrophe=True,
                          removeEmoji=True):
    """Run preProcessingFcn on every tweet and return the results as a list.

    IN: tweets = iterable of raw tweet strings; every other argument is
        forwarded unchanged to preProcessingFcn for each tweet.
    OUT: list of cleaned tweets, in the same order as the input.
    """
    # Forward by keyword so each option is tied to its name rather than
    # to its position in the per-tweet function's parameter list.
    return [
        preProcessingFcn(
            tweet,
            removeWords=removeWords,
            stem=stem,
            removeURL=removeURL,
            removeStopwords=removeStopwords,
            removeNumbers=removeNumbers,
            removeHashtags=removeHashtags,
            removeAt=removeAt,
            removePunctuation=removePunctuation,
            lem=lem,
            removeNewline=removeNewline,
            ftfyCleanup=ftfyCleanup,
            removeB=removeB,
            replaceApostrophe=replaceApostrophe,
            removeEmoji=removeEmoji,
        )
        for tweet in tweets
    ]


''' GETTING TOP WORDS '''
'''
IN: n = number of words to return, tweets = list of tweets
OUT: list of top n most frequent words in corpus
'''
def get_top_n_words(n, tweets):
    """Return the n most frequent tokens across all tweets.

    IN: n = number of words to return, tweets = list of tweet strings
    OUT: list of the n most common tokens (as produced by word_tokenize),
         most frequent first, without their counts
    """
    frequencies = Counter(token for tweet in tweets for token in word_tokenize(tweet))
    return [token for token, _count in frequencies.most_common(n)]


''' MAKING HASH MAP '''
# boolean determining whether to use only unigrams
# currently should be True; non-unigrams are not yet included in synonym-based tweet altering
only_unigrams = True


def make_mapping(words):
    """Build a synonym -> base word hash map from WordNet.

    IN: words = list of "base" words to consider in the hash map
    OUT: dict where key:value = synonym:base word. For every base word,
         each lemma name of each of its WordNet synsets becomes a key,
         unless it equals the base word itself or is already a key (the
         first base word to claim a synonym keeps it).
    note: using version 2 of add_to_mapping (i.e. no separation of finding
          and adding synonyms)
    """
    mapping = {}
    for word in words:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemma_names():
                if lemma == word or lemma in mapping:
                    continue
                # Multi-word lemma names contain '_'; skip them when
                # restricted to unigrams.
                if only_unigrams and '_' in lemma:
                    continue
                mapping[lemma] = word
    return mapping


''' ALTERING TWEETS '''
'''
IN: tweets = original corpus, mapping = synonym:base mapping
OUT: altered tweets with all base words added to end of tweet if synonym found in tweet
'''
def alter_tweets(tweets, mapping):
    """Append base words to tweets that contain one of their synonyms.

    IN: tweets = original corpus, mapping = synonym:base word dict
    OUT: new list of tweets; for each token of a tweet that is a key of
         mapping, the mapped base word is appended to the end of that
         tweet (space separated, in token order, once per occurrence)
    """
    def with_base_words(tweet):
        pieces = [tweet]
        pieces.extend(mapping[token] for token in word_tokenize(tweet) if token in mapping)
        return " ".join(pieces)

    return [with_base_words(tweet) for tweet in tweets]

In [None]:
'''
PROCESS CSV INPUTS
'''

''' READING IN DATA '''
'''
IN: filepath (string), column index with tweet text (int)
OUT: tweets as list of strings
'''
def read_data(filepath, colind):
    """Read one column of a UTF-8 CSV file, skipping the header row.

    IN: filepath (string), colind = index of the column holding the tweet text (int)
    OUT: list of strings, one per data row, in file order
    Raises IndexError if a non-empty data row has no column at colind.
    """
    with open(filepath, newline="", encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # discard the header row (no-op for an empty file)
        # csv.reader yields [] for blank lines; skip those instead of
        # failing on row[colind].
        return [row[colind] for row in reader if row]


''' WRITING TO CSV '''

def write_data(filepath, tweets):
    """Write tweets to a single-column CSV file with a 'text' header.

    IN: filepath (string) = output path, overwritten if it exists;
        tweets = iterable of strings, written one per row
    OUT: None
    """
    # Write UTF-8 explicitly so the output matches what read_data expects
    # and does not depend on the platform's default encoding.
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['text'])
        writer.writerows([tw] for tw in tweets)


# read in data
# Each call returns one column of the CSV as a list of strings with the
# header row dropped (column 1 for the VT file, column 4 for Twitter_mani).
# NOTE(review): data_incentive is loaded here but every later step that
# would use it is commented out in this section.
data_incentive = read_data('../grace-sandbox/Data/VT_incentive.csv', 1)
#data_non_incentive = read_data('../grace-sandbox/Data/VT_non_incentive.csv', 1)
data_mani = read_data('../grace-sandbox/Data/Twitter_mani.csv', 4)

# print(data_incentive[:5])
# print(data_non_incentive[:5])
# Show the first five raw tweets as a quick sanity check.
print(data_mani[:5])


# preprocess
# VT_incentive_clean = preProcessingFcnTotal(data_incentive)
# VT_non_incentive_clean = preProcessingFcnTotal(data_non_incentive)
# Clean every tweet with the default options, except that the removeB
# step is switched off for this data set.
mani_clean = preProcessingFcnTotal(data_mani, removeB=False)

# print(VT_incentive_clean[:5])
# print(VT_non_incentive_clean[:5])
print(mani_clean[:5])


# # alter w/ synonyms
# VT_incentive_altered = alter_tweets(VT_incentive_clean, make_mapping(get_top_n_words(50, VT_incentive_clean)))
# VT_non_incentive_altered = alter_tweets(VT_non_incentive_clean, make_mapping(get_top_n_words(50, VT_non_incentive_clean)))
# Take the 150 most frequent tokens of the cleaned corpus as base words,
# build the synonym -> base word map, and append base words to the tweets.
mani_altered = alter_tweets(mani_clean, make_mapping(get_top_n_words(150, mani_clean)))
print(mani_altered[:5])


# # output
# Both the cleaned and the synonym-altered corpus are written to the
# current working directory as single-column CSV files.
# write_data('./VT_incentive_clean.csv', VT_incentive_clean)
# write_data('./VT_non_incentive_clean.csv', VT_non_incentive_clean)
write_data('./mani_clean150.csv', mani_clean)
# write_data('./VT_incentive_altered.csv', VT_incentive_altered)
# write_data('./VT_non_incentive_altered.csv', VT_non_incentive_altered)
write_data('./mani_altered150.csv', mani_altered)


