In [1]:
import pickle

In [5]:
with open("data_all.pickle", "rb") as f:
    data = pickle.load(f)

In [6]:
data[0]

{'hashtags': [{'indices': [38, 47], 'text': 'Varshita'}],
 'text': "It's @MarkMasai's acting debut! Watch #Varshita this Saturday and Sunday on @maishamagiceast from 7.30pm to 8.30pm (2 episodes each evening)  Watch: https://t.co/K0Zvjy4IWv"}

In [7]:
with open("slangdict.p", "rb") as f:
    slang_dict = pickle.load(f)

In [8]:
slang_dict["aaf"]

'as a friend'

## Filter tweets with hashtags between 200 and 500 occurences

In [11]:
def count_hashtags(data):
    counter = {}
    for tweet in data:
        hashtags = tweet["hashtags"]
        for hashtag in hashtags:
            hashtag_text = hashtag["text"].lower()
            if hashtag_text in counter:
                counter[hashtag_text] = counter[hashtag_text] + 1
            else:
                counter[hashtag_text] = 1
    return counter

In [12]:
hashtags_counted = count_hashtags(data)

In [15]:
def get_golden_middle(hashtags_counted):
    '''return the set of hashtags with occurences between 200 and 500'''
    golden_middle = set()
    for hashtag, occurences in hashtags_counted.items():
        if 200 <= occurences <= 500:
            golden_middle.add(hashtag)
    return golden_middle

In [16]:
golden_middle = get_golden_middle(hashtags_counted)

In [17]:
len(golden_middle)

1330

In [18]:
def filter_200_500_occurences(data, golden_middle):
    '''return tweets that contain at least one hashtag with 200-500 occurences'''
    tweets = []
    for tweet in data:
        hashtags = tweet["hashtags"]
        for hashtag in hashtags:
            hashtag_text = hashtag["text"].lower()
            if hashtag_text in golden_middle:
                tweets.append(tweet)
                break
    return tweets

In [19]:
tweets_common_hash = filter_200_500_occurences(data, golden_middle)

In [21]:
len(data)

1844594

In [20]:
len(tweets_common_hash)

349278

## Filter non asci chars

In [80]:
import string
all_asci = set(string.printable)
punctuation = set("!%()*+,-.:;<=>?[]\\^{}|~")


In [78]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [84]:
def filter_non_ascii_punctuation_txt(txt):
    return "".join(list(filter(lambda x: (x in all_asci) and (x not in punctuation), txt)))

In [128]:
def remove_hashtag_if_left_empty_after_ascii_filtering(tweet):
    hashtags = tweet["hashtags"]
    tweet_txt = tweet["text"]
    hashtags_new = []
    for ht in hashtags:
        ht_text = filter_non_ascii_punctuation_txt(ht["text"])
        if ht_text != "":
            #if there's something left from the hashtag add it to the new tweet
            #also replace its occurence in the tweet text
            hashtags_new.append({"indices": ht["indices"], "text": ht_text})
            tweet_txt = tweet_txt.replace("#" + ht["text"], "#" + ht_text)
        else:
            #otherwise delete hashtag from tweet
            tweet_txt = tweet_txt.replace("#" + ht["text"], "")
    return {"hashtags": hashtags_new, "text": tweet_txt}

In [129]:
def filter_hashtags_for_asci_and_punctuation(tweets):
    new_tweets = []
    for tweet in tweets:
        new_tweet = remove_hashtag_if_left_empty_after_ascii_filtering(tweet)
        if new_tweet["hashtags"] != []:
            new_tweets.append(new_tweet)
    return new_tweets

In [130]:
def filter_non_ascii_punctuation(tweet):
    txt = tweet["text"]
    txt = "".join(list(filter(lambda x: (x in all_asci) and (x not in punctuation), txt)))
    return {"hashtags": tweet["hashtags"], "text": txt}

In [73]:
filter_non_ascii({"text": "асд", "hashtags": []})

''

In [76]:
tweets_common_hash[0]

{'hashtags': [{'indices': [64, 85], 'text': 'FreeCommunityCollege'},
  {'indices': [92, 103], 'text': 'PreKforAll'}],
 'text': 'RT @Kyle_Lierman: During our Obama White House days we proposed #FreeCommunityCollege &amp; #PreKforAll which would have cost $130 billio… '}

In [131]:
def filter_asci_punct_all_tweets(tweets):
    filtered = []
    for tweet in tweets:
        tweet_filtered = filter_non_ascii_punctuation(tweet)
        filtered.append(tweet_filtered)
    return filtered

In [132]:
tweets_filtered_asci_punct = filter_asci_punct_all_tweets(tweets_common_hash)

In [133]:
tweets_filtered_non_empty_hash = filter_hashtags_for_asci_and_punctuation(tweets_filtered_asci_punct)

In [134]:
len(tweets_filtered_asci_punct)

349278

In [135]:
len(tweets_filtered_non_empty_hash)

346585

In [136]:
tweets_filtered_non_empty_hash[0]["text"]

'RT @Kyle_Lierman During our Obama White House days we proposed #FreeCommunityCollege &amp #PreKforAll which would have cost $130 billio '

In [108]:
blacklist = ["\n", "\r", "&gt", "&amp"]

## Filter URLS

In [105]:
import re
re.sub(r"http\S+", '', s,)

'An awesome Hoodie for cat lover\nPRINTED IN THE USA\nsweaTshirt \ncatlover \nHoodie \nGet yours &gt\n '

In [116]:
def filter_urls(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = tweet["text"]
        tweet_txt = re.sub(r"http\S+", '', tweet_txt)
        new_tweets.append({"hashtags": tweet["hashtags"], "text": tweet_txt})
    return new_tweets

In [137]:
tweets_no_urls = filter_urls(tweets_filtered_non_empty_hash)

In [138]:
tweets_no_urls[2]

{'hashtags': [{'indices': [21, 29], 'text': 'Discord'},
  {'indices': [30, 40], 'text': 'IndieGame'},
  {'indices': [63, 71], 'text': 'GameDev'},
  {'indices': [72, 81], 'text': 'IndieDev'},
  {'indices': [86, 93], 'text': 'Twitch'},
  {'indices': [94, 105], 'text': 'LiveStream'}],
 'text': "I'm the founder of a #Discord #IndieGame community that houses #GameDev/#IndieDev and #Twitch #LiveStream people and #Youtube gamers\n\n\n\nLet's connect all #Gaming people in one spot join the server today "}

## Replace \n and \r \t with " " 

In [181]:
def filter_special_chars(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = tweet["text"]
        tweet_txt = re.sub(r"\r|\n|\t", ' ', tweet_txt)
        tweet_txt = re.sub('&amp|&gt','', tweet_txt)
        tweet_txt = re.sub(' +',' ', tweet_txt)
        new_tweets.append({"hashtags": tweet["hashtags"], "text": tweet_txt.strip()})
    return new_tweets

In [172]:
tweets_no_special_chars = filter_special_chars(tweets_no_urls)

In [173]:
tweets_no_special_chars[0]

{'hashtags': [{'indices': [64, 85], 'text': 'FreeCommunityCollege'},
  {'indices': [92, 103], 'text': 'PreKforAll'}],
 'text': 'RT @Kyle_Lierman During our Obama White House days we proposed #FreeCommunityCollege #PreKforAll which would have cost $130 billio'}

## Simplify hashtag representation

In [152]:
def simplify_tweets(tweets):
    new_tweets = []
    for tweet in tweets:
        hashtags_list = []
        tweet_txt = tweet["text"]
        hashtags = tweet["hashtags"]
        for hashtag in hashtags:
            hashtags_list.append(hashtag["text"])
        new_tweets.append({"text": tweet_txt, "hashtags": hashtags_list})
    return new_tweets

In [174]:
tweets_simplified = simplify_tweets(tweets_no_special_chars)

In [175]:
tweets_simplified[0]

{'hashtags': ['FreeCommunityCollege', 'PreKforAll'],
 'text': 'RT @Kyle_Lierman During our Obama White House days we proposed #FreeCommunityCollege #PreKforAll which would have cost $130 billio'}

## Find missing hashtags

In [161]:
def complete_hashtags(tweets):
    new_tweets = []
    pat = re.compile(r"#(\w+)")
    for tweet in tweets:
        tweet_txt = tweet["text"]
        all_hashes = pat.findall(tweet_txt)
        new_hashtags = tweet["hashtags"]
        for ht in all_hashes:
            if ht not in tweet["hashtags"]:
                new_hashtags.append(ht)
        new_tweets.append({"text": tweet_txt, "hashtags": new_hashtags})
    return new_tweets
        

In [176]:
tweets_completed = complete_hashtags(tweets_simplified)

In [179]:
tweets_completed[0]

{'hashtags': ['FreeCommunityCollege', 'PreKforAll'],
 'text': 'RT @Kyle_Lierman During our Obama White House days we proposed #FreeCommunityCollege #PreKforAll which would have cost $130 billio'}

## Remove @s

In [186]:
def filter_relations(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = tweet["text"]
        tweet_txt = re.sub(r"@(\w+)", '', tweet_txt)
        tweet_txt = re.sub(' +',' ', tweet_txt)
        new_tweets.append({"hashtags": tweet["hashtags"], "text": tweet_txt})
    return new_tweets

In [187]:
tweets_no_monkey_a = filter_relations(tweets_completed)

In [219]:
tweets_no_monkey_a[2]

{'hashtags': ['Discord',
  'IndieGame',
  'GameDev',
  'IndieDev',
  'Twitch',
  'LiveStream',
  'Youtube',
  'Gaming'],
 'text': "I'm the founder of a #Discord #IndieGame community that houses #GameDev/#IndieDev and #Twitch #LiveStream people and #Youtube gamers Let's connect all #Gaming people in one spot join the server today"}

## Stop word removal

In [190]:
import nltk

In [193]:
from nltk.corpus import stopwords

In [202]:
#uncomment this the first time
#nltk.download('punkt')
#nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thalv\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [223]:
def remove_stop_words_list(sentences):
    new_sentences = []
    stopwords_set = set(stopwords.words('english'))
    for sent in sentences:
        words = sent.split(" ")
        filtered_words = [word for word in words if word not in stopwords_set]
        if filtered_words != []:
            new_sentences.append(" ".join(filtered_words))
    return new_sentences

In [222]:
remove_stop_words_list(["I'm the founder of a #Discord #IndieGame community that houses #GameDev/#IndieDev and #Twitch #LiveStream people and #Youtube gamers Let's connect all #Gaming people in one spot join the server today"])

["I'm founder #Discord #IndieGame community houses #GameDev/#IndieDev #Twitch #LiveStream people #Youtube gamers Let's connect #Gaming people one spot join server today"]

In [225]:
def remove_stop_words_from_tweets(tweets):
    new_tweets = []
    counter = 1
    n = len(tweets)
    for tweet in tweets:
        print("\r{}/{}".format(counter, n), end="")
        tweet_txt = tweet["text"]
        tweet_txt_no_stops = remove_stop_words_list([tweet_txt])
        
        if tweet_txt_no_stops == []:
            continue
        
        hashtags_no_stops = remove_stop_words_list(tweet["hashtags"])
        
        if hashtags_no_stops == []:
            continue
        
        new_tweets.append({"text": tweet_txt_no_stops, "hashtags": hashtags_no_stops})
        counter += 1 
    return new_tweets

In [226]:
tweets_no_stop = remove_stop_words_from_tweets(tweets_no_monkey_a)

346354/346585

In [230]:
tweets_no_stop[10500]

{'hashtags': ['Microsoft', 'Azure', 'Cloud', 'IoT', 'AI', 'IndustrialIoT'],
 'text': ['RT #Microsoft Release VMwareFriendly #Azure #Cloud Migrate Service #IoT #AI #IndustrialIoT']}

In [231]:
len(tweets_no_stop)

346354

In [234]:
counter = 0

for tweet in tweets_no_stop:
    if tweet["text"][0].startswith("RT"):
        counter += 1
print(counter)

220994


In [246]:
only_txt = [tweet["text"] for tweet in tweets_no_stop]

In [248]:
len(set(only_txt))

218213

In [238]:
def fix_stupidity(tweets):
    new_tweets = []
    for tweet in tweets:
        new_tweets.append({"text": tweet["text"][0], "hashtags": tweet["hashtags"]})
    return new_tweets

In [239]:
tweets_no_stop_1 = fix_stupidity(tweets_no_stop)

In [241]:
tweets_no_stop = tweets_no_stop_1

## Filter duplicates

In [243]:
def filter_duplicates(tweets):
    already_seen = set()
    new_tweets = []
    for tweet in tweets:
        if tweet["text"] not in already_seen:
            already_seen.add(tweet["text"])
            new_tweets.append(tweet)
    return new_tweets

In [244]:
tweets_no_duplicates = filter_duplicates(tweets_no_stop)

In [245]:
len(tweets_no_duplicates)

218213

In [250]:
tweets_no_duplicates[10050]

{'hashtags': ['Canucks'],
 'text': " There's buyers trade market right people may projected #Canucks sellers right view buyers"}

## Remove RT symbol

In [253]:
re.sub(r"^RT ", '', "RT something something RT")

'something something RT'

In [254]:
def remove_rt(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = tweet["text"]
        tweet_txt = re.sub(r"^RT ", '', tweet_txt)
        tweet_txt = re.sub(' +',' ', tweet_txt)
        new_tweets.append({"hashtags": tweet["hashtags"], "text": tweet_txt.strip()})
    return new_tweets  

In [255]:
tweets_no_rt = remove_rt(tweets_no_duplicates)

In [256]:
only_txt = [tweet["text"] for tweet in tweets_no_rt]

In [259]:
tweets_no_duplicates = filter_duplicates(tweets_no_rt)

In [260]:
len(tweets_no_duplicates)

216259

In [261]:
tweets_no_duplicates[2]

{'hashtags': ['Discord',
  'IndieGame',
  'GameDev',
  'IndieDev',
  'Twitch',
  'LiveStream',
  'Youtube',
  'Gaming'],
 'text': "I'm founder #Discord #IndieGame community houses #GameDev/#IndieDev #Twitch #LiveStream people #Youtube gamers Let's connect #Gaming people one spot join server today"}

## Write this version down

In [263]:
with open("no_stem_no_expand_hashtags_preserved.pickle", "wb") as f:
    pickle.dump(tweets_no_duplicates, f)

## Expand slang

In [265]:
def expand_slang_txt(txt):
    words = txt.split(" ")
    new_words = []
    for word in words:
        if word.lower() in slang_dict:
            new_words.append(slang_dict[word.lower()].lower())
        else:
            new_words.append(word)
    return " ".join(new_words)

In [270]:
expand_slang_hashtags(["Ripped", "WAFL"])

['Ripped', 'whatafuckingloser']

In [266]:
def expand_slang_hashtags(hashtags):
    new_hashtags = []
    for hashtag in hashtags:
        if hashtag.lower() in slang_dict:
            new_hashtags.append(slang_dict[hashtag.lower()].lower().replace(" ", ""))
        else:
            new_hashtags.append(hashtag)
    return new_hashtags

In [272]:
def expand_slang(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = expand_slang_txt(tweet["text"])
        hashtags = expand_slang_hashtags(tweet["hashtags"])
        
        new_tweets.append({"hashtags": hashtags, "text": tweet_txt})
    return new_tweets      

In [273]:
tweets_slang_expanded = expand_slang(tweets_no_duplicates)

In [275]:
tweets_slang_expanded[11231]

{'hashtags': ['UX', 'knowledge'],
 'text': 'Today #UX Team Leader sharing #knowledge'}

## Save

In [276]:
with open("no_stem_expanded_hashtags_preserved.pickle", "wb") as f:
    pickle.dump(tweets_slang_expanded, f)

## Stemming

In [282]:
from nltk.stem.porter import PorterStemmer

In [290]:
def stem_txt(ps, txt):
    words = txt.split(" ")
    new_words = []
    for word in words:
        new_words.append(ps.stem(word))
    return " ".join(new_words)

In [278]:
def stem_hashtags(ps, hts):
    new_hts = []
    for ht in hts:
        new_hts.append(ps.stem(ht))
    return new_hts

In [296]:
def stem_tweets(tweets):
    new_tweets = []
    ps = PorterStemmer()
    counter = 1
    for tweet in tweets:
        print("\r{}/{}".format(counter, len(tweets)), end="")
        tweet_txt = stem_txt(ps, tweet["text"])

        new_tweets.append({"hashtags": tweet["hashtags"], "text": tweet_txt})
        counter += 1
    return new_tweets

In [297]:
tweets_stemmed = stem_tweets(tweets_slang_expanded)

216259/216259

In [299]:
len(tweets_stemmed)

216259

In [298]:
tweets_stemmed[2]

{'hashtags': ['Discord',
  'IndieGame',
  'GameDev',
  'IndieDev',
  'Twitch',
  'LiveStream',
  'Youtube',
  'Gaming'],
 'text': "i'm founder #discord #indiegam commun hous #gamedev/#indiedev #twitch #livestream peopl #youtub gamer let' connect #game peopl one spot join server today"}

# SAVE

In [300]:
with open("all_preproc_hashtags_preserved.pickle", "wb") as f:
    pickle.dump(tweets_stemmed, f)

# Drop all hashtags

In [304]:
def drop_all_hashtags(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = re.sub(r"#(\w+)", "", tweet["text"])
        tweet_txt = re.sub(' +',' ', tweet_txt)        
        new_hashtags = tweet["hashtags"]
        new_tweets.append({"text": tweet_txt, "hashtags": new_hashtags})
    return new_tweets

In [305]:
tweets_no_hashtags = drop_all_hashtags(tweets_stemmed)

In [306]:
tweets_no_hashtags[2]

{'hashtags': ['Discord',
  'IndieGame',
  'GameDev',
  'IndieDev',
  'Twitch',
  'LiveStream',
  'Youtube',
  'Gaming'],
 'text': "i'm founder commun hous / peopl gamer let' connect peopl one spot join server today"}

# SAVE

In [307]:
with open("all_preproc_no_hashtags.pickle", "wb") as f:
    pickle.dump(tweets_no_hashtags, f)

## Hashtags no symbols

In [308]:
def drop_all_hashtag_symbols(tweets):
    new_tweets = []
    for tweet in tweets:
        tweet_txt = re.sub(r"#", "", tweet["text"])
        tweet_txt = re.sub(' +',' ', tweet_txt)        
        new_hashtags = tweet["hashtags"]
        new_tweets.append({"text": tweet_txt, "hashtags": new_hashtags})
    return new_tweets

In [309]:
tweets_no_ht_symbols = drop_all_hashtag_symbols(tweets_stemmed)

In [310]:
tweets_no_ht_symbols[2]

{'hashtags': ['Discord',
  'IndieGame',
  'GameDev',
  'IndieDev',
  'Twitch',
  'LiveStream',
  'Youtube',
  'Gaming'],
 'text': "i'm founder discord indiegam commun hous gamedev/indiedev twitch livestream peopl youtub gamer let' connect game peopl one spot join server today"}

## SAVE

In [311]:
with open("all_preproc_no_ht_symbols.pickle", "wb") as f:
    pickle.dump(tweets_no_ht_symbols, f)