# Retrieve tweets data set
Each tweet has a source (the author), a target (RT/@), a date, semi-cleaned text, full text and an id. The next snippet of code extracts all tweets from the JSON file and removes duplicates (1 300 000 -> 800 000 tweets). The deduplicated tweets are stored in the `jtweets` variable.

In [1]:
import json

In [2]:
# tweet dataset: read the raw JSON export; the `with` block guarantees the
# file handle is closed once parsing is done (JSON text is UTF-8 by spec)
with open('/home/pgay/twitter/elyzee/elyzee_decimated_elyzee_temp_edges_only.json',
          encoding='utf-8') as tweetFile:
    j = json.load(tweetFile)
# keep only tweets (the 'edges' entries) and delete duplicates:
# each dict is turned into a hashable tuple of (key, value) pairs so that a
# set can drop exact duplicates, then every tuple is turned back into a dict.
# NOTE: this requires every value to be hashable, and the resulting order of
# `jtweets` is arbitrary (set iteration order).
jtweets = [dict(t) for t in {tuple(d.items()) for d in j['edges']}]

# Preprocess the tweets
Each tweet has a 'txt' key which holds an already semi-cleaned version of the tweet. Additional preprocessing is still required: tokenizing the tweet, then removing numbers, stopwords and very short words. The newly preprocessed tweet is stored under a new key ('txt_pp'). We perform all the preprocessing up front in order to save precious runtime later.

In [3]:
from stop_words import get_stop_words
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS

In [4]:
def removeNumbersFromTweet(tweet):
    """
    drop purely numeric tokens from a tokenized semi-cleaned ('txt' key) tweet
    a token is dropped only when str.isdigit() is true for it, so mixed tokens
    such as 'b2' or '12.5' are kept
    returns tokenized tweet (a new list, the input is left untouched)
    """
    keptTokens = []
    for token in tweet:
        if token.isdigit():
            continue
        keptTokens.append(token)
    return keptTokens

In [5]:
# cache for the merged stopword set so it is built once, not once per tweet
_STOP_WORDS_CACHE = {}


def _allStopWords():
    """
    returns the union of gensim's STOPWORDS and the french stopwords from the
    stop_words package; built on first use and reused afterwards
    """
    if 'all' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['all'] = STOPWORDS.union(get_stop_words('french'))
    return _STOP_WORDS_CACHE['all']


def removeStopWords(tweet):
    """
    removes french and english stopwords from tokenized tweet
    also removes every word of 3 characters or fewer
    words are lowercased BEFORE being compared to the stopword lists, so that
    capitalised stopwords (e.g. 'Nous', 'Very') are removed as well
    returns tokenized tweet (lowercased words, original order preserved)
    """
    stopWords = _allStopWords()
    loweredWords = (word.lower() for word in tweet)
    # keep only words longer than 3 characters that are not stopwords
    return [word for word in loweredWords
            if len(word) > 3 and word not in stopWords]

In [6]:
def preProcess(tweet):
    """
    final preprocessing of one semi-cleaned tweet string:
    split it on whitespace, pass the tokens through removeNumbersFromTweet,
    then pass the result through removeStopWords
    returns tokenized preprocessed tweet
    """
    tokens = tweet.split()
    withoutNumbers = removeNumbersFromTweet(tokens)
    return removeStopWords(withoutNumbers)
    

In [7]:
def preProcessAllTweets(tweets):
    """
    preprocess every tweet of the list
    each tweet dict is updated in place: the result of preProcess on its
    'txt' value is stored under the new key 'txt_pp'
    returns the same list of tweets (so a list of dict)
    """
    for tweet in tweets:
        tweet['txt_pp'] = preProcess(tweet['txt'])
    return tweets
        

In [8]:
# run the preprocessing once over the whole data set: adds 'txt_pp' to every tweet
jtweets = preProcessAllTweets(jtweets)

# Save tweets to JSON

In [9]:
# persist the preprocessed tweets as a JSON file so other notebooks can load them
with open('jtweets.json', 'w') as jsonFile:
    json.dump(jtweets, jsonFile)