### Import Dependencies and Initialize Twitter API

In [9]:
import os
import re
import time
from string import punctuation

import nltk
import tweepy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Your Twitter app credentials are read from environment variables so that
# secrets are never stored in (or shared along with) the notebook itself.
# Set TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and
# TWITTER_ACCESS_SECRET before starting the kernel; a missing variable raises
# KeyError here rather than failing later with an authentication error.
CONSUMER_KEY = os.environ["TWITTER_CONSUMER_KEY"]
CONSUMER_SECRET = os.environ["TWITTER_CONSUMER_SECRET"]
ACCESS_TOKEN = os.environ["TWITTER_ACCESS_TOKEN"]
ACCESS_SECRET = os.environ["TWITTER_ACCESS_SECRET"]

# create twitter api object with your credentials using tweepy
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# wait_on_rate_limit makes tweepy pause until the rate-limit window resets
# instead of raising part-way through a long timeline download.
api = tweepy.API(auth, wait_on_rate_limit = True)

### Create Dataset
Twitter limits how many tweets its API will return from a given user's timeline. This limit is 3,200, so we will gather a dataset of up to the 3,200 most recent tweets from a user using the functions below. The tweets are saved to a CSV file named after the Twitter handle that is passed into the function.

In [2]:
def getTweets(twitter_handle, max_retries = 5, wait_seconds = 60):
    """Lazily yield the full text of each tweet on a user's timeline.

    Replies and retweets are excluded, and tweets are requested in
    'extended' mode so that ``full_text`` is available on every status.

    The Twitter API is only contacted while the returned generator is being
    iterated, so errors have to be handled inside the iteration itself;
    wrapping just the creation of the cursor in ``try``/``except`` would
    never see them.

    Args:
        twitter_handle: Screen name of the account to download.
        max_retries: How many consecutive failed requests are tolerated
            before the error is re-raised.
        wait_seconds: Pause, in seconds, between a failed request and the
            next attempt.

    Yields:
        str: The ``full_text`` of each status, in the order the API returns
        them.

    Raises:
        tweepy.TweepError: If more than ``max_retries`` requests in a row
            fail.
    """
    tweets = tweepy.Cursor(api.user_timeline, screen_name = twitter_handle, exclude_replies = True, include_rts = False, tweet_mode = 'extended').items()

    consecutive_failures = 0
    while True:
        try:
            status = next(tweets)
        except StopIteration:
            # Timeline exhausted: end the generator cleanly.
            return
        except tweepy.TweepError:
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Give up instead of looping forever on a permanent error
                # (bad credentials, unknown or protected account, ...).
                raise
            print("Waiting...")
            time.sleep(wait_seconds)
            continue

        # A successful request resets the failure budget.
        consecutive_failures = 0
        yield status.full_text

def buildDataset(twitter_handle):
    """Download a user's tweets and store them one per line in '<handle>.csv'.

    The output file is (re)created in the current working directory and is
    UTF-8 encoded. Line breaks inside a tweet are collapsed to a single space
    so that every tweet occupies exactly one line; otherwise a multi-line
    tweet would be read back as several separate tweets by any line-oriented
    consumer of the file.

    Args:
        twitter_handle: Screen name of the account; also used as the base
            name of the output file.
    """
    data = getTweets(twitter_handle)
    with open(twitter_handle+'.csv', 'w', encoding = 'utf-8') as file:
        for tweet in data:
            # Keep the one-tweet-per-line invariant of the output file.
            file.write(re.sub(r'[\r\n]+', ' ', tweet))
            file.write('\n')

In [3]:
# Download the @JustinTrudeau timeline into JustinTrudeau.csv
buildDataset(twitter_handle = "JustinTrudeau")

In [3]:
# Download the @realDonaldTrump timeline into realDonaldTrump.csv
buildDataset(twitter_handle = "realDonaldTrump")

### Cleaning and Preprocessing Dataset

In [4]:
def cleanData(twitter_handle):
    """Normalise the tweets in '<handle>.csv' and write '<handle>_cleaned.csv'.

    Each line of the input file is processed independently:

    * converted to lower case,
    * URLs (``http://...`` / ``https://...``) are removed, leaving any text
      that follows them on the line intact,
    * ``@user`` mentions are removed,
    * the leading ``#`` of a hashtag is dropped while the tag text is kept.

    Both files are UTF-8 encoded and are closed automatically by the ``with``
    statement, even if processing fails part-way through.

    Args:
        twitter_handle: Base name shared by the input and output files.
    """
    # A URL ends at the first whitespace character. Matching with '.*'
    # instead would also delete every word after the first URL on the line.
    url_pattern = re.compile(r'https?://\S+')
    mention_pattern = re.compile(r'@\S+')
    hashtag_pattern = re.compile(r'#(\S+)')

    with open(twitter_handle+'.csv', 'r', encoding = 'utf-8') as file, \
         open(twitter_handle+'_cleaned.csv', 'w', encoding = 'utf-8') as clean_file:
        for tweet in file:
            tweet = tweet.lower() #lower case
            tweet = url_pattern.sub('', tweet) #remove all URLs
            tweet = mention_pattern.sub('', tweet) #remove the @users
            tweet = hashtag_pattern.sub(r'\1', tweet) # remove the # in #hashtag
            clean_file.write(tweet)

In [7]:
# Clean JustinTrudeau.csv and write the result to JustinTrudeau_cleaned.csv
cleanData(twitter_handle = "JustinTrudeau")

In [8]:
# Clean realDonaldTrump.csv and write the result to realDonaldTrump_cleaned.csv
cleanData(twitter_handle = "realDonaldTrump")

In [10]:
# Collect NLTK's English stop words into a set and show them.
stop_words = set()
stop_words.update(stopwords.words('english'))
print(stop_words)

{'both', 'down', 'she', 'be', 'it', "it's", 'have', 'shouldn', 'himself', 'if', 'with', 'i', 'wasn', 'me', 'but', 'other', 'itself', 'their', 'until', 'been', 'should', 'doesn', 'further', 'any', "you're", "mightn't", "haven't", 'above', 'this', "shouldn't", 'ourselves', 'they', 'while', 'out', 'off', 'herself', 'now', 'aren', 'of', 'nor', 'does', 'll', 'our', 'haven', 'his', 'what', 'few', 'd', "couldn't", 'up', "didn't", "needn't", 'yourselves', 'being', 'about', 'on', 'having', 'did', 'hasn', 'your', 'in', 'own', 's', 'after', 'shan', 'such', 'mustn', 'will', 'won', 're', 'to', 'doing', 'o', 'we', 'yours', 'ma', 'mightn', 'an', 'over', "hadn't", 'more', 'all', 'so', 'during', 'didn', 'against', 'there', 'weren', "you've", 'myself', 'then', 'you', 'do', 'not', 'most', 'couldn', "you'll", 'before', 'and', 'has', 'each', 'don', 'when', 'wouldn', 'the', 'y', 'them', 'again', 'that', 'him', "isn't", "don't", "she's", 'hadn', 'why', "mustn't", 'at', 'same', 'isn', "doesn't", 'theirs', 'to