In [None]:
#The first step of any project is obtaining data.
#Here, I interface with the Twitter REST API to download up to 3000 tweets of a specific individual by their Twitter handle, and then discard anything beyond 3000.
#What is nice about this code is that if you want to download enough tweets that the API would normally time out and stop working, this code ensures that it just takes a pause, waits the necessary amount of time, and then continues.
#It is also set up in a way that allows you to continue downloading from the last place that you had to pause at, so that you can avoid duplicates.

import os
import sys

import pandas as pd
import tweepy

#
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50
pd.options.display.width = 150
#
#The keys are obtained with your twitter account through your twitter website
#Authentication
consumer_key = 'EnterYourConsumerKeyHere'
consumer_secret = 'EnterYourConsumerSecretHere'
#Passing the consumer key and secret to the OAuthHandler
#auth = tweepy.OAuthHandler(consumer_key = consumer_key, consumer_secret = consumer_secret)
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
##Creating API object and passing the authentication to it
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print ("Can't Authenticate")
    sys.exit(-1)

import sys
import jsonpickle
import os

account_list1 = "TwitterHandleOfInterestGoesHere"
#If you want ALL THE TWEETS enter some rediculously large number here
maxTweets = 5000000000000000000000000000000000000
#max tweets per query that api permits
tweetsPerQry = 100


results1 = []
#if results1 from a specific ID onwards required set since_id to the ID otherwise default to no lower limit to go back as far back as API allows
sinceId = None
#if results1 only below a specific ID are required set max_id to that ID. Otherwise default to no upper limit and start from the most recent tweet matching the search query
max_id = -1

#I did not end up using these filters, but am including them in case they are of use to others:
#A filter to only download tweets from verified accounts.
#Can be useful if you want to download tweets on a specific topic as opposed to from a specific person
#verified_filter='+filter:verified'
#A filter to only download tweets in English.
#Note that this sometimes fails, e.g. tweets can have an English URL with non-English words
#In that case you can employ a function to filter out all the non-English tweets later in the pipeline
#lang_filter = '+language = "en"'
#A filter to only download tweets that are not simple re-tweets.
#rt_filter = '-filter:retweets'

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        try:
            if (max_id <= 0):
                if (not sinceId):
                    #You can exclude retweets either with a filter or by directly specifying that include_rts=False
                    new_tweets = api.user_timeline(screen_name = account_list1, include_rts = False, count=tweetsPerQry)
                else:
                    new_tweets = api.user_timeline(screen_name = account_list1, include_rts = False, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = api.user_timeline(screen_name = account_list1, include_rts = False, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.user_timeline(screen_name = account_list1, include_rts = False, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                results1.append(tweet)
#                f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
#                       '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


#takes the list results1 and returns a dataframe after storing results in it
def process_results1(results1):
    id_list = [tweet.id for tweet in results1]
    df_name = pd.DataFrame(id_list, columns = ["id"])

#Processing the tweet data by putting it all in a dataframe
    df_name["text"] = [tweet.text for tweet in results1]
    df_name["created_at"] = [tweet.created_at for tweet in results1]
    df_name["retweet_count"] = [tweet.retweet_count for tweet in results1]
    df_name["favorite_count"] = [tweet.favorite_count for tweet in results1]
    df_name["source"] = [tweet.source for tweet in results1]
#processing the user data
    df_name["user_id"] = [tweet.author.id for tweet in results1]
    df_name["user_screen_name"] = [tweet.author.screen_name for tweet in results1]
    df_name["user_name"] = [tweet.author.name for tweet in results1]
#when the user created the account:
    df_name["user_created_at"] = [tweet.author.created_at for tweet in results1]
    df_name["user_description"] = [tweet.author.description for tweet in results1]
    df_name["user_followers_count"] = [tweet.author.followers_count for tweet in results1]
    df_name["user_friends_count"] = [tweet.author.friends_count for tweet in results1]
    df_name["user_location"] = [tweet.author.location for tweet in results1]

#I chose to keep only the last 3000 tweets
    return df_name
df_name = process_results1(results1)
df_name = df_name.iloc[df_name.index < 3000]

In [None]:
#keeping only the column that has the text in it
df_name = df_name.loc[:,['text']]
#Splitting the tweets into three columns, and then combining them to create data points of 3 tweets instead of one
#I ended up using units of 9 tweets to improve the F1 score
df_name = (pd.DataFrame(df_name.values.reshape(-1, 3)))
df_name['text'] = df_name[0] + ' ' + df_name[1] + ' ' + df_name[2]
df_name = df_name.loc[:,['text']]
#Since this person was depressed I added a column to indicate that of their tweets should be classified as such
df_name['depressed'] = 1
df_name


#Repeat this process until you have all the tweets from the different people that you want

In [None]:
#If you find you are having issues playing with the dataframe because of empty cells, try the following
#to strip the space from the otherwise empty tweet cells
df_name['text']=df_name['text'].astype("str")
df_name['text']=df_name['text'].map(str.strip)
#creating a filter that is selecting the not empty cells since the space has been stripped
filter = df_name["text"] != ""
#removing all the rows that have cells that are empty that aren't what the filter holds
df_name = df_name[filter]
#Resetting the index
df_name = df_name.reset_index(drop=True)
df_name

In [None]:
#Once you have all your data in seperate dataframes combine them:
#creating single df
#specifying that axis=0 will join the dataframes by coluns--one on top of the other--as opposed to by rows--one next to the other
data_set = pd.concat([df_name1,df_name2,df_name3,df_name4], axis=0)
data_set = data_set.reset_index(drop=True)