In [1]:
#Importing required packages.
    #tweepy for the API and searching of twitter.
import tweepy

    #string to remove punctuation, unicodedata and unidecode to convert emoji into a text representation.
import string
import unicodedata
from unidecode import unidecode

    #textblob for the sentiment analysis.
from textblob import TextBlob

    #pandas to store it into a dataframe, and csv to write out. 
import pandas as pd
import csv

In [2]:
def pull_tweets(term,date,retweets,lang,amount):
    """ 
        This function creates an Oauth handler for Twitter.com's API using credentials provided in the .py file.
        The function then connects to Twitter's API, and using the Cursor object from Tweepy package it does a 
        search for tweets using the parameters specified. The text field of these tweets are then stored into
        a list, and returned to the caller. 

        Parameters: 
            term         (string): The term used to search twitter for tweets
            date         (string): The date used for earliest search (note: free accounts can only go back 7 days)
            retweets    (Boolean): Whether to include retweets or not. 
            lang         (string): The language of tweet to return.
            amount          (int): The number of tweets to pull. 

        Returns: 
            list: A list of raw tweet information. 

    """
    include_retweets = ' -filter:retweets'
    #enable retweets if requested by the caller
    if retweets == True:
        include_retweets = ''
        
    %run ~/twitter_credentials.py
        #Use tweepy.OAuthHandler to create an authentication using the given key and secret
    auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
    auth.set_access_token(acc_token, acc_secret)
        #connect to the twitter API using the authentication
    api = tweepy.API(auth)  
        #using cursor search twitter using given parameters - extended mode so it doesnt truncate. 
    tweet_block = tweepy.Cursor(api.search, q=term+include_retweets, lang=lang, since=date, tweet_mode='extended').items(amount)
        #pull text from cursor item
    raw_text_tweets = [tweet.full_text for tweet in tweet_block]
    
    return(raw_text_tweets)

#fetch up to 100 English-language tweets matching "FF7R" since 2020-05-03, with retweets filtered out
raw_tweets = pull_tweets("FF7R","2020-05-03",False,"en",100)

In [3]:
def depunctify_dynamic(text, replacement_chars=string.punctuation):
    """ 
    Removes every occurrence of the given characters from the text. This "cleans" the
    text based on what was passed for the replacement_chars.
   
    Parameters: 
        text                (string): The text to remove punctuation from.
        replacement_chars   (string): The characters to remove from the text.
                                      Default value is the string.punctuation value.
                                      Any other iterable of strings is also accepted; each
                                      item is then removed in turn, in iteration order.
        
    Returns: 
        string: A string of the "cleaned" text value.
  
    """
    if isinstance(replacement_chars, str):
        #a string is a set of single characters: delete them all in one pass over the text
        #instead of running one full replace() per character
        return text.translate(str.maketrans('', '', replacement_chars))

    #non-string iterables may hold multi-character tokens, which translate() cannot express,
    #so fall back to removing each token in turn
    for token in replacement_chars:
        text = text.replace(token, '')
    return text

In [4]:
def clean_tweets(raw_tweets):
    """ 
    Accepts a list of raw tweets, cleans them, and returns the processed tweets.

    Each tweet is split on whitespace into words. Words that start with "http://" or
    "https://" are discarded as URLs. Every other word is passed through depunctify_dynamic()
    to strip punctuation, except "?" and "!", which are kept. Words left empty by the cleaning
    are dropped, so the re-joined tweet never contains doubled or leading/trailing spaces.
   
    Parameters: 
        raw_tweets  (list): A list of tweets that contain undesired formatting in their body of text
        
    Returns: 
        list: A list of the cleaned tweets, one per input tweet and in the same order.
              A tweet with nothing left after cleaning becomes an empty string.
  
    """
    #characters to strip: all of string.punctuation except "?" and "!", plus curly quotes and
    #the ellipsis character. built once here rather than once per word.
    strip_chars = string.punctuation.replace('?','').replace('!','') + "”“‘…’"
    url_prefixes = ("http://", "https://")

    cleaned = []
    for tweet in raw_tweets:
        kept_words = []
        for word in tweet.split():
            #URLs are removed outright
            if word.startswith(url_prefixes):
                continue
            word = depunctify_dynamic(word, strip_chars)
            #skip words that were nothing but punctuation, so no empty slot is joined in
            if word:
                kept_words.append(word)
        cleaned.append(" ".join(kept_words))

    return cleaned

#strip URLs and punctuation (other than "?" and "!") from the pulled tweets
cleaned_tweets = clean_tweets(raw_tweets)

In [5]:
#I did not create this function (although I did add comments for my own edification).
#Credit goes to: user3082900 on Stackoverflow:
#https://stackoverflow.com/questions/43797500/python-replace-unicode-emojis-with-ascii-characters/43813727#43813727

def deEmojify(inputString):
    """
    Returns inputString with every non-ASCII character replaced by a text stand-in.

    ASCII characters are copied through unchanged. For any other character the result of
    unidecode() is used when it is non-empty; failing that, the character's Unicode name
    wrapped in square brackets; and "[x]" when the character has no Unicode name.

    Parameters:
        inputString (string): The text to decode.

    Returns:
        string: The decoded text.

    """
    pieces = []
    for symbol in inputString:
        #plain ASCII passes straight through
        if symbol.isascii():
            pieces.append(symbol)
            continue

        #first choice: whatever unidecode offers for the character
        ascii_form = unidecode(str(symbol))
        if ascii_form != '':
            pieces.append(ascii_form)
            continue

        #second choice: the bracketed unicode name; unnamed characters become [x]
        try:
            label = "[" + unicodedata.name(symbol) + "]"
        except ValueError:
            label = "[x]"
        pieces.append(label)

    return "".join(pieces)

    #run every cleaned tweet through deEmojify() so non-ASCII characters (emoji included) become text
cleaned_decoded_tweets = [deEmojify(tweet) for tweet in cleaned_tweets]

In [6]:
def conduct_sentiment_analysis(cleaned_tweets):
    """ 
    Accepts a list of cleaned tweets and scores each one with the TextBlob package.

    First it "blobs" each cleaned tweet (wraps it in a TextBlob). Second, it reads the
    sentiment polarity of each blob and pairs it with the tweet text. Third, a dataframe is
    built from those pairs and a binary column is added: isPositive = 1 if the polarity is
    greater than zero, else 0.
   
    Parameters: 
        cleaned_tweets  (list): A list of tweets that are ready for sentiment analysis.
        
    Returns: 
        DataFrame: One row per tweet with the columns "Polarity", "Tweet" and "isPositive".
                   An empty input list yields an empty dataframe with the same three columns.
  
    """
    #"blobbing" each tweet
    blobs = [TextBlob(tweet) for tweet in cleaned_tweets]
    #one [polarity, text] pair per blob
    rows = [[blob.sentiment.polarity, str(blob)] for blob in blobs]
    #polarity in one column, tweet text in the other
    df = pd.DataFrame(rows, columns=["Polarity","Tweet"])
    #vectorized flag instead of a row-wise apply: apply(axis=1) on an empty frame returns a
    #DataFrame rather than a Series, which cannot be assigned to a single column
    df['isPositive'] = (df["Polarity"] > 0).astype("int64")

    return df

    #run the sentiment analysis on the decoded tweets and keep the result in a dataframe
tweet_df = conduct_sentiment_analysis(cleaned_decoded_tweets)
    #last expression in the cell, so the dataframe is shown as the cell output
tweet_df

Unnamed: 0,Polarity,Tweet,isPositive
0,0.000000,the only stairs i can climb is the 59 flight o...,0
1,0.189660,Final thoughts on FF7R side quests were fun an...,1
2,-0.400000,Its moments like this in DFFOO that I apprecia...,0
3,-0.400000,FUCK YOU JULES FF7R,0
4,0.875000,Wanted to RP but I got distracted finishing FF...,1
...,...,...,...
95,0.350000,Look I just cry every time I see her and hear ...,1
96,0.031250,Surprise! A little morning FF7R stream We just...,1
97,0.020833,Morning bois hows everyones Sunday so far? Gon...,1
98,-0.145833,Me after finally beating Hell House on hard mo...,0


🎯Cell changed to markdown since we are not writing out to file in the final submission.

🎯This cell was used to write out tweets in increments of 1000 over the course of 3 sessions, pulling data twice per session.

🎯The per-session files are then merged into the combined dataset with the code displayed in the other markdown cell below.

`tweet_df.to_csv("C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/tweets.csv", index=False)`

In [7]:
def character_counter(df):
    """ 
    Counts how many times each character's name appears in the "Tweet" column of the dataframe.

    Every tweet is split on whitespace into words. Each word is lowercased, has any leading or
    trailing punctuation stripped, and is compared to the known spellings of each character
    name. A match increments that character's count. Entries in the column that are not strings
    (for example missing values) are skipped.
   
    Parameters: 
        df (DataFrame): A dataframe containing the text of the tweets in a "Tweet" column.
        
    Returns: 
        Dictionary: The frequency count for each character name, keyed "Cloud", "Tifa",
                    "Aerith", "Sephiroth" and "Barret" in that order.
  
    """
    #lowercase spellings that count as a mention of each character
    aliases = {
        "Cloud": ("cloud", "cloudstrife"),
        "Tifa": ("tifa", "tifalockhart"),
        "Aerith": ("aerith", "aerithgainsborough"),
        "Sephiroth": ("sephiroth",),
        "Barret": ("barret", "barretwallace"),
    }
    #reverse lookup: spelling -> character name
    alias_to_name = {alias: name for name, group in aliases.items() for alias in group}

    #every character starts at zero so all five always appear in the result
    counts = dict.fromkeys(aliases, 0)

    for tweet in df["Tweet"]:
        #a missing value (e.g. NaN) has no words to split, so skip it
        if not isinstance(tweet, str):
            continue
        for word in tweet.split():
            #strip attached punctuation so that e.g. "Tifa!" still counts as "tifa"
            name = alias_to_name.get(word.lower().strip(string.punctuation))
            if name is not None:
                counts[name] += 1

    return counts

#count character-name mentions in the analysed tweets; the returned dict is shown as the cell output
character_counter(tweet_df)

{'Cloud': 5, 'Tifa': 9, 'Aerith': 7, 'Sephiroth': 6, 'Barret': 1}

🎯Changing this cell to markdown since the final submission did not include all 6 source files in my GitHub repository (just the combined file). The code maps the read_csv function over each .csv file, then concatenates the resulting dataframes into the combine_tweet_df dataframe.

`combine_tweet_df = pd.concat(map(pd.read_csv,["C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/FF7R_tweets_4_19_2020.csv","C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/FF7R_tweets_4_24_2020.csv","C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/FF7R_tweets_4_26_2020.csv","C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/finalfantasy7remake_tweets_4_19_2020.csv","C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/finalfantasy7remake_tweets_4_24_2020.csv","C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/finalfantasy7remake_tweets_4_26_2020.csv"]))`


🎯Output combined .csv files into one combined_dataset
`combine_tweet_df.to_csv("C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/combined_dataset.csv", index=False)`


In [8]:
def write_character_frequency(df, target_output):
    """ 
    Saves the character-name frequency counts for a dataframe as a .csv file.

    The counts come from character_counter(df); each character is written as one
    "name,count" row, with no header row.
   
    Parameters: 
                df (DataFrame): The dataframe that will be parsed for character names. 
        target_output (string): The pathway and file name for data to be written.
        
    Returns: 
        String: A simple message that lets the user know the function was successfully executed.
  
    """
    #newline='' leaves line endings to the csv module
    with open(target_output, 'w', newline='') as csv_handle:
        #the counts are computed after the file is opened, then written one row per character
        csv.writer(csv_handle).writerows(
            [name, tally] for name, tally in character_counter(df).items()
        )

    return "File Successfully Saved."

🎯Cell changed to markdown since we are not writing out to file in final submission.

🎯This produced the character frequency counts, using the combined data set.

`write_character_frequency(combine_tweet_df,"C:/Users/joshj/Documents/GitHub/ds710spring2020finalproject/character_frequency.csv")`