# Sentiment Analysis on Twitter data

In [1]:
import re

In [2]:
# Display the value of every expression statement in a cell, not only the last one
# (IPython's default setting for ast_node_interactivity is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Auto-save the notebook every 1 second (the %autosave argument is an interval in seconds).
%autosave 1

Autosaving every 1 seconds


### Open the text files 

In [4]:
# Tweet text files to be analysed
trump_file = open("Trump.txt", mode="r")
covid_file = open("covid.txt", mode="r")

# Word lists used to classify each word of the tweet text
pos = open("positive.txt", mode="r")
neg = open("negative.txt", mode="r")
st = open("stop.txt", mode="r")

### Read the Text files to store their contents

In [5]:
# Entering the already-open handles as context managers guarantees that every
# file is closed once its contents are in memory, even if one of the reads fails.
with trump_file, covid_file, pos, neg, st:
    # Full text of each tweet file
    trump_string = trump_file.read()
    covid_string = covid_file.read()

    # Each word list is read as whitespace-separated words. The words are stored
    # as dictionary keys because a dictionary look-up is faster than iterating
    # through a list; the values (1, 2, 3) are not used by the membership tests.
    positive_words = dict.fromkeys(pos.read().split(), 1)
    negative_words = dict.fromkeys(neg.read().split(), 2)
    stop_words = dict.fromkeys(st.read().split(), 3)

### Function to clean string and split

In [6]:
def clean_and_split(file_string):
    """Remove tweet noise from raw text and return the remaining words.

    The steps run in a fixed order:
      1. delete URLs (anything starting with "http" up to the next whitespace),
      2. delete @username mentions,
      3. turn the literal text \\xe2\\x80\\x99 into an apostrophe,
      4. delete a fixed list of punctuation marks and retweet markers,
      5. split the result on whitespace.

    Parameters
    ----------
    file_string : str
        Raw text to clean.

    Returns
    -------
    list of str
        The whitespace-separated words left after cleaning.
    """
    # URL pattern first, then @username pattern.
    # Reference https://stackoverflow.com/a/40823105
    for pattern in (r'http\S+', r'@\S+'):
        file_string = re.sub(pattern, '', file_string)

    # \xe2\x80\x99 are the UTF-8 bytes of a right single quotation mark; here they
    # appear as escaped text, so they are swapped for a plain apostrophe.
    cleaned = file_string.replace("\\xe2\\x80\\x99", "'")

    # Substrings that are deleted outright, one after another in this order.
    unwanted = (".", "\"", ",", ":", "?", "!", "#", "@", "\\", "'b'RT", "&amp", "/", "b'RT")
    for fragment in unwanted:
        cleaned = cleaned.replace(fragment, "")

    return cleaned.split()

### Function to look up whether each word is a positive, negative, or stop word

In [7]:
def compare(file_list, positive=None, negative=None, stop=None):
    """Count the positive, negative, stop, and other words in a list of words.

    Every word is stripped and lower-cased, then counted in exactly one
    category. The categories are checked in this order: positive, negative,
    stop; a word matching none of them is counted as "other".

    Parameters
    ----------
    file_list : iterable of str
        Words to classify.
    positive, negative, stop : container of str, optional
        Word lists to match against (anything that supports ``in``). When a
        list is left as None, the notebook-level ``positive_words``,
        ``negative_words`` or ``stop_words`` is used instead.

    Returns
    -------
    tuple of int
        ``(count_pos, count_neg, count_st, count_others)``. The same four
        counts are also printed.
    """
    # Fall back to the notebook-level word lists so existing one-argument calls keep working.
    if positive is None:
        positive = positive_words
    if negative is None:
        negative = negative_words
    if stop is None:
        stop = stop_words

    count_pos = 0
    count_neg = 0
    count_st = 0
    count_others = 0

    for item in file_list:
        # Normalise the word before the look-up
        word = item.strip().lower()
        if word in positive:
            count_pos += 1
        elif word in negative:
            count_neg += 1
        elif word in stop:
            count_st += 1
        else:
            count_others += 1

    # Print statement to display the count and return the values
    print("The count of:\n\tpositive words: %d\n\tnegative words: %d\n\tstop words: %d\n\tother words: %d\n" % (count_pos, count_neg, count_st, count_others))
    return count_pos, count_neg, count_st, count_others

### Function to calculate ratios and analyze if overall sentiment is Positive/Negative/Neutral

In [8]:
def analyze(file_list, counts=None):
    """Print the overall sentiment and the share of each word category.

    The sentiment is POSITIVE when there are more positive than negative
    words, NEGATIVE when there are fewer, and NEUTRAL when the two counts
    are equal. Each ratio is the category count divided by ``len(file_list)``.

    Parameters
    ----------
    file_list : list of str
        The words that were counted; only its length is used.
    counts : tuple of int, optional
        ``(positive, negative, stop, others)`` counts. When omitted, the
        notebook-level variables ``Count_positive``, ``Count_negative``,
        ``Count_stop`` and ``Count_others`` are read instead, so existing
        one-argument calls keep working.

    Returns
    -------
    None
        Results are only printed.
    """
    if counts is None:
        # Backward-compatible path: take the counts from the notebook-level variables.
        counts = (Count_positive, Count_negative, Count_stop, Count_others)
    positive, negative, stop, others = counts

    # Calculating the length of the list to get total count of words
    total_count = len(file_list)

    # Compute the ratios; an empty word list yields 0.0 instead of a ZeroDivisionError
    ratio_pos, ratio_neg, ratio_stop, ratio_others = [
        (count / total_count) if total_count else 0.0
        for count in (positive, negative, stop, others)
    ]

    # Sentiment Analysis
    pos_neg_sum = positive - negative

    # Sentiment Conclusion (message texts are kept exactly as they were)
    if pos_neg_sum > 0:
        print("The overall sentiment of the Twitter data collected is POSITIVE\n")
    elif pos_neg_sum == 0:
        print("The overall sentiment of the Twitter data collected is NEUTRAL\n")
    else:
        print("The overall sentiment of the twitter data collected is NEGATIVE\n")

    # Print the ratios and difference of positive and negative word count for sentiment conclusion
    print("Positive words ratio: %f\nNegative words ratio: %f\nStop words ratio: %f\nOther words ratio: %f\nDifference of positive and negative words: %d\n" % (ratio_pos, ratio_neg, ratio_stop, ratio_others, pos_neg_sum))

In [9]:
def check_word(file_list, excluded_word="trump", positive=None, negative=None, stop=None):
    """Count word categories while never counting one keyword as positive.

    Works like a plain positive/negative/stop/other count, except that a word
    equal to ``excluded_word`` (after stripping and lower-casing) is not
    allowed to match the positive list. It is still checked against the
    negative and stop lists and otherwise counted as "other".

    Parameters
    ----------
    file_list : iterable of str
        Words to classify.
    excluded_word : str, optional
        Keyword to keep out of the positive count; compared case-insensitively.
        Defaults to "trump".
    positive, negative, stop : container of str, optional
        Word lists to match against (anything that supports ``in``). When a
        list is left as None, the notebook-level ``positive_words``,
        ``negative_words`` or ``stop_words`` is used instead.

    Returns
    -------
    tuple of int
        ``(count_pos, count_neg, count_st, count_others)``. The same four
        counts are also printed.
    """
    # Fall back to the notebook-level word lists so existing one-argument calls keep working.
    if positive is None:
        positive = positive_words
    if negative is None:
        negative = negative_words
    if stop is None:
        stop = stop_words

    # Words are lower-cased before comparison, so the keyword must be too.
    excluded = excluded_word.lower()

    count_pos = 0
    count_neg = 0
    count_st = 0
    count_others = 0

    for item in file_list:
        # Normalise the word before the look-up
        word = item.strip().lower()

        # The excluded keyword skips only the positive check
        if word != excluded and word in positive:
            count_pos += 1
        elif word in negative:
            count_neg += 1
        elif word in stop:
            count_st += 1
        else:
            count_others += 1

    # Print statement to display the count and return the values
    print("The count of\n\tpositive words: %d\n\tnegative words: %d\n\tstop words: %d\n\tother words: %d\n" % (count_pos, count_neg, count_st, count_others))
    return count_pos, count_neg, count_st, count_others
            

### Call the functions to analyze Trump.txt and covid.txt files

In [10]:
# Clean the Trump.txt text and split it into a list of words
trump_list = clean_and_split(trump_string)

# Count and analyze Trump.txt with the keyword "trump" kept out of the positive count.
# The counts must be stored under exactly these Count_* names: analyze(trump_list)
# picks them up as notebook-level variables rather than receiving them as arguments.
print("Sentiment analysis for Trump.txt file without the keyword 'trump'\n")
Count_positive, Count_negative, Count_stop, Count_others = check_word(trump_list)
analyze(trump_list)
print("-----------------------------\n")

# Count and analyze Trump.txt again, this time letting "trump" match the word lists.
# The Count_* variables are overwritten here before analyze() runs a second time.
print("Sentiment analysis for Trump.txt file with the keyword 'trump'\n")
Count_positive, Count_negative, Count_stop, Count_others = compare(trump_list)
analyze(trump_list)

Sentiment analysis for Trump.txt file without the keyword 'trump'

The count of
	positive words: 123
	negative words: 214
	stop words: 1996
	other words: 2831

The overall sentiment of the twitter data collected is NEGATIVE

Positive words ratio: 0.023819
Negative words ratio: 0.041441
Stop words ratio: 0.386522
Other words ratio: 0.548218
Difference of positive and negative words: -91

-----------------------------

Sentiment analysis for Trump.txt file with the keyword 'trump'

The count of:
	positive words: 295
	negative words: 214
	stop words: 1996
	other words: 2659

The overall sentiment of the Twitter data collected is POSITIVE

Positive words ratio: 0.057126
Negative words ratio: 0.041441
Stop words ratio: 0.386522
Other words ratio: 0.514911
Difference of positive and negative words: 81



In [11]:
# Clean the covid.txt text and split it into a list of words
covid_list = clean_and_split(covid_string)

# Count the word categories and analyze the sentiment of covid.txt.
# The counts are stored in the notebook-level Count_* variables because
# analyze(covid_list) reads them from there instead of taking them as arguments.
print("Sentiment analysis for covid.txt file\n")
Count_positive, Count_negative, Count_stop, Count_others= compare(covid_list)
analyze(covid_list)

Sentiment analysis for covid.txt file

The count of:
	positive words: 289
	negative words: 389
	stop words: 4291
	other words: 6413

The overall sentiment of the twitter data collected is NEGATIVE

Positive words ratio: 0.025391
Negative words ratio: 0.034177
Stop words ratio: 0.376999
Other words ratio: 0.563433
Difference of positive and negative words: -100

