In [1]:
import pandas as pd
# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

from unidecode import unidecode
import os

In [2]:
# Path to the tweet CSV, relative to the notebook's working directory
csv_path = "training.1600000.processed.noemoticon.csv"

# Load the file into a DataFrame; column names are supplied explicitly via `names`
df = pd.read_csv(
    csv_path,
    encoding="ISO-8859-1",
    index_col=False,
    names=["sentiment", "ID", "date", "flag", "user", "text"],
)


In [3]:
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0,sentiment,ID,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Keep only the tweet body and its sentiment label
text = df.loc[:, ['text', 'sentiment']]
text.head()

Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [17]:
## Work on a small random sample because the local machine cannot process the
## full frame. (The original note mentioned frac=0.015; the value actually used
## is 0.002.) A fixed seed makes the sample, and everything derived from it,
## reproducible after a kernel restart.
SAMPLE_FRACTION = 0.002
SAMPLE_SEED = 42

results_df2 = text.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
results_df2.head()

Unnamed: 0,text,sentiment
453607,Why am I still hopeful when I know the chances...,0
187254,auch my wrist! it hurts me so much!!,0
175483,stoped broadcasting on blogtv coz they all lef...,0
1017604,@Iyarchuleta I'm good Been missing Twittervil...,4
1521411,Oh My Word. It slipped my mind that Gemmas bab...,4


In [18]:
# Add a class column that contains the label as a string
# (needed later for StringIndexer).
# NOTE(review): the file name matches the Sentiment140 training dump, whose
# documented polarity codes are 0 = negative, 2 = neutral, 4 = positive. The
# previous code tested for 3, which is not a documented code - confirm.
# Any value missing from the mapping becomes NaN.
polarity_names = {0: 'negative', 2: 'neutral', 4: 'positive'}
results_df2['original'] = results_df2['sentiment'].map(polarity_names)
results_df2.head()

Unnamed: 0,text,sentiment,original
453607,Why am I still hopeful when I know the chances...,0,negative
187254,auch my wrist! it hurts me so much!!,0,negative
175483,stoped broadcasting on blogtv coz they all lef...,0,negative
1017604,@Iyarchuleta I'm good Been missing Twittervil...,4,positive
1521411,Oh My Word. It slipped my mind that Gemmas bab...,4,positive


In [19]:
# Number of rows in the sample
results_df2.shape[0]

3200

In [20]:
## Export the labelled sample as CSV (no index, with header) to upload to the model script
output_dir = 'CSV_cleaned'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_path = os.path.join(output_dir, 'tweets_sample2.csv')
results_df2.to_csv(output_path, index=False, header=True)

In [None]:
######Further analysis on data########

In [None]:
# Run the VADER analysis on every tweet; results_list collects one record per tweet
results_list = []

# Walk the tweet text and its original label together. Iterating the two
# columns directly avoids building a Series per row as iterrows() does.
for tweet, label in zip(text["text"], text["sentiment"]):

    # The returned dict is read below through its "compound", "pos", "neu" and "neg" keys
    results = analyzer.polarity_scores(tweet)

    # Store the scores next to the tweet and its original label.
    # "Neutral" must hold the "neu" score and "Negative" the "neg" score;
    # the two were swapped before, which mislabelled both columns.
    sentiment = {
        "tweet": tweet,
        "Compound": results["compound"],
        "Positive": results["pos"],
        "Neutral": results["neu"],
        "Negative": results["neg"],
        'original': label,
    }

    # Append this tweet's record to 'results_list'
    results_list.append(sentiment)

In [None]:
# Build the results table from the collected records. The tweet text is parked
# in the index while the remaining columns are rounded to 3 decimals, then
# moved back to a regular column.
results_df = (
    pd.DataFrame(results_list)
    .set_index("tweet")
    .round(3)
    .reset_index()
)
results_df.head()

In [None]:
# Narrow the results down to the tweet text and its original label
results_df2 = results_df.loc[:, ["tweet", "original"]]
results_df2.head()

In [None]:
# results_df2.loc[results_df2['original'] == 0, 'original2'] = 'negative'
# results_df2.loc[results_df2['original'] == 3, 'original2'] = 'neutral'
# results_df2.loc[results_df2['original'] == 4, 'original2'] = 'positive'
# results_df2.head()

In [None]:
# Group by the original label. The "original2" column is only produced by a
# commented-out cell, so grouping on it raised a KeyError.
grouped = results_df2.groupby("original")

# numeric_only=True skips the tweet text, which pandas >= 2.0 refuses to average.
# NOTE(review): at this point results_df2 holds only "tweet" and "original", so
# the result lists the groups without any value columns - confirm what this
# cell was meant to summarise.
mean = grouped.mean(numeric_only=True)
mean

In [None]:
## Export results_df2 as a CSV, without the Pandas index, but with the header
output_dir = 'CSV_cleaned'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_path = os.path.join(output_dir, 'tweets_clean3.csv')
results_df2.to_csv(output_path, index=False, header=True)

In [None]:
# Character count of every tweet
tweet_lengths = results_df['tweet'].map(len)
results_df['length'] = tweet_lengths
results_df.head()

In [None]:
## Tokenize tweets with a regex that keeps emoticons, @-mentions, #hash-tags
## and URLs together as single tokens instead of splitting them into pieces.

import re

# Written for re.VERBOSE: whitespace and "# ..." comments inside the pattern are ignored.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(/\\OpP] # Mouth
    )"""

# Alternatives are tried in this order, so the more specific patterns must come first.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # URLs (the character class previously contained an HTML-escaped "&amp;")
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One outer capturing group around non-capturing alternatives, so findall()
# returns plain strings rather than tuples.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant used to test whether a whole token is an emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


def tokenize(s: str) -> list:
    """Split ``s`` into tokens using ``tokens_re``.

    Whitespace is dropped; every other character ends up in some token.
    Returns a list of strings (empty for an empty input).
    """
    return tokens_re.findall(s)


def preprocess(s: str, lowercase: bool = False) -> list:
    """Tokenize ``s`` and optionally lower-case the tokens.

    When ``lowercase`` is true every token is lower-cased except emoticons,
    which are kept verbatim so that e.g. ``:D`` does not become ``:d``.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 


In [None]:
## Add a column holding the token list of every tweet
tokenized_tweets = results_df['tweet'].map(preprocess)
results_df['tokenized'] = tokenized_tweets

In [None]:
# Preview the frame after the latest column was added
results_df.head()

In [None]:
# Show the average of every numeric column (including tweet length) per
# original rating: 0 = negative, 4 = positive

grouped = results_df.groupby("original")

# numeric_only=True leaves out the text and token-list columns; without it
# pandas >= 2.0 raises a TypeError instead of silently dropping them.
mean = grouped.mean(numeric_only=True)
mean

In [None]:
# Show the average of every numeric column (including tweet length) per
# distinct VADER compound score
grouped2 = results_df.groupby("Compound")

# numeric_only=True leaves out the text and token-list columns (required on
# pandas >= 2.0); reset_index() turns "Compound" back into a regular column.
mean2 = grouped2.mean(numeric_only=True).reset_index()
mean2.head()

In [None]:
# Plot results: VADER compound score against the mean tweet length of each compound group
import matplotlib.pyplot as plt

# The plotted series is the VADER compound score, so it is labelled as such
# (it was previously labelled "Original").
plt.plot(mean2["length"], mean2["Compound"], color="red", label="Vader compound")
plt.xlabel("Mean tweet length (characters)")
plt.ylabel("Compound score")
plt.legend()
plt.show()


In [None]:
## Export results_df as a CSV, without the Pandas index, but with the header
output_dir = 'CSV_cleaned'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_path = os.path.join(output_dir, 'tweets_clean.csv')
results_df.to_csv(output_path, index=False, header=True)

In [None]:
# Reduce the frame to a tiny random sample for plotting. The fixed seed keeps
# the plotted points identical across kernel restarts.
PLOT_SAMPLE_FRACTION = 0.00005
PLOT_SAMPLE_SEED = 42

results_df2 = results_df.sample(frac=PLOT_SAMPLE_FRACTION, random_state=PLOT_SAMPLE_SEED)
results_df2.head()

In [None]:
# Reset the index twice: the first pass moves the sampled row labels into an
# "index" column, the second adds a "level_0" column with the new 0..n-1
# positions (used later as the x axis of the scatter plot).
results_df2 = results_df2.reset_index().reset_index()
results_df2.head()

In [None]:
## Fit the scale of the original label so it is comparable to the VADER compound score
# NOTE(review): the file name matches the Sentiment140 training dump, whose
# documented polarity codes are 0 = negative, 2 = neutral, 4 = positive. The
# previous code tested for 3, which is not a documented code - confirm.
# The values stay strings, as before; the scatter-plot cell converts them with
# pd.to_numeric. Labels missing from the mapping become NaN.
label_to_score = {0: '-1.0', 2: '0.0', 4: '1.0'}
results_df2['ori'] = results_df2['original'].map(label_to_score)
results_df2.head()

In [None]:
# Character count of every sampled tweet
sampled_lengths = results_df2['tweet'].map(len)
results_df2['length'] = sampled_lengths
results_df2.head()

In [None]:
import matplotlib.pyplot as plt

# Select the seaborn look BEFORE drawing. It used to be set after plt.show(),
# so it only took effect when the cell was run a second time. Newer Matplotlib
# releases ship the style as "seaborn-v0_8"; fall back to the old name otherwise.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# x axis: the "level_0" column; y axis: rescaled original label (red) and
# VADER compound score (blue) for the same rows
plt.scatter(results_df2["level_0"], pd.to_numeric(results_df2.ori, errors='coerce'), color="red", label="Original")
plt.scatter(results_df2["level_0"], results_df2["Compound"], color="blue", alpha=0.5, label="Vader")

plt.xlabel("Sampled tweet")
plt.ylabel("Tweet Polarity")
## create Legend and place outside of graph
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

plt.show()

In [None]:
# Average of every numeric column per original label, for the plotting sample
grouped = results_df2.groupby("original")

# numeric_only=True leaves out the non-numeric columns (tweet text, tokens,
# string-valued "ori"); pandas >= 2.0 raises a TypeError on them otherwise.
mean = grouped.mean(numeric_only=True)
mean

In [None]:
# Average of every numeric column per tweet length, for the plotting sample
grouped2 = results_df2.groupby("length")

# numeric_only=True leaves out the non-numeric columns (tweet text, tokens,
# string-valued "ori"); pandas >= 2.0 raises a TypeError on them otherwise.
mean2 = grouped2.mean(numeric_only=True)
mean2

In [None]:
# import numpy as np
# users = results_df["ori"]
# labels = results_df["ori"]
# x_axis = np.arange(len(users))
# # Users is our y axis and x_axis is, of course, our x axis
# plt.bar(x_axis, users, color='r', alpha=0.5,
#         tick_label=labels)

# plt.show()

In [None]:
# Export measure_df_clean file as a CSV, without the Pandas index, but with the header
# NOTE(review): `measure_df_clean` is not defined anywhere in the visible
# notebook, so this cell raises a NameError on a fresh kernel. Presumably one of
# the result frames built above was meant - confirm which one before running.
# NOTE(review): the 'CSV_cleaned_test' folder must already exist; to_csv does
# not create it.
output_path = os.path.join('CSV_cleaned_test', 'tweets_clean.csv')
measure_df_clean.to_csv(output_path, index=False, header=True)

# # Export measure_df_clean file as a CSV, without the Pandas index, but with the header
# output_path = os.path.join('CSV_cleaned', 'tweets_clean.csv')
# measure_df_clean.to_csv(output_path, index=False, header=True)

In [None]:
# # WORKS List to hold results
# results_list = []


# # Loop through all target users
# for (idx, row) in text.iterrows():
    

#     # Variables for holding sentiments
#     compound_list = []
#     positive_list = []
#     negative_list = []
#     neutral_list = []

    
  
#     results = analyzer.polarity_scores(row["text"])
#     compound = results["compound"]
#     pos = results["pos"]
#     neu = results["neu"]
#     neg = results["neg"]

#     # Add each value to the appropriate list
#     compound_list.append(compound)
#     positive_list.append(pos)
#     negative_list.append(neg)
#     neutral_list.append(neu)
                
           

#     # Store the Average Sentiments
#     sentiment = {
#         "tweet": row["text"],
#         "Compound": compound_list,
#         "Positive": positive_list,
#         "Neutral": negative_list,
#         "Negative": neutral_list,
#         'original':row["sentiment"],
        
#     }

# #     # Print the Sentiments
# #     print(sentiment)
# #     print()
    
#     # Append airline results to 'results_list'
#     results_list.append(sentiment)