# Sentiment Analysis NLP Project

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk

In [2]:
# Read the raw tweets once and keep that frame untouched; all cleaning is done on a deep copy
# so the original wording can still be looked up by tweetid later.
original_df = pd.read_csv('tweets.csv')
df = original_df.copy(deep=True)

In [3]:
# Display settings: never truncate the tweet text column, and allow up to 500 rows to be shown.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows',500)
# Show the raw tweets as loaded.
df

Unnamed: 0,datetime,tweetid,text
0,2020-11-10 03:12:42,1325999799729053696,"Arthur has a rat named Tesla, pass it on"
1,2020-11-10 03:12:21,1325999708763009030,"I’ll be joining @colbertlateshow tonight, 11/9, at 11:35pm ET on CBS. Tune in. #Election2020 https://t.co/2H3o6mHcVv"
2,2020-11-10 03:12:16,1325999688370282496,@elonmusk @jgrano305 What about Virginia? It’s for lovers... of Tesla insurance.
3,2020-11-10 03:12:14,1325999682024247296,@EastEndResist Same...I put a Tesla system in a couple years ago (I’m in Florida so the Powerwall is good in hurricane season) and the $15 tie-in is it for driving and power. I really love it.
4,2020-11-10 03:11:58,1325999615196454912,Toyota president: Tesla valuation doesn't reflect 'real world' business https://t.co/RIfvzNUHVF
5,2020-11-10 03:11:52,1325999589082554369,"@ValTex77 Yeah .. not a very good article. Doesn't talk about the EV infrastructure that isn't there for Tesla ... and how Nikola used customer down-payments for increasing their capital. Not the case for $HYLN, where the CNG infrastructure is already available for RNG use."
6,2020-11-10 03:11:52,1325999587799298048,"“Mr. Barr’s memo allows U.S. attorneys to bypass that career prosecutor and take their requests to his office for approval, effectively weakening a key safeguard that prevents political interference in an election by the party in power.” https://t.co/EahQXR2yc5"
7,2020-11-10 03:11:51,1325999585584623619,@Reenath @EliBurton_ @Tesla @elonmusk And God Bless Texas!
8,2020-11-10 03:11:46,1325999561559728128,@TeslaJoy Hmm wonder if Tesla has geo location to solve for crimes - my guess is no due to privacy issues
9,2020-11-10 03:11:30,1325999496149528576,Off A Tesla Can’t Go To Sleep ...


In [4]:
# Summarise columns, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   datetime  500 non-null    object
 1   tweetid   500 non-null    int64 
 2   text      500 non-null    object
dtypes: int64(1), object(2)
memory usage: 11.8+ KB


In [5]:
# True if any cell in any column is missing, False otherwise
df.isna().to_numpy().any()

False

In [6]:
# Clean text function
import html
import string

def clean_text(text):
    """Normalise a raw tweet into lowercase ASCII words for sentiment scoring.

    Steps, in order: decode HTML entities, drop @mentions, drop the '#'
    symbol (the hashtag word itself is kept), drop the retweet marker, drop
    URLs, delete ASCII punctuation, fold every whitespace run to one space,
    delete remaining non-printable / non-ASCII characters (emojis, curly
    quotes), delete digits, then trim and lowercase.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, lowercased text with words separated by single spaces and
        no leading or trailing whitespace.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so contractions
    collapse ("doesn't" -> "doesnt", "I'll" -> "ill") and tokens joined only
    by punctuation merge ("same...I" -> "samei").
    """
    text = html.unescape(text) # Decodes entities such as &amp; so they do not survive as words ("amp")
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) # Removes @mentions, including handles containing underscores
    text = re.sub(r'#', '', text) # Removes the '#' symbol only; the hashtag word stays
    text = re.sub(r'\bRT\s+', '', text) # Removes the RT marker only when it starts a word (not inside e.g. "SMART")
    text = re.sub(r'https?:\S+', '', text) # Removes hyperlinks
    text = text.translate(str.maketrans('', '', string.punctuation)) # Removes punctuations
    text = re.sub(r'\s+', ' ', text) # Replaces newlines, carriage returns, tabs and other whitespace runs with one space
    text = re.sub(r'[^ -~]', '', text) # Removes emojis and any other non-printable-ASCII characters
    text = re.sub(r'[0-9]', '', text) # Removes numbers
    # Removing tokens can leave doubled or edge spaces; squeeze them so equal tweets compare equal.
    return re.sub(r' +', ' ', text).strip().lower()

In [7]:
# Overwrite the working copy's text with its cleaned form, one tweet at a time.
df['text'] = df['text'].map(clean_text)

In [8]:
# Inspect the frame after cleaning the text column.
df

Unnamed: 0,datetime,tweetid,text
0,2020-11-10 03:12:42,1325999799729053696,arthur has a rat named tesla pass it on
1,2020-11-10 03:12:21,1325999708763009030,ill be joining tonight at pm et on cbs tune in election
2,2020-11-10 03:12:16,1325999688370282496,what about virginia its for lovers of tesla insurance
3,2020-11-10 03:12:14,1325999682024247296,samei put a tesla system in a couple years ago im in florida so the powerwall is good in hurricane season and the tiein is it for driving and power i really love it
4,2020-11-10 03:11:58,1325999615196454912,toyota president tesla valuation doesnt reflect real world business
5,2020-11-10 03:11:52,1325999589082554369,yeah not a very good article doesnt talk about the ev infrastructure that isnt there for tesla and how nikola used customer downpayments for increasing their capital not the case for hyln where the cng infrastructure is already available for rng use
6,2020-11-10 03:11:52,1325999587799298048,mr barrs memo allows us attorneys to bypass that career prosecutor and take their requests to his office for approval effectively weakening a key safeguard that prevents political interference in an election by the party in power
7,2020-11-10 03:11:51,1325999585584623619,and god bless texas
8,2020-11-10 03:11:46,1325999561559728128,hmm wonder if tesla has geo location to solve for crimes my guess is no due to privacy issues
9,2020-11-10 03:11:30,1325999496149528576,off a tesla cant go to sleep


In [9]:
# Number of distinct cleaned tweets; a value below the row count means duplicates exist.
df['text'].nunique()

400

In [10]:
# We need to remove duplicate texts.
# Duplicates are judged on the *cleaned* text, so tweets that differed only in content
# stripped by clean_text (mentions, links, punctuation, digits, case) count as the same.
# The frame is modified in place.
df.drop_duplicates(subset=['text'],inplace=True)
# Confirm the reduced row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 499
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   datetime  400 non-null    object
 1   tweetid   400 non-null    int64 
 2   text      400 non-null    object
dtypes: int64(1), object(2)
memory usage: 12.5+ KB


In [11]:
# TextBlob scoring helpers: subjectivity and sentiment polarity of a piece of text
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.subjectivity

def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.polarity

In [12]:
# Score every cleaned tweet with TextBlob and store both components as new columns.
df['TB_Subjectivity'] = df['text'].map(getSubjectivity)
df['TB_Polarity'] = df['text'].map(getPolarity)

In [13]:
# Inspect the frame with the TextBlob subjectivity and polarity columns added.
df

Unnamed: 0,datetime,tweetid,text,TB_Subjectivity,TB_Polarity
0,2020-11-10 03:12:42,1325999799729053696,arthur has a rat named tesla pass it on,0.0,0.0
1,2020-11-10 03:12:21,1325999708763009030,ill be joining tonight at pm et on cbs tune in election,1.0,-0.5
2,2020-11-10 03:12:16,1325999688370282496,what about virginia its for lovers of tesla insurance,0.0,0.0
3,2020-11-10 03:12:14,1325999682024247296,samei put a tesla system in a couple years ago im in florida so the powerwall is good in hurricane season and the tiein is it for driving and power i really love it,0.6,0.6
4,2020-11-10 03:11:58,1325999615196454912,toyota president tesla valuation doesnt reflect real world business,0.3,0.2
5,2020-11-10 03:11:52,1325999589082554369,yeah not a very good article doesnt talk about the ev infrastructure that isnt there for tesla and how nikola used customer downpayments for increasing their capital not the case for hyln where the cng infrastructure is already available for rng use,0.430769,0.065385
6,2020-11-10 03:11:52,1325999587799298048,mr barrs memo allows us attorneys to bypass that career prosecutor and take their requests to his office for approval effectively weakening a key safeguard that prevents political interference in an election by the party in power,0.633333,0.2
7,2020-11-10 03:11:51,1325999585584623619,and god bless texas,0.0,0.0
8,2020-11-10 03:11:46,1325999561559728128,hmm wonder if tesla has geo location to solve for crimes my guess is no due to privacy issues,0.375,0.0625
9,2020-11-10 03:11:30,1325999496149528576,off a tesla cant go to sleep,0.0,0.0


In [14]:
# Fetch the 'vader_lexicon' resource through NLTK's downloader before the VADER analyzer is created.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Julian\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [15]:
# Create a single VADER analyzer instance that is reused to score every tweet.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk_score = SentimentIntensityAnalyzer()

In [16]:
# Store the 'compound' entry of VADER's polarity_scores() result for each cleaned tweet.
df['NLTK_Polarity'] = df['text'].map(lambda tweet: nltk_score.polarity_scores(tweet)['compound'])

In [17]:
# Inspect the frame with both the TextBlob and the NLTK (VADER) scores.
df

Unnamed: 0,datetime,tweetid,text,TB_Subjectivity,TB_Polarity,NLTK_Polarity
0,2020-11-10 03:12:42,1325999799729053696,arthur has a rat named tesla pass it on,0.0,0.0,0.0
1,2020-11-10 03:12:21,1325999708763009030,ill be joining tonight at pm et on cbs tune in election,1.0,-0.5,-0.4215
2,2020-11-10 03:12:16,1325999688370282496,what about virginia its for lovers of tesla insurance,0.0,0.0,0.5267
3,2020-11-10 03:12:14,1325999682024247296,samei put a tesla system in a couple years ago im in florida so the powerwall is good in hurricane season and the tiein is it for driving and power i really love it,0.6,0.6,0.8122
4,2020-11-10 03:11:58,1325999615196454912,toyota president tesla valuation doesnt reflect real world business,0.3,0.2,0.0
5,2020-11-10 03:11:52,1325999589082554369,yeah not a very good article doesnt talk about the ev infrastructure that isnt there for tesla and how nikola used customer downpayments for increasing their capital not the case for hyln where the cng infrastructure is already available for rng use,0.430769,0.065385,-0.1085
6,2020-11-10 03:11:52,1325999587799298048,mr barrs memo allows us attorneys to bypass that career prosecutor and take their requests to his office for approval effectively weakening a key safeguard that prevents political interference in an election by the party in power,0.633333,0.2,0.8519
7,2020-11-10 03:11:51,1325999585584623619,and god bless texas,0.0,0.0,0.5994
8,2020-11-10 03:11:46,1325999561559728128,hmm wonder if tesla has geo location to solve for crimes my guess is no due to privacy issues,0.375,0.0625,-0.1027
9,2020-11-10 03:11:30,1325999496149528576,off a tesla cant go to sleep,0.0,0.0,0.0


In [18]:
# Tweets ranked by TextBlob polarity, most positive first, renumbered 0..n-1.
by_TB = df.sort_values('TB_Polarity', ascending=False, ignore_index=True)

In [19]:
# Tweets ranked by NLTK (VADER) polarity, most positive first, renumbered 0..n-1.
by_NLTK = df.sort_values('NLTK_Polarity', ascending=False, ignore_index=True)

In [20]:
# Full TextBlob ranking, most positive first; the five most positive tweets are the
# first rows and the five most negative are the last rows.
by_TB

Unnamed: 0,datetime,tweetid,text,TB_Subjectivity,TB_Polarity,NLTK_Polarity
0,2020-11-10 03:05:39,1325998025291034624,suggest martian wheels excellent quality designed for tesla,1.0,1.0,0.5719
1,2020-11-10 02:52:18,1325994664386727936,ozsc will make tesla the best choice,0.3,1.0,0.6369
2,2020-11-10 03:10:13,1325999171975983104,lmfao he is the best at everything,0.3,1.0,0.8271
3,2020-11-10 02:16:01,1325985531893751814,this doggo inside a tesla is living hisher best life,0.3,1.0,0.6369
4,2020-11-10 02:59:06,1325996376950534145,wheres this its a beautiful sight,1.0,0.85,0.5994
5,2020-11-10 02:38:08,1325991101052841992,anybody with money is going to buy a tesla its a beautiful car,1.0,0.85,0.5994
6,2020-11-10 03:01:32,1325996990186967047,the win your dream tesla giveaway stacksocial via,0.4,0.8,0.7003
7,2020-11-10 02:43:31,1325992455385264133,if youve ever reposted david dobrik to win a tesla block me rn,0.4,0.8,0.2263
8,2020-11-10 03:07:31,1325998494591684608,what color tesla you want lol,0.7,0.8,0.4767
9,2020-11-10 02:23:09,1325987330050551808,the perfect side for this is an orange tesla honestly,0.95,0.8,0.7717


In [21]:
# Tweet ids of the five highest TextBlob polarities.
TB_ids_pos = by_TB['tweetid'].head(5)

In [22]:
# Tweet ids of the five lowest TextBlob polarities, most negative first.
TB_ids_neg = by_TB['tweetid'].iloc[::-1].reset_index(drop=True).head(5)

In [23]:
# Tweet ids of the five highest NLTK (VADER) polarities.
NLTK_ids_pos = by_NLTK['tweetid'].head(5)

In [24]:
# Tweet ids of the five lowest NLTK (VADER) polarities, most negative first.
NLTK_ids_neg = by_NLTK['tweetid'].iloc[::-1].reset_index(drop=True).head(5)

In [25]:
def _original_text(tweet_id):
    """Return the uncleaned text of the first row of original_df whose tweetid equals ``tweet_id``.

    Raises IndexError when no row matches.
    """
    return original_df.loc[original_df['tweetid'] == tweet_id, 'text'].values[0]

# Recover the raw wording for each group of selected tweet ids, keeping their order.
# The iteration variable is tweet_id (scoped to each comprehension) so the builtin id()
# is not overwritten in the notebook namespace.
TB_pos_text = [_original_text(tweet_id) for tweet_id in TB_ids_pos]
TB_neg_text = [_original_text(tweet_id) for tweet_id in TB_ids_neg]
NLTK_pos_text = [_original_text(tweet_id) for tweet_id in NLTK_ids_pos]
NLTK_neg_text = [_original_text(tweet_id) for tweet_id in NLTK_ids_neg]

In [26]:
# Side-by-side table: one column per (library, sentiment direction), one row per rank.
comparison_columns = dict(
    TB_positive=TB_pos_text,
    TB_negative=TB_neg_text,
    NLTK_positive=NLTK_pos_text,
    NLTK_negative=NLTK_neg_text,
)
comparison_df = pd.DataFrame(comparison_columns)

In [27]:
# Compare the original tweets each library ranked as most positive / most negative.
comparison_df

Unnamed: 0,TB_positive,TB_negative,NLTK_positive,NLTK_negative
0,@SCMountainDad @TSportline Suggest Martian Wheels excellent quality designed for Tesla,@teslaownerssv @elonmusk @tesla_raj @klwtts @JVerdura Insane. Just imagine what else he is innovating this way.,"@techAU @Tesla Hmm, maybe you could pony up some of those advertising dollars and buy your wife a Model Y.\n\nIf you want a Tesla, please use a random referral code, and support someone less fortunate with some sweet sweet free kms.\n\nhttps://t.co/O3MYlmUjid","@Reuters Need to remember to be critical of Biden as ppl were critical of Trump. Both are racist, corrupt, sociopaths that cannot be trusted. Biden is not a savior. He was just the lesser of two evils."
1,@elonmusk $OZSC Will make Tesla the best choice .,@The_pKchu @chloegraccce Tesla plug into the wall and stupid girl no realize,"@JohnEG78 @TeslaChillMode @Tesla @BLKMDL3 @omg_tesla @AdamsPolishes Nah we have 3 ICE cars in attendance. Vendors who created apps, services or products for EVs. We are a welcoming club so all are welcome. Not all who walk in a church believe. Tesla club is open and we love to help people see the light. https://t.co/VtavlfjKt7","#UPDATE The head of the #OSCE international observer mission to the US elections has accused Donald Trump of a ""gross abuse of office"" after the president alleged he was being cheated and demanded that vote counting be halted https://t.co/NbKJ9NAefZ https://t.co/GNeLrDI7NQ"
2,@EnglishDanYT @Tesla @elonmusk Lmfao he IS the best at everything https://t.co/b2CDDuGPtz,My dad put a front license plate (Bc it’s the law) on the Tesla and it looks so ugly now 😭,"Better late than never.\n\nBut um.. where are they finding 3,000 talented people to work on this?\n\nAll the best are working at #SpaceX and #Tesla.\n\n$TSLA https://t.co/zvPN8oJPlV","🚨🚨 #Tesla ""Full Self Driving"" Tests Risk Killing People\n\nDid these MOTORCYCLISTS consent to nearly getting killed by a ""self-driving car"" that can't turn safely? \n@Easyriders1 @harleydavidson @CMMmag @ABRmagazine @SoCal__HAMC @OCChoppers #biker\n#motorcycle $TSLA $TSLAQ 1/10 https://t.co/sI5ZUcPIAu"
3,This doggo inside a Tesla is living his/her best life. https://t.co/Ne4VO1q2JC,@BriArsement she is very creepy disco:tesla#3785,Trust Elon Musk....😎\n\nGreat share @FrRonconi 😍\n\n#TheDigitalCoach #Tesla #Fun https://t.co/CXIMWK0mtw,"Ex Microsoft engineer gets 9 yrs in prison for stealing more than $10M from Microsoft, making it look like other employees were stealing, using btc mixers, falsely filing tax returns. Wild story. \n\nhttps://t.co/orteStcL9F"
4,@Tesla Where's this? It's a beautiful sight.,Tesla Released An Absurdly Overpriced Tesla Tequila\r\rhttps://t.co/f49o4EUIDA\n\n#ITintheD373,@SaraJAwesome @EliBurton_ @Tesla @elonmusk WOW!!! God bless Tesla. God bless Sara 🙏🏼👍🏼👌🏼,"@Robotbeat @ghotiing @CruizVinicius @josh119872 @SweetINXS @voteLabonte @Erdayastronaut @Tesla Tesla actually sent out ResMed, Philips &amp; Medtronic units. Latter is fully intratracheal. My personal opinion is that some ICUs are jumping the gun on intubation &amp; setting PEEP &amp; O2 too high. High pressure, pure oxygen increases risk of lung damage. https://t.co/2IUnS5DPOg"


In [29]:
# Original text of the tweet NLTK (VADER) ranked as most positive.
comparison_df.loc[0, 'NLTK_positive']

'@techAU @Tesla Hmm, maybe you could pony up some of those advertising dollars and buy your wife a Model Y.\n\nIf you want a Tesla, please use a random referral code, and support someone less fortunate with some sweet sweet free kms.\n\nhttps://t.co/O3MYlmUjid'