In [1]:
#!pip install -U textblob
#!pip install vaderSentiment
#!pip install flair
#!pip install pycorenlp
#!pip install happytransformer
#!pip install afinn
#!pip install NRCLex
#!pip install senticnet
#!pip install pattern
#!pip install tweepy
#!pip install googletrans==3.1.0a0

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import (
    accuracy_score, 
    recall_score,
    precision_score,
    confusion_matrix
)

from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
import seaborn as sns
from time import time

In [4]:
def model_metrics(target,predicted):
    """Print classification metrics and draw an annotated confusion-matrix heatmap.

    Parameters
    ----------
    target : array-like
        Ground-truth labels.
    predicted : array-like
        Labels produced by the model, in the same order as ``target``.

    Returns
    -------
    None
        The classification report and the raw confusion matrix are printed,
        and the heatmap is drawn on a new 10x6 matplotlib figure.

    Notes
    -----
    The quadrant names (True Negative, False Positive, ...) are only attached
    when the matrix is 2x2. For any other shape (a single class, or more than
    two labels) each cell is annotated with its count and percentage only,
    because the four quadrant names cannot be mapped onto the matrix.
    """
    print("--------------Metrics-------------------")
    print(classification_report(target,predicted))
    print("\n--------------CONFUSION-MATRIX-------------------")
    print('')

    conf_mat = confusion_matrix(target,predicted)
    print('Confusion matrix:\n', conf_mat)

    cell_values = conf_mat.flatten()
    group_counts = ["{0:0.0f}".format(value) for value in cell_values]
    # Each cell as a share of all samples.
    group_percentages = ["{0:.2%}".format(value) for value in cell_values/np.sum(conf_mat)]

    if conf_mat.shape == (2, 2):
        # Row-major order of a binary matrix whose rows are true labels and
        # whose columns are predictions, the smaller label being "negative".
        group_names = ['True Negative','False Positive','False Negative','True Positive']
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    else:
        labels = [f"{v2}\n{v3}" for v2, v3 in zip(group_counts,group_percentages)]
    # The annotation grid must have exactly the shape of the plotted matrix.
    labels = np.asarray(labels).reshape(conf_mat.shape)

    plt.subplots(figsize=(10,6))
    sns.heatmap(conf_mat, annot=labels, fmt='', cmap='Blues')

# TextBlob

In [None]:
from textblob import TextBlob

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
def tb_sentiment(sentence):
    """Label a sentence with TextBlob.

    Returns 0 when the first component of ``TextBlob(sentence).sentiment``
    is below zero, otherwise 4 (a score of exactly zero therefore maps to 4).
    """
    score = TextBlob(sentence).sentiment[0]
    return 0 if score<0 else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: tb_sentiment(text))
end=time()
df.head(100)

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
tb_acc = t/(t+f)
tb_acc

In [None]:
tb_time=end-start
print("The time taken:",tb_time)

In [None]:
model_metrics(df['target'],df['compound'])

# VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
# Analyzer shared by every call. It is created on first use so that the timed
# loop measures scoring only, not repeated construction of the analyzer.
_vader_analyzer = None


def sentiment_scores(sentence):
    """Label a sentence with VADER.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    int
        0 when the ``'compound'`` entry of ``polarity_scores`` is below zero,
        otherwise 4 (a compound score of exactly zero maps to 4).
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    sentiment_dict = _vader_analyzer.polarity_scores(sentence)
    return 0 if sentiment_dict['compound']<0 else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: sentiment_scores(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
vd_acc = t/(t+f)
vd_acc

In [None]:
vd_time=end-start
print("The time taken:",vd_time)

In [None]:
model_metrics(df['target'],df['compound'])

# Flair

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()
classifier = TextClassifier.load('en-sentiment')

In [None]:
def flair_sentiment(sentence):
    """Label a sentence with the module-level flair ``classifier``.

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    int
        0 when the first predicted label is ``"NEGATIVE"``, otherwise 4.
    """
    text = Sentence(sentence)
    classifier.predict(text)
    # Read the class name from the label object rather than from the first
    # word of str(label): if the string form starts with anything other than
    # the class name, the comparison below never matches and every sentence
    # is labelled 4.
    label = text.labels[0].value
    return 0 if label=="NEGATIVE" else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: flair_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
flair_acc = t/(t+f)
flair_acc

In [None]:
flair_time=end-start
print("The time taken:",flair_time)

In [None]:
model_metrics(df['target'],df['compound'])

# Stanford CoreNLP

In [None]:
from pycorenlp import StanfordCoreNLP
import re

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
# Client shared by every call; created on first use.
_corenlp_client = None


def corenlp_sentiment(sentence):
    """Label a sentence with a Stanford CoreNLP server on localhost:9000.

    Every run of characters other than ASCII letters, digits and '.' is
    replaced by a single space, the text is annotated, and the
    ``sentimentValue`` of all returned sentences is averaged.

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    int
        0 when the average sentence sentiment is below 2, otherwise 4.
        When the server returns no sentences (for example for text that is
        empty after cleaning) the result is 4, the same label an average of
        exactly 2 receives.
    """
    global _corenlp_client
    if _corenlp_client is None:
        _corenlp_client = StanfordCoreNLP('http://localhost:9000')

    text=re.sub('[^A-Za-z0-9.]+', ' ',sentence)
    results = _corenlp_client.annotate(text,properties={
        'annotators':'sentiment, ner, pos',
        'outputFormat': 'json',
        'timeout': 50000,
        })
    values = [int(s["sentimentValue"]) for s in results["sentences"]]
    if not values:
        # Nothing to average; avoid dividing by zero.
        return 4
    sentavg = sum(values)/len(values)
    return 0 if sentavg<2 else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: corenlp_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
core_acc = t/(t+f)
core_acc

In [None]:
core_time=end-start
print("The time taken:",core_time)

In [None]:
model_metrics(df['target'],df['compound'])

# AFINN

In [None]:
from afinn import Afinn

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
# Scorer shared by every call. It is created on first use so that the timed
# loop measures scoring only, not repeated construction of the scorer.
_afinn_scorer = None


def afn_sentiment(sentence):
    """Label a sentence with the English AFINN scorer.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    int
        0 when ``Afinn.score`` is below zero, otherwise 4 (a score of
        exactly zero maps to 4).
    """
    global _afinn_scorer
    if _afinn_scorer is None:
        _afinn_scorer = Afinn(language='en')
    sent = _afinn_scorer.score(sentence)
    return 0 if sent<0 else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: afn_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
af_acc = t/(t+f)
af_acc

In [None]:
af_time=end-start
print("The time taken:",af_time)

In [None]:
model_metrics(df['target'],df['compound'])

# Pattern

In [None]:
from pattern.en import sentiment

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
def pattern_sentiment(sentence):
    """Label a sentence with pattern.en's ``sentiment``.

    Returns 0 when the first element of the result is zero or negative,
    otherwise 4. Unlike a strict ``< 0`` test, a score of exactly zero
    is labelled 0 here.
    """
    score = sentiment(sentence)[0]
    return 0 if score<=0 else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: pattern_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
pt_acc = t/(t+f)
pt_acc

In [None]:
pt_time=end-start
print("The time taken:",pt_time)

In [None]:
model_metrics(df['target'],df['compound'])

# Happy Transformer

In [None]:
from happytransformer import HappyTextClassification 

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()
happy_tc = HappyTextClassification("DISTILBERT", "distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)

In [None]:
def ht_sentiment(sentence):
    """Label a sentence with the module-level ``happy_tc`` classifier.

    Returns 0 when the ``label`` of the classification result is
    ``"NEGATIVE"``, otherwise 4.
    """
    prediction = happy_tc.classify_text(sentence)
    return 0 if prediction.label=="NEGATIVE" else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: ht_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
ht_acc = t/(t+f)
ht_acc

In [None]:
ht_time=end-start
print("The time taken:",ht_time)

In [None]:
model_metrics(df['target'],df['compound'])

# NRC Lex

In [None]:
from nrclex import NRCLex

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
def nrc_sentiment(sentence):
    """Label a sentence from its NRCLex affect frequencies.

    Returns 0 when the 'positive' frequency does not exceed the 'negative'
    one (ties included), otherwise 4.
    """
    frequencies = NRCLex(sentence).affect_frequencies
    positive_share = frequencies['positive']
    negative_share = frequencies['negative']
    return 0 if positive_share<=negative_share else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: nrc_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
nrc_acc = t/(t+f)
nrc_acc

In [None]:
nrc_time=end-start
print("The time taken:",nrc_time)

In [None]:
model_metrics(df['target'],df['compound'])

# SenticNet

In [None]:
from senticnet.senticnet import SenticNet

In [None]:
df=pd.read_csv("sentiment140.csv")
df.head()

In [None]:
# Lookup object shared by every call; created on first use so it is not
# rebuilt for every sentence inside the timed loop.
_senticnet = None


def snet_sentiment(sentence):
    """Label a sentence by summing the SenticNet polarity of its words.

    The sentence is split on whitespace; every word SenticNet knows adds its
    polarity value to a running total, unknown words add nothing.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    int
        0 when the summed polarity is below zero, otherwise 4 (a sum of
        exactly zero, including a sentence with no known words, maps to 4).
    """
    global _senticnet
    if _senticnet is None:
        _senticnet = SenticNet()

    sent=0
    for w in sentence.split():
        try:
            polarity_value = _senticnet.polarity_value(w)
        except KeyError:
            # Word is not in SenticNet. Only the failed lookup is ignored;
            # anything else (including KeyboardInterrupt) still propagates.
            continue
        sent+=float(polarity_value)
    return 0 if sent<0 else 4

In [None]:
start=time()
df['compound'] = df['text'].apply(lambda text: snet_sentiment(text))
end=time()
df.head()

In [None]:
# Mark the rows where the prediction agrees with the gold label, then take
# the share of agreeing rows as the accuracy.
df["equal"] = df["target"] == df["compound"]
t = df["equal"].sum()
f = (~df["equal"]).sum()
snet_acc = t/(t+f)
snet_acc

In [None]:
snet_time=end-start
print("The time taken:",snet_time)

In [None]:
model_metrics(df['target'],df['compound'])

# Bar graph

In [None]:
import matplotlib.pyplot as plt
# One bar per evaluated model, showing the accuracy computed in its section.
models=('TextBlob','Vader','Flair','Stanford CoreNLP','AFINN','Pattern','Happy Transformer','NRC Lex','Senticnet')
acc_values=(tb_acc,vd_acc,flair_acc,core_acc,af_acc,pt_acc,ht_acc,nrc_acc,snet_acc)
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(models, acc_values, color='green', width=0.4)
ax.set_xlabel("Lexicon Based Models")
ax.set_ylabel("Accuracy")
ax.set_title("Comparison Of Models")
plt.show()

In [None]:
# One bar per evaluated model, showing the wall-clock time measured around
# its labelling loop.
models=('TextBlob','Vader','Flair','Stanford CoreNLP','AFINN','Pattern','Happy Transformer','NRC Lex','Senticnet')
time_values=(tb_time,vd_time,flair_time,core_time,af_time,pt_time,ht_time,nrc_time,snet_time)
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(models, time_values, color='green', width=0.4)
ax.set_xlabel("Lexicon Based Models")
ax.set_ylabel("time")
ax.set_title("Comparison Of Models")
plt.show()

# Tweet collection

In [5]:
from googletrans import Translator, constants
from pprint import pprint

In [6]:
# Shared googletrans client; get_tweet uses it to translate tweets whose
# language code is not "en".
translator = Translator()

In [7]:
import os

# Twitter API credentials are read from the environment so that no secret is
# stored in the notebook. Export the four variables before starting the
# kernel; a missing one raises KeyError here instead of failing later during
# authentication.
# NOTE(review): the keys and tokens that used to be hard-coded in this cell
# have been exposed and should be revoked and regenerated.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

In [73]:
import tweepy
import re
import pandas as pd
# Build the tweepy API client from the consumer key/secret and the access
# token/secret defined in the credentials cell.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Matches any run of characters other than ASCII letters and digits; the
# keyword cells use it to reject keywords containing spaces or symbols.
regex = "[^a-zA-Z0-9]+"
p=re.compile(regex)


In [77]:
def get_tweet(keyword, count=5):
    """Search Twitter for ``keyword`` and return the tweets with English text.

    Parameters
    ----------
    keyword
        Search query, forwarded unchanged as ``q`` to ``api.search_tweets``.
        NOTE(review): the cells of this notebook pass a list of keywords;
        confirm how tweepy serialises a list for ``q``.
    count : int, default 5
        Maximum number of tweets to retrieve.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns Id, Lang, Tweet, Username, User,
        Date-Time and Location. ``Tweet`` holds the original text when the
        tweet's language code is "en", otherwise the text returned by
        ``translator.translate``; ``Lang`` always holds the original code.
    """
    tweets = tweepy.Cursor(api.search_tweets,
                  q=keyword,
                  lang="",
                ).items(count)

    columns = ['Id','Lang','Tweet','Username','User','Date-Time','Location']
    rows = []
    for tweet in tweets:
        # Only tweets not already marked as English go through the translator.
        if tweet.lang!="en":
            text = translator.translate(tweet.text).text
        else:
            text = tweet.text
        rows.append([tweet.id,tweet.lang,text,tweet.user.screen_name,tweet.user.name,tweet.created_at,tweet.user.location])

    return pd.DataFrame(data=rows, columns=columns)

In [82]:
# NOTE(review): this keyword is sent to get_tweet without the validation used
# in the following cells; the recorded output of this cell shows the API
# rejecting the request with "403 Forbidden".
key1=[" #"]
key1_tweet=get_tweet(key1)
key1_tweet.head()


Forbidden: 403 Forbidden
195 - Missing or invalid url parameter.

In [86]:
key1=["policy"," $"]
# a stays 1 only when no keyword contains a space or special character.
a = 0 if any(p.search(k) for k in key1) else 1

if a==0:
    print("Invalid!!\nEnter keyword without space or special character!!")
else:
    key1_tweet=get_tweet(key1)
    # A bare expression inside an if/else branch is not rendered by the
    # notebook, so the preview is shown explicitly (display is provided by
    # the IPython kernel).
    display(key1_tweet.head())


Invalid!!
Enter keyword without space or special character!!


In [71]:
key2=["hhj","ds"]
# b stays 1 only when no keyword contains a space or special character.
b = 0 if any(p.search(k) for k in key2) else 1

if b==0:
    print("Invalid")
else:
    key2_tweet=get_tweet(key2)
    # A bare expression inside an if/else branch is not rendered by the
    # notebook, so the preview is shown explicitly (display is provided by
    # the IPython kernel).
    display(key2_tweet.head())

Unexpected parameter: since
Unexpected parameter: since


# Pre-Processing

In [None]:
import nltk
# WordNet backs the lemmatizer used in preprocess_tweet. The Punkt tokenizer
# data is required by word_tokenize, which raises LookupError when it is not
# installed.
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases look for the tokenizer data under this id instead.
nltk.download('punkt_tab')

In [20]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
import nltk
# Fetch the NLTK stop-word corpus and keep the English list as a set for the
# membership test in preprocess_tweet.
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

In [None]:
def preprocess_tweet(tweet):
    """Normalise a raw tweet into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase the text; strip URLs; replace @mentions and
    runs of '#' with a space; delete every character in
    ``string.punctuation``; drop a leading "rt" marker; tokenize; discard
    tokens found in ``stop_words``; lemmatize the rest with ``pos='a'``.
    """
    text = tweet.lower()
    # URLs are removed outright.
    text = re.sub(r"http\S+|www\S+|https\S+","",text)
    # Mentions disappear entirely; for hashtags only the '#' is dropped.
    text = re.sub(r'\@\w+|\#+'," ",text)
    text = text.translate(str.maketrans("","",string.punctuation))
    # Retweet marker at the very start of the text.
    text = re.sub(r'^rt[\s]+', '', text)

    lemmatizer = WordNetLemmatizer()
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return " ".join(lemmatizer.lemmatize(token,pos='a') for token in kept_tokens)

In [None]:
# Clean every tweet; tweets that end up empty become NaN so that dropna()
# removes them. The replaced Series is assigned back to the column because an
# in-place replace on a column selection is not guaranteed to reach the frame
# in newer pandas versions, and np.nan is used because NumPy 2.0 removed the
# np.NaN alias.
key1_tweet['clean_tweets'] = key1_tweet['Tweet'].apply(preprocess_tweet).replace("", np.nan)
key1_tweet.dropna(inplace=True)
key1_tweet.to_csv('key1_cleantweets.csv')
key1_tweet.head()

In [None]:
# Clean every tweet; tweets that end up empty become NaN so that dropna()
# removes them. The replaced Series is assigned back to the column because an
# in-place replace on a column selection is not guaranteed to reach the frame
# in newer pandas versions, and np.nan is used because NumPy 2.0 removed the
# np.NaN alias.
key2_tweet['clean_tweets'] = key2_tweet['Tweet'].apply(preprocess_tweet).replace("", np.nan)
key2_tweet.dropna(inplace=True)
key2_tweet.to_csv('key2_cleantweets.csv')
key2_tweet.head()

In [None]:
def sentiment_analysis(sentence):
    """Classify a sentence with the module-level flair ``classifier``.

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    str
        ``"Negative"`` when the first predicted label is ``"NEGATIVE"``,
        otherwise ``"Positive"``.
    """
    text = Sentence(sentence)
    classifier.predict(text)
    # Read the class name from the label object rather than from the first
    # word of str(label): if the string form starts with anything other than
    # the class name, the comparison below never matches and every sentence
    # is reported as "Positive".
    label = text.labels[0].value
    return "Negative" if label=="NEGATIVE" else "Positive"

In [None]:
def sentiment_classify(df):
    """Add a 'sentiment' column to ``df`` and count both classes.

    Every value of ``df['clean_tweets']`` is passed to ``sentiment_analysis``
    and the result is stored in ``df['sentiment']`` (the frame is modified in
    place). Returns the tuple ``(positive_count, negative_count)``.
    """
    df['sentiment'] = df['clean_tweets'].apply(sentiment_analysis)
    sentiment = df["sentiment"]
    positive_count = sentiment.eq("Positive").sum()
    negative_count = sentiment.eq("Negative").sum()
    return positive_count, negative_count

In [None]:
pos1,neg1=sentiment_classify(key1_tweet)
key1_tweet.head()

In [None]:
# Share of each sentiment class among the tweets kept for key1, in percent.
n = key1_tweet.shape[0]
print("Positive sentiment percentage: ", pos1*100/n)
print("Negative sentiment percentage: ", neg1*100/n)

In [None]:
pos2,neg2=sentiment_classify(key2_tweet)
key2_tweet.head()

In [None]:
# Share of each sentiment class among the tweets kept for key2, in percent.
n = key2_tweet.shape[0]
print("Positive sentiment percentage: ", pos2*100/n)
print("Negative sentiment percentage: ", neg2*100/n)

In [None]:
# Compare the share of positive tweets rather than the raw counts, so that a
# keyword is not favoured merely because more of its tweets were retrieved or
# survived cleaning. max(..., 1) keeps the division defined for an empty
# frame. Ties go to key2, as before.
share1 = pos1/max(len(key1_tweet), 1)
share2 = pos2/max(len(key2_tweet), 1)
winner=key1 if (share1>share2) else key2
print(winner,"has a more positive sentiment")