<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Sentiment Analysis for Twitter Data</h1>

<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Problem Statement</h1>
<li>Study the topics of recent tweets about the vaccine developed jointly by Pfizer and BioNTech, and perform various NLP tasks on this data source.</li>



<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">About Data Set</h1>
<li>The data consists of recent tweets about the Pfizer and BioNTech vaccine.</li>
<li>The data was collected with the tweepy Python package, which accesses the Twitter API.</li>



<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Importing Libraries</h1>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#For basic table operation
import pandas as pd

#For work with arrays
import numpy as np

#For find pattern in text
import re

#For visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")

#For processing textual data
from textblob import TextBlob

#For Tokenizing segments
from nltk.tokenize import word_tokenize

#For Stemming text
from nltk.stem import PorterStemmer

#For removing StopWords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

#For Plotting Words
from wordcloud import WordCloud

# Convert a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer

#To split data into train and test
from sklearn.model_selection import train_test_split

#For fitting model
from sklearn.linear_model import LogisticRegression

#For evaluation of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

#For Hyper-tuning model
from sklearn.model_selection import GridSearchCV


In [None]:
# Read the tweet CSV into a DataFrame and preview its first four rows.
# NOTE(review): the absolute /kaggle/input path presumably only resolves inside a Kaggle kernel.
df = pd.read_csv("/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv")
df.head(4)

In [None]:
# Summary of the raw frame: column dtypes and non-null counts.
df.info()

In [None]:
# List the column names so the ones to keep for the text analysis can be picked.
df.columns

In [None]:
# Extracting only the text attribute for analysis.
# Selecting the column directly (instead of dropping a hard-coded list of every
# other column) keeps the result to exactly one column even if the CSV schema
# changes; .copy() gives an independent frame that later cells can modify safely.
text_df = df[['text']].copy()
text_df.head()

In [None]:
# Show the first six raw tweets from Twitter, each followed by a blank line.
for row in range(6):
    print(text_df["text"].iloc[row], "\n")


<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Data Preprocessing</h1>

In [None]:
def data_processing(text):
    """Normalise one raw tweet into a space-joined string of content tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and the '#'
    sign (the hashtag word itself is kept), strip remaining non-word
    characters, tokenize with ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    text = text.lower()
    # Cover http:// as well as https:// links, plus bare www. links.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # '@\w+' removes the whole mention; the previous '\@w+' only matched a
    # literal "@w", so user names survived into the cleaned text.
    text = re.sub(r'@\w+|#', '', text)
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return " ".join(filtered_text)

In [None]:
# Clean every tweet with data_processing and write the result back to the "text" column.
text_df["text"] = text_df["text"].apply(data_processing)

In [None]:
# Keep a single row for each distinct cleaned tweet.
text_df = text_df.drop_duplicates(subset='text')

In [None]:
# Performing Stemming
stemmer = PorterStemmer()
def stemming(data):
    """Return `data` with each whitespace-separated word replaced by its Porter stem.

    Parameters
    ----------
    data : str
        Cleaned tweet text (tokens separated by spaces).

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Split into words first: iterating over the string itself would visit single
    # characters. The stemmed result is returned; previously it was computed and
    # then discarded, so the input came back unchanged.
    return " ".join(stemmer.stem(word) for word in data.split())

In [None]:
# Apply the stemming function to every cleaned tweet.
text_df["text"] = text_df["text"].apply(stemming)

In [None]:
# Show the first six processed tweets, each followed by a blank line.
for row in range(6):
    print(text_df["text"].iloc[row], "\n")

In [None]:
# Check how many tweets remain after processing.
print("Shape of data after processing:",text_df["text"].shape)

In [None]:
# Polarity score used to categorise each text.
def polarity(text):
    """Return the TextBlob sentiment polarity of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
# Score every processed tweet and store the result in a new "polarity" column.
text_df["polarity"] = text_df["text"].apply(polarity)
text_df.head(10)

In [None]:
# Turn a polarity score into the sentiment label stored in the data frame.
def sentiment(label):
    """Map a polarity score to a sentiment class name.

    Scores below zero give "Negative", exactly zero gives "Neutral" and
    scores above zero give "Positive". A value for which all three
    comparisons are false (e.g. NaN) yields None.
    """
    if label > 0:
        return "Positive"
    if label == 0:
        return "Neutral"
    if label < 0:
        return "Negative"
    return None

In [None]:
# Label every tweet from its polarity score and store it in a new "sentiment" column.
text_df['sentiment'] = text_df['polarity'].apply(sentiment)
text_df.head(10)

In [None]:
# Visualizing the sentiment: bar chart of the number of tweets per sentiment class.
fig = plt.figure(figsize=(7,5))
sns.countplot(x="sentiment",data=text_df)

In [None]:
# Pie chart of the share of tweets in each sentiment class.
fig = plt.figure(figsize=(7,7))
# Colours are keyed by label so each class keeps its colour whatever its rank in
# value_counts(); a positional tuple would colour wedges by frequency order instead.
palette = {"Positive": "yellowgreen", "Neutral": "gold", "Negative": "red"}
wp = {'linewidth':2, 'edgecolor':"black"}
tags = text_df['sentiment'].value_counts()
colors = [palette.get(label, "lightgrey") for label in tags.index]
# One offset per wedge actually present; a fixed 3-tuple fails if a class is missing.
explode = (0.1,) * len(tags)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors = colors,
         startangle=90, wedgeprops = wp, explode = explode, label='')
plt.title('Distribution of sentiments')

In [None]:
# Visualizing the top 5 positive tweets: positive rows, highest polarity first.
pos_tweets = (
    text_df.loc[text_df['sentiment'] == 'Positive']
    .sort_values(by='polarity', ascending=False)
)
pos_tweets.head()

In [None]:
# Word cloud built from all positive tweets concatenated into one string.
text = ' '.join(pos_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(
    max_words=500,
    width=1600,
    height=800,
).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.show()

In [None]:
# Visualizing negative tweets, most negative polarity first.
neg_tweets = text_df[text_df.sentiment == 'Negative']
# Ascending order puts the lowest polarity on top, so head() shows the strongest
# negative tweets; a descending sort would list the mildest ones instead.
neg_tweets = neg_tweets.sort_values(['polarity'], ascending=True)
neg_tweets.head()

In [None]:
# Word cloud built from all negative tweets concatenated into one string.
text = ' '.join(neg_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(
    max_words=500,
    width=1600,
    height=800,
).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in negative tweets', fontsize=19)
plt.show()

In [None]:
# Visualizing neutral tweets: rows labelled Neutral, sorted by polarity (descending).
neutral_tweets = (
    text_df.loc[text_df['sentiment'] == 'Neutral']
    .sort_values(by='polarity', ascending=False)
)
neutral_tweets.head()

In [None]:
# Word cloud built from all neutral tweets concatenated into one string.
text = ' '.join(neutral_tweets['text'])
plt.figure(figsize=(20,15), facecolor='None')
wordcloud = WordCloud(
    max_words=500,
    width=1600,
    height=800,
).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in neutral tweets', fontsize=19)
plt.show()


<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Vectorizing Data</h1>


In [None]:
# Fit a token-count vocabulary over unigrams and bigrams (ngram_range=(1,2)).
# NOTE(review): the vocabulary is fitted on every row, before the train/test split
# further down, so test-set tokens also shape the feature space — confirm this is intended.
vect = CountVectorizer(ngram_range=(1,2)).fit(text_df['text'])

In [None]:
# Inspect the fitted vocabulary: its size and its first 20 entries.
feature_names = vect.get_feature_names_out()
print("Number of features: {}\n".format(len(feature_names)))
print("First 20 features:\n {}".format(feature_names[:20]))


<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Model Development</h1>

In [None]:
# Separate the independent (X) and dependent (Y) variables, then transform X
# from text into the n-gram count matrix using the fitted vectorizer.
X = text_df['text']
Y = text_df['sentiment']
X = vect.transform(X)

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Check the shapes of the train and test splits.
print("Size of x_train:", (x_train.shape))
print("Size of y_train:", (y_train.shape))
print("Size of x_test:", (x_test.shape))
print("Size of y_test:", (y_test.shape))

In [None]:
import warnings
# Silences every warning from this point on (that includes any warning the
# model fit below might emit).
warnings.filterwarnings('ignore')

# Baseline: logistic regression with default hyperparameters.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
# accuracy_score takes (y_true, y_pred) — the same order used by the
# confusion_matrix and classification_report calls in this notebook.
logreg_acc = accuracy_score(y_test, logreg_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

In [None]:
# Confusion matrix followed by the per-class classification report for the baseline model.
print(confusion_matrix(y_test, logreg_pred))
print("\n")
print(classification_report(y_test, logreg_pred))

In [None]:
# Switch matplotlib to the 'classic' style, then plot the confusion matrix with
# rows and columns labelled in the model's own class order (logreg.classes_).
style.use('classic')
cm = confusion_matrix(y_test, logreg_pred, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels=logreg.classes_)
disp.plot()

<h1 class="alert alert-block alert-info" style="text-align:center; font-size:30px">Tuning Model</h1>

In [None]:
# Hyper-parameter tuning: grid-search over C for logistic regression on the training set.

param_grid={'C':[0.001, 0.01, 0.1, 1, 10]}                  # Candidate values for C (inverse regularisation strength), spaced by powers of ten
grid = GridSearchCV(LogisticRegression(), param_grid)
grid.fit(x_train, y_train)

In [None]:
# Show the value of C that the grid search selected.
print("Best parameters:", grid.best_params_)

In [None]:
# Score the grid-searched model on the held-out test set.
y_pred = grid.predict(x_test)
# accuracy_score takes (y_true, y_pred).
logreg_acc = accuracy_score(y_test, y_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

# An increase in accuracy over the untuned model shows the effect of hyperparameter tuning.

<div style="border-radius:10px;border:black solid;padding: 15px;background-color:lightgreen;font-size:110%;text-align:left">
<div style="font-family:Georgia;background-color:#DEB887; padding:30px; font-size:25px">

<h1 style="color:black;font-size:20px;font-family:Georgia;text-align:center;">👨‍💻<strong>Thank you for Joining, Happy Kaggling</strong>👨‍💻</h1>
</div>
</div>