<a href="https://colab.research.google.com/github/harshavarma02/SENTIMENT-ANALYSIS-OF-INDIAN-POLITICAL-TWEETS/blob/main/SENTIMENT_ANALYSIS_OF_INDIAN_POLITICAL_TWEETS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SENTIMENT ANALYSIS OF INDIAN POLITICAL TWEETS**



## 📋 Table of Contents
* Installing Necessary Libraries
* Loading the Dataset
* Dataset Checking to perform
* Data Preprocessing
* Sentiment Analysis


<a id="section-one"></a>
# 🛠️Installing Necessary Libraries

In [1]:
!pip install better-profanity
!pip install textblob

Collecting better-profanity
  Downloading better_profanity-0.7.0-py3-none-any.whl.metadata (7.1 kB)
Downloading better_profanity-0.7.0-py3-none-any.whl (46 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: better-profanity
Successfully installed better-profanity-0.7.0


In [8]:
# Import Basic Libraries
import re
import os
import pandas as pd
import numpy as np
from datetime import datetime

from better_profanity import profanity
from textblob import TextBlob

# Import Visualization Libraries
import plotly.express as px
import plotly.graph_objs as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Import NLP Libraries
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus (read later through stopwords.words)
nltk.download('stopwords')

# Remove distracting warnings
# NOTE(review): no category is passed, so every warning raised after this cell
# is silenced, including deprecation notices — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<a id="section-three"></a>
# 📋Loading the Dataset

In [9]:
# Location of the tweets CSV (absolute path under /content)
tweets_data_path = '/content/tweets.csv'

# Load the whole file into the working DataFrame
df = pd.read_csv(filepath_or_buffer=tweets_data_path)

# Report the (rows, columns) dimensions
print("Data Shape is:", df.shape)

# Heading followed by a plain-text preview of the first ten rows
print("\nShow Top 10 Records", df.head(10), sep="\n")

Data Shape is: (50001, 6)

Show Top 10 Records
  Unnamed: 0                       Date             User  \
0          0  2023-03-29 15:42:36+00:00      AnandPatni8   
1          1  2023-03-29 15:42:05+00:00         dhinamum   
2          2  2023-03-29 15:34:29+00:00     PrincetonCGI   
3          3  2023-03-29 15:31:43+00:00     RishiJoeSanu   
4          4  2023-03-29 15:26:48+00:00     itweetsensee   
5          5  2023-03-29 15:21:29+00:00  FreeMindKeenEye   
6          6  2023-03-29 15:20:41+00:00     SamsSamsson6   
7          7  2023-03-29 15:20:16+00:00     SamsSamsson6   
8          8  2023-03-29 15:19:44+00:00     SamsSamsson6   
9          9  2023-03-29 15:19:32+00:00       ncsukumar1   

                                               Tweet  Likes  Retweets  
0  @vinodkapri @RahulGandhi Respected Indian Citi...    0.0       0.0  
1  *Respected Indian Citizens,* Namaskaar I Am Th...    0.0       0.0  
2  1/n-Meet Filmmaker Prakash Jha in New Jersey t...    0.0       0.0  
3  @

<a id="section-four"></a>
# ✔ Dataset Checking to perform

In [10]:
# Per-column count of missing entries
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Date,0
User,0
Tweet,1
Likes,2
Retweets,2


In [11]:
# Discard every row that has at least one missing value (mutates df in place).
# The surviving rows keep their original index labels; the index is not renumbered.
df.dropna(axis=0, how='any', inplace=True)

In [12]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

0

In [13]:
# Inspect each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49999 entries, 0 to 50000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  49999 non-null  object 
 1   Date        49999 non-null  object 
 2   User        49999 non-null  object 
 3   Tweet       49999 non-null  object 
 4   Likes       49999 non-null  float64
 5   Retweets    49999 non-null  float64
dtypes: float64(2), object(4)
memory usage: 2.7+ MB


In [14]:
# Distinct (non-missing) values per column
df.nunique(axis=0, dropna=True)

Unnamed: 0,0
Unnamed: 0,49999
Date,49820
User,35108
Tweet,49568
Likes,598
Retweets,312


In [15]:
# Split the column names by dtype: object ('O') columns are reported as
# categorical, every other dtype as numerical.
numeric_columns = [name for name, kind in df.dtypes.items() if kind != 'O']
categorical_columns = [name for name, kind in df.dtypes.items() if kind == 'O']

# Report both groups with their sizes
print('We have {} numerical columns(features) : {}'.format(len(numeric_columns), numeric_columns))
print('\nWe have {} categorical columns(features) : {}'.format(len(categorical_columns), categorical_columns))

We have 2 numerical columns(features) : ['Likes', 'Retweets']

We have 4 categorical columns(features) : ['Unnamed: 0', 'Date', 'User', 'Tweet']


<a id="section-five"></a>
# 🏗️Data Preprocessing

In [16]:
# Preserve the raw text before 'Tweet' is overwritten by the cleaning step
df['Original_Tweet'] = df['Tweet']

# Re-home the timestamp under 'DateTime' and retire the old 'Date' column
df['DateTime'] = df['Date']
df = df.drop(columns='Date')

# Keep only the text before the first '+' (e.g. drops a trailing '+00:00')
df['DateTime'] = df['DateTime'].astype(str).apply(lambda stamp: stamp.partition('+')[0])

# Parse with a fixed format; anything that does not match becomes NaT and is
# then replaced by the 1900-01-01 placeholder timestamp
df['DateTime'] = pd.to_datetime(
    df['DateTime'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
).fillna(pd.Timestamp('1900-01-01'))

# Break each timestamp into separate calendar columns
df['date'] = df['DateTime'].apply(lambda ts: ts.date())
df['month'] = df['DateTime'].apply(lambda ts: ts.month)
df['year'] = df['DateTime'].apply(lambda ts: ts.year)
df['hour'] = df['DateTime'].apply(lambda ts: ts.hour)
df.head()

Unnamed: 0.1,Unnamed: 0,User,Tweet,Likes,Retweets,Original_Tweet,DateTime,date,month,year,hour
0,0,AnandPatni8,@vinodkapri @RahulGandhi Respected Indian Citi...,0.0,0.0,@vinodkapri @RahulGandhi Respected Indian Citi...,2023-03-29 15:42:36,2023-03-29,3,2023,15
1,1,dhinamum,"*Respected Indian Citizens,* Namaskaar I Am Th...",0.0,0.0,"*Respected Indian Citizens,* Namaskaar I Am Th...",2023-03-29 15:42:05,2023-03-29,3,2023,15
2,2,PrincetonCGI,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,0.0,0.0,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,2023-03-29 15:34:29,2023-03-29,3,2023,15
3,3,RishiJoeSanu,@MrinalWahal Why would politicians stop using ...,0.0,0.0,@MrinalWahal Why would politicians stop using ...,2023-03-29 15:31:43,2023-03-29,3,2023,15
4,4,itweetsensee,@annamalai_k @narendramodi A state level presi...,0.0,0.0,@annamalai_k @narendramodi A state level presi...,2023-03-29 15:26:48,2023-03-29,3,2023,15


In [17]:
# Tweet cleaning: censor profanity, strip handles / hashtags / URLs / punctuation,
# drop stray single letters and remove English stop words.


# define the stopwords list (kept as a list for any other code that reads `stop_words`)
stop_words = stopwords.words('english')

# Load the default profanity word list once here instead of on every call:
# the list does not change between tweets, so reloading it per tweet is wasted work.
profanity.load_censor_words()


def clean_tweet(tweet):
    """Return a lower-cased, de-noised version of ``tweet``.

    Steps, in order: lower-case, censor profanity, strip @handles, #hashtags
    and URLs, replace non-word characters with spaces, drop whitespace-delimited
    single letters, collapse whitespace, remove English stop words.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example NaN from a missing
        cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Nothing to clean for missing / non-text values
    if not isinstance(tweet, str):
        return ''

    # convert to lower case
    tweet = tweet.lower()

    # censor profanity (word list already loaded above)
    tweet = profanity.censor(tweet)

    # remove twitter handles (raw string: '\s' is not a valid escape in a plain literal)
    tweet = re.sub(r'@[^\s]+', '', tweet)

    # remove hashtags
    tweet = re.sub(r'\B#\S+', '', tweet)

    # remove urls
    tweet = re.sub(r"http\S+", "", tweet)

    # replace special characters and punctuation with spaces
    tweet = re.sub(r'\W', ' ', tweet)

    # drop whitespace-delimited single letters; the class covers every letter except 'i'
    tweet = re.sub(r'\s+[a-hj-z]\s+', ' ', tweet)
    # a lone 'i' is upper-cased; the recorded output shows 'I' then survives the
    # stop-word filter below
    tweet = re.sub(r'\s+i\s+', ' I ', tweet)
    # (The former `' a '` substitution only rewrote the whitespace around 'a',
    # which the next step normalises anyway, so it had no effect on the result.)

    # substitute multiple spaces with single space
    tweet = re.sub(r'\s+', ' ', tweet)

    # remove stop words; a set gives constant-time membership tests and is built
    # per call so later edits to `stop_words` are still honoured
    blocked = set(stop_words)
    return ' '.join(word for word in tweet.split() if word not in blocked)



In [18]:
# Optional: uncomment ONE of the two lines below to work on a subset of the rows.
# NOTE(review): sample(..., replace=True) can return the same row more than once,
# and either option leaves df with fewer rows than the loaded file — keep both
# commented out for a full run.
# df = df.sample(n=1000, replace=True)
# df=df.head(100)

# df.shape

In [None]:
# Overwrite 'Tweet' with its cleaned form, one row at a time, in row order
df['Tweet'] = [clean_tweet(raw_text) for raw_text in df['Tweet']]

In [None]:
# Preview the first rows: 'Tweet' now holds the cleaned text, 'Original_Tweet' the raw text
df.head()

Unnamed: 0.1,Unnamed: 0,User,Tweet,Likes,Retweets,Original_Tweet,DateTime,date,month,year,hour
0,0,AnandPatni8,respected indian citizens namaskaar I original...,0.0,0.0,@vinodkapri @RahulGandhi Respected Indian Citi...,2023-03-29 15:42:36,2023-03-29,3,2023,15
1,1,dhinamum,respected indian citizens namaskaar I original...,0.0,0.0,"*Respected Indian Citizens,* Namaskaar I Am Th...",2023-03-29 15:42:05,2023-03-29,3,2023,15
2,2,PrincetonCGI,1 meet filmmaker prakash jha new jersey talkin...,0.0,0.0,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,2023-03-29 15:34:29,2023-03-29,3,2023,15
3,3,RishiJoeSanu,would politicians stop using religion politics...,0.0,0.0,@MrinalWahal Why would politicians stop using ...,2023-03-29 15:31:43,2023-03-29,3,2023,15
4,4,itweetsensee,state level president knows policy pm union mi...,0.0,0.0,@annamalai_k @narendramodi A state level presi...,2023-03-29 15:26:48,2023-03-29,3,2023,15


<a id="section-six"></a>
# 😐Sentiment Analysis

In [None]:
# Wrap every cleaned tweet in a TextBlob so its sentiment can be read
sentiment_objects = [TextBlob(tweet) for tweet in df['Tweet']]

# Pair each polarity score with the text it was computed from
sentiment_values = [[blob.sentiment.polarity, str(blob)] for blob in sentiment_objects]

# Build the polarity table on df's own index. df no longer has a contiguous
# 0..n-1 index once rows have been dropped (or sampled), so a frame with a
# default RangeIndex would be matched to df by label and attach scores to the
# wrong tweets, leaving NaN where no label matches.
sentiment_df = pd.DataFrame(
    sentiment_values,
    columns=["polarity", "tweet"],
    index=df.index,
)

# Assign by position: the scores were produced in df's row order, so this
# stays correct even if the index contains gaps or repeated labels.
df['Polarity'] = sentiment_df['polarity'].to_numpy()

In [None]:
# Label each row by the sign of its polarity: above zero -> Positive,
# below zero -> Negative, anything else -> Neutral.
df['Sentiment'] = np.select(
    [df['Polarity'] > 0, df['Polarity'] < 0],
    ['Positive', 'Negative'],
    default='Neutral',
)

In [None]:
# Preview the first rows with the new 'Polarity' and 'Sentiment' columns
df.head()

Unnamed: 0.1,Unnamed: 0,User,Tweet,Likes,Retweets,Original_Tweet,DateTime,date,month,year,hour,Polarity,Sentiment
0,0,AnandPatni8,respected indian citizens namaskaar I original...,0.0,0.0,@vinodkapri @RahulGandhi Respected Indian Citi...,2023-03-29 15:42:36,2023-03-29,3,2023,15,-0.0625,Negative
1,1,dhinamum,respected indian citizens namaskaar I original...,0.0,0.0,"*Respected Indian Citizens,* Namaskaar I Am Th...",2023-03-29 15:42:05,2023-03-29,3,2023,15,-0.0625,Negative
2,2,PrincetonCGI,1 meet filmmaker prakash jha new jersey talkin...,0.0,0.0,1/n-Meet Filmmaker Prakash Jha in New Jersey t...,2023-03-29 15:34:29,2023-03-29,3,2023,15,0.173232,Positive
3,3,RishiJoeSanu,would politicians stop using religion politics...,0.0,0.0,@MrinalWahal Why would politicians stop using ...,2023-03-29 15:31:43,2023-03-29,3,2023,15,0.0,Neutral
4,4,itweetsensee,state level president knows policy pm union mi...,0.0,0.0,@annamalai_k @narendramodi A state level presi...,2023-03-29 15:26:48,2023-03-29,3,2023,15,0.214286,Positive


In [None]:
# Count the number of occurrences for each sentiment
sentiment_counts = df['Sentiment'].value_counts()

# Percentage of rows carrying each label.
# - .get(label, 0) keeps the cell working when a label never occurs
#   (plain indexing raises KeyError for an absent label).
# - The denominator is clamped to 1 so an empty frame reports 0.00 instead of
#   dividing by zero.
# NOTE(review): the denominator is the number of rows in df, and a user can
# appear in several rows, so these are shares of tweets rather than of
# distinct users despite the wording of the messages.
total_rows = max(len(df), 1)
positive_percent = sentiment_counts.get('Positive', 0) / total_rows * 100
negative_percent = sentiment_counts.get('Negative', 0) / total_rows * 100
neutral_percent = sentiment_counts.get('Neutral', 0) / total_rows * 100

print("%.2f percent of twitter users feel positive." % positive_percent)

print("%.2f percent of twitter users feel negative." % negative_percent)

print("%.2f percent of twitter users feel neutral." % neutral_percent)

32.00 percent of twitter users feel positive.
36.00 percent of twitter users feel negative.
32.00 percent of twitter users feel neutral.
