In [2]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

df = pd.read_csv(r"C:\Users\krupe\OneDrive\Desktop\Sentiment\vaccination_all_tweets.csv")

#Data Preprocessing
text_df = df.drop(['id', 'user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'hashtags', 'source', 'retweets', 'favorites',
       'is_retweet'], axis=1)

#Cleansing strings
def data_processing(text):
    text = text.lower()
    text = re.sub(r"https\S+|www\S+https\S+", '',text, flags=re.MULTILINE)
    text = re.sub(r'\@w+|\#','',text)
    text = re.sub(r'[^\w\s]','',text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)
text_df.text = text_df['text'].apply(data_processing)

#Stemming Words
stmpred = PorterStemmer()
def stemming_frame(data):
    l = len(data)
    for i in range(0,l,1):
        txt = []
        list = data[i].split(" ")
    
    
        for w in list:
            a = stmpred.stem(w)
            txt.append(a)
            textprocessed = ' '.join(map(str,txt))
        data[i] = textprocessed
stemming_frame(text_df['text'])
    
#Polarity of Strings
def polarity(text):
    return TextBlob(text).sentiment.polarity
text_df['polarity'] = text_df['text'].apply(polarity)

#Sentiment Labelling
def sentiment(label):
    if label <0:
        return "Negative"
    elif label ==0:
        return "Neutral"
    elif label>0:
        return "Positive"
text_df['sentiment'] = text_df['polarity'].apply(sentiment)

#Dropping Duplicate Tweets
text_df = text_df.drop_duplicates('text')

#Dropping Polarity
text_df = text_df.drop(['polarity'],axis=1)

#Save Processed Dataframe to CSV

text_df.to_csv(r"C:\Users\krupe\OneDrive\Desktop\Sentiment\Final.csv")
print("done")


done


In [50]:
df = pd.read_csv(r"C:\Users\krupe\OneDrive\Desktop\Sentiment\Final.csv")

In [51]:
vectoriser = TfidfVectorizer(ngram_range=(1,2)).fit(df['text'])

In [52]:
feature_names =vectoriser.get_feature_names_out()
print("Number of features: {}\n".format(len(feature_names)))
print("First 20 features:\n {}".format(feature_names[:20]))

Number of features: 803546

First 20 features:
 ['00' '00 dose' '000' '000 00' '000 000the' '000 717' '000 belgium'
 '000 covid19' '000 death' '000 donat' '000 dose' '000 first' '000 half'
 '000 health' '000 initi' '000 peopl' '000 purchas' '000 sha' '000 share'
 '000 sinopharm']


In [53]:
X = df['text']
Y = df['sentiment']
X = vectoriser.transform(X)

In [54]:
print(df['text'])
print(X)

0         folk said daikon past could treat cytokin stor...
1         world wrong side histori year hope biggest vac...
2         coronaviru sputnikv astrazeneca pfizerbiontech...
3         fact immut senat even your ethic sturdi enough...
4         explain need vaccin borisjohnson matthancock w...
                                ...                        
211525    45 urban bengaluru covidvaccin avail 1511 0230...
211526    1844 bbmp bengaluru covidvaccin avail 1511 020...
211527    1844 urban bengaluru covidvaccin avail 1511 02...
211528    promot vaccin leav stronger russia vaccin sput...
211529    45 urban bengaluru covidvaccin avail 1511 0130...
Name: text, Length: 211530, dtype: object
  (0, 724810)	0.287984170255165
  (0, 724788)	0.19233937752503638
  (0, 679565)	0.287984170255165
  (0, 679555)	0.22921529381984124
  (0, 618999)	0.287984170255165
  (0, 618772)	0.12527791376881747
  (0, 547348)	0.10173962790381616
  (0, 537995)	0.27869403407063364
  (0, 537957)	0.1726505377312020

In [55]:
import warnings
a=warnings.filterwarnings('ignore')

In [60]:
predict_ip = "eating sour is #harmful during covid-19"
predf = pd.DataFrame()
predf['text'] = [predict_ip,"covid19 sucks job gone","depression while seeing people die","happy to be with family at lockdown time"]

predf.text = predf['text'].apply(data_processing)

stemming_frame(predf['text'])

predf['polarity'] = predf['text'].apply(polarity)

predf['sentiment'] = predf['polarity'].apply(sentiment)

predf = predf.drop_duplicates('text')
print(predf)

vectoriser1 = TfidfVectorizer(ngram_range=(1,2)).fit(predf['text'])
print('No. of feature_words: ', len(vectoriser1.get_feature_names()))


Xpred = predf['text']
Ypred = predf['sentiment']
Xpred = vectoriser1.transform(Xpred)

print(Xpred)

                         text  polarity sentiment
0       eat sour harm covid19 -0.150000  Negative
1       covid19 suck job gone  0.000000   Neutral
2       depress see peopl die -0.066667  Negative
3  happi famili lockdown time  0.000000   Neutral
No. of feature_words:  27
  (0, 23)	0.388614292631317
  (0, 22)	0.388614292631317
  (0, 13)	0.388614292631317
  (0, 12)	0.388614292631317
  (0, 6)	0.388614292631317
  (0, 5)	0.388614292631317
  (0, 0)	0.3063879719831814
  (1, 25)	0.388614292631317
  (1, 24)	0.388614292631317
  (1, 15)	0.388614292631317
  (1, 14)	0.388614292631317
  (1, 9)	0.388614292631317
  (1, 1)	0.388614292631317
  (1, 0)	0.3063879719831814
  (2, 21)	0.3779644730092272
  (2, 20)	0.3779644730092272
  (2, 19)	0.3779644730092272
  (2, 18)	0.3779644730092272
  (2, 4)	0.3779644730092272
  (2, 3)	0.3779644730092272
  (2, 2)	0.3779644730092272
  (3, 26)	0.3779644730092272
  (3, 17)	0.3779644730092272
  (3, 16)	0.3779644730092272
  (3, 11)	0.3779644730092272
  (3, 10)	0.37796447

In [63]:
feature_names1 =vectoriser1.get_feature_names_out()
print("Number of features: {}\n".format(len(feature_names1)))
print("features:\n {}".format(feature_names1[:]))

Number of features: 27

features:
 ['covid19' 'covid19 suck' 'depress' 'depress see' 'die' 'eat' 'eat sour'
 'famili' 'famili lockdown' 'gone' 'happi' 'happi famili' 'harm'
 'harm covid19' 'job' 'job gone' 'lockdown' 'lockdown time' 'peopl'
 'peopl die' 'see' 'see peopl' 'sour' 'sour harm' 'suck' 'suck job' 'time']


In [98]:
print(X.shape,Y.shape,Xpred.shape)

(211530, 803546) (211530,) (4, 27)


In [62]:
logreg = LogisticRegression()
logreg.fit(X, Y)
logreg_pred = logreg.predict(Xpred)
logreg_acc = accuracy_score(logreg_pred, ypred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

ValueError: X has 27 features, but LogisticRegression is expecting 803546 features as input.

In [None]:
from sklearn.svm import LinearSVC

In [None]:
SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)

In [None]:
svc_pred = SVCmodel.predict(x_test)
svc_acc = accuracy_score(svc_pred, y_test)
print("test accuracy: {:.2f}%".format(svc_acc*100))