# Importing Dataset

In [None]:
import pandas as pd

This project focuses on Natural Language Processing (NLP), specifically on cleaning and preparing text data for sentiment analysis related to the Corona pandemic. I have experience with several NLP tasks, including:

*   **Text Preprocessing:** Cleaning raw text data by removing unwanted characters, converting to lowercase, handling punctuation, and removing stop words and emojis. This project utilizes libraries like `re`, `string`, `textblob`, and `nltk` for these tasks.
*   **Feature Extraction:** Converting text data into numerical representations using techniques like TF-IDF, implemented using `sklearn.feature_extraction.text.TfidfVectorizer`.
*   **Sentiment Analysis:** Building and evaluating machine learning models (Logistic Regression, SVM, Decision Tree, KNN, Naive Bayes, etc.) to classify the sentiment of text. This project uses models from `sklearn.linear_model`, `sklearn.tree`, `sklearn.neighbors`, `sklearn.svm`, and `sklearn.naive_bayes`.

In [None]:
# Read the tweet CSV into a DataFrame.
# NOTE(review): this is an absolute path on a local D: drive; anyone else
# running the notebook has to edit it to point at their own copy of the file.
df = pd.read_csv(r'D:\My Drive\Corona_NLP_test.csv')
# Show the loaded frame as the cell output.
df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


# Cleaning Dataset

In [None]:
# Keep only the tweet text and its label.  Selecting the wanted columns into a
# fresh frame (instead of dropping the others in place) makes this cell safe to
# re-run: an in-place drop raises KeyError the second time because the columns
# are already gone.  `.copy()` avoids chained-assignment warnings when the
# text column is overwritten in later cells.
df = df[['OriginalTweet', 'Sentiment']].copy()
df

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  3798 non-null   object
 1   Sentiment      3798 non-null   object
dtypes: object(2)
memory usage: 59.5+ KB


In [None]:
df.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [None]:
df.duplicated().sum()

0

In [None]:
df['Sentiment'].value_counts()

Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: Sentiment, dtype: int64

# Convert into Lower Case

In [None]:
df['OriginalTweet']=df['OriginalTweet'].str.lower()

In [None]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,trending: new yorkers encounter empty supermar...,Extremely Negative
1,when i couldn't find hand sanitizer at fred me...,Positive
2,find out how you can protect yourself and love...,Extremely Positive
3,#panic buying hits #newyork city as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


# Remove Unwanted Data Such as HTML Tags

In [None]:
import re
def remove_html_tags(text):
    """Return ``text`` (coerced to ``str``) with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.  ``.`` does not match a newline, so a tag that
    spans several lines is left in place.
    """
    return re.sub('<.*?>', '', str(text))

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(remove_html_tags)

In [None]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,trending: new yorkers encounter empty supermar...,Extremely Negative
1,when i couldn't find hand sanitizer at fred me...,Positive
2,find out how you can protect yourself and love...,Extremely Positive
3,#panic buying hits #newyork city as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


# Remove URLs

In [None]:
def remove_url(text):
    """Return ``text`` (coerced to ``str``) with web addresses removed.

    Strips anything that starts with ``http://``, ``https://`` or ``www.`` and
    runs up to the next whitespace character.

    The previous pattern was ``http?://``, which makes the final "p" optional
    (matching ``htt://``) and never matches ``https://`` links, so secure URLs
    were left in the text.
    """
    # `s?` makes the scheme match both http and https.
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', str(text))

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(remove_url)

In [None]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,trending: new yorkers encounter empty supermar...,Extremely Negative
1,when i couldn't find hand sanitizer at fred me...,Positive
2,find out how you can protect yourself and love...,Extremely Positive
3,#panic buying hits #newyork city as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


# Remove Punctuation (``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``)

In [None]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
exclude = string.punctuation

In [None]:
def remove_punc(text):
    """Delete from ``text`` every character listed in the module-level ``exclude``."""
    # A translation table whose values are None removes those characters.
    deletion_table = str.maketrans(dict.fromkeys(exclude))
    return text.translate(deletion_table)

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(remove_punc)

In [None]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,trending new yorkers encounter empty supermark...,Extremely Negative
1,when i couldnt find hand sanitizer at fred mey...,Positive
2,find out how you can protect yourself and love...,Extremely Positive
3,panic buying hits newyork city as anxious shop...,Negative
4,toiletpaper dunnypaper coronavirus coronavirus...,Neutral


# Correct Spelling (e.g. maner → manner); requires `pip install textblob`

In [None]:
from textblob import TextBlob

In [None]:
def correct_spelling(text):
    """Run TextBlob's spelling correction over ``text`` and return a plain string.

    NOTE(review): the displayed results show the corrector rewriting valid
    tokens as well (e.g. "trending" -> "treading", "newyork" -> "network"), so
    check whether this step helps before relying on it.
    """
    corrected_blob = TextBlob(text).correct()
    return corrected_blob.string

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(correct_spelling)

In [None]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,treading new yorkers encounter empty supermark...,Extremely Negative
1,when i couldn find hand sanitizer at fred meye...,Positive
2,find out how you can protect yourself and love...,Extremely Positive
3,panic buying hits network city as anxious ship...,Negative
4,toiletpaper dunnypaper coronavirus coronavirus...,Neutral
...,...,...
3793,meanwhile in a supermarket in israel people d...,Positive
3794,did you panic buy a lot of nonperishable items...,Negative
3795,asset prof of economics chances was on nbcphil...,Neutral
3796,go need to do something instead of bear je rak...,Extremely Negative


# Remove Stop Words (a, the, etc.); requires `pip install nltk`

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) # Choose the language for stopwords

In [None]:
def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, every token found in the module-level
    ``stop_words`` set is discarded, and the remaining tokens are re-joined
    with single spaces.

    The lookup uses the prebuilt ``stop_words`` set rather than calling
    ``stopwords.words('english')`` for every token, which rebuilt the full
    word list and scanned it linearly on each membership test.
    """
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(remove_stopwords)

In [None]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,treading new yorkers encounter empty supermark...,Extremely Negative
1,find hand sanitizer fred meyer turned amazon 1...,Positive
2,find protect loved ones coronavirus,Extremely Positive
3,panic buying hits network city anxious shipper...,Negative
4,toiletpaper dunnypaper coronavirus coronavirus...,Neutral
...,...,...
3793,meanwhile supermarket israel people dance sing...,Positive
3794,panic buy lot nonperishable items echo needs f...,Negative
3795,asset prof economics chances nbcphiladelphia t...,Neutral
3796,go need something instead bear je rakyat assum...,Extremely Negative


# Remove Emojis

In [None]:
import re
# Character class of everything `remove_emoji` strips, compiled once here
# instead of being rebuilt on every call.  No flag is passed because str
# patterns are already Unicode-aware in Python 3.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flag pairs)
    "\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE(review): the next range and the supplementary-plane range further
    # down are far wider than emoji.  Together they remove every character at
    # or above U+24C2, which includes CJK, Hangul and other non-Latin scripts.
    # Kept as-is so the cleaned text does not change.
    "\U000024C2-\U0001F251"
    "\U0001F926-\U0001F937"  # gesture / person emoji
    "\U00010000-\U0010FFFF"  # all supplementary planes
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc symbols through large circle
    "\u200d"                 # zero width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji and other high-code-point symbols removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted; nothing
    is inserted in its place, so surrounding whitespace is left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(remove_emoji)

In [None]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,treading new yorkers encounter empty supermark...,Extremely Negative
1,find hand sanitizer fred meyer turned amazon 1...,Positive
2,find protect loved ones coronavirus,Extremely Positive
3,panic buying hits network city anxious shipper...,Negative
4,toiletpaper dunnypaper coronavirus coronavirus...,Neutral


# Stemming Words (likes, liked, likely, liking → like)

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
# Shared stemmer instance used by `stem_words`.
ps = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated token of ``text`` and re-join with single spaces."""
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [None]:
df['OriginalTweet']=df['OriginalTweet'].apply(stem_words)

In [None]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,tread new yorker encount empti supermarket she...,Extremely Negative
1,find hand sanit fred meyer turn amazon 11497 2...,Positive
2,find protect love one coronaviru,Extremely Positive
3,panic buy hit network citi anxiou shipper stoc...,Negative
4,toiletpap dunnypap coronaviru coronavirusaustr...,Neutral
...,...,...
3793,meanwhil supermarket israel peopl danc sing to...,Positive
3794,panic buy lot nonperish item echo need food do...,Negative
3795,asset prof econom chanc nbcphiladelphia talk r...,Neutral
3796,go need someth instead bear je rakyat assum lo...,Extremely Negative


# Apply TF-IDF (Term Frequency–Inverse Document Frequency) Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer()

In [None]:
bow = tfidf.fit_transform(df['OriginalTweet'])

In [None]:
bow

<3798x9971 sparse matrix of type '<class 'numpy.float64'>'
	with 70249 stored elements in Compressed Sparse Row format>

In [None]:
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[8.54934597 7.85619879 8.54934597 ... 8.54934597 8.54934597 8.54934597]
['00' '000' '0095' ... 'zásobi' 'zásobováni' 'zástupc']


In [None]:
print(bow[0].toarray())

[[0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Densify the TF-IDF matrix that was already fitted into `bow`; calling
# fit_transform a second time on the same column only repeats the work and
# yields the same values.
matrix = bow.toarray()
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
# Replace the sentiment strings with integer codes.
# NOTE(review): the codes follow sorted label order (Extremely Negative=0,
# Extremely Positive=1, Negative=2, Neutral=3, Positive=4, as the displayed
# frame shows), so they are class ids rather than a sentiment scale.  The
# regression models fitted on them later treat them as ordered numbers.
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [None]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,tread new yorker encount empti supermarket she...,0
1,find hand sanit fred meyer turn amazon 11497 2...,4
2,find protect love one coronaviru,1
3,panic buy hit network citi anxiou shipper stoc...,2
4,toiletpap dunnypap coronaviru coronavirusaustr...,3
...,...,...
3793,meanwhil supermarket israel peopl danc sing to...,4
3794,panic buy lot nonperish item echo need food do...,2
3795,asset prof econom chanc nbcphiladelphia talk r...,3
3796,go need someth instead bear je rakyat assum lo...,0


# Defining X & y for Training and Testing Dataset

In [None]:
# Feature matrix: the dense TF-IDF array.
X = matrix
# Target as a 1-D Series.  The double-bracket form `df[['Sentiment']]` yields a
# one-column DataFrame, which makes scikit-learn emit DataConversionWarning
# ("y = column_or_1d(y, warn=True)") on every classifier fit.
y = df['Sentiment']

# Split Dataset into Training & Testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [None]:
model = lr.fit(X_train,y_train)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

In [None]:
print('R2_Score: ',r2_score(y_test,y_pred))

R2_Score:  -0.3177652859230333


# RIDGE

In [None]:
from sklearn.linear_model import Ridge

In [None]:
r=Ridge(alpha=0.0001)

In [None]:
r.fit(X_train,y_train)

In [None]:
y_pred = r.predict(X_test)

In [None]:
print('R2_Score: ',r2_score(y_test,y_pred))

R2_Score:  -0.31650975215405164


# LASSO

In [None]:
from sklearn.linear_model import Lasso

In [None]:
l = Lasso(alpha=0.01)

In [None]:
l.fit(X_train,y_train)

In [None]:
y_pred = l.predict(X_test)

In [None]:
print('R2_Score: ',r2_score(y_test,y_pred))

R2_Score:  0.0103906824970732


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)

In [None]:
classifier.fit(X_train,y_train)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions.  With the arguments swapped the matrix comes out transposed.
confusion_matrix(y_test, y_pred)

array([[32, 10, 32,  5, 28],
       [ 9, 49, 14,  8, 33],
       [43, 22, 72, 37, 55],
       [14,  7, 28, 42, 24],
       [26, 32, 45, 33, 60]], dtype=int64)

In [None]:
# Arguments in (y_true, y_pred) order to match the scikit-learn signature;
# accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.3355263157894737

In [None]:
# Ground truth first: with the arguments swapped, precision and recall are
# exchanged and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.26      0.30      0.28       107
           1       0.41      0.43      0.42       113
           2       0.38      0.31      0.34       229
           3       0.34      0.37      0.35       115
           4       0.30      0.31      0.30       196

    accuracy                           0.34       760
   macro avg       0.34      0.34      0.34       760
weighted avg       0.34      0.34      0.34       760



# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lgr = LogisticRegression()

In [None]:
lgr.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [None]:
y_pred = lgr.predict(X_test)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions.  With the arguments swapped the matrix comes out transposed.
confusion_matrix(y_test, y_pred)

array([[ 24,   3,  13,   2,   3],
       [  2,  31,   6,   2,  16],
       [ 84,  26, 118,  53,  84],
       [  2,   3,   8,  30,  10],
       [ 12,  57,  46,  38,  87]], dtype=int64)

In [None]:
# Arguments in (y_true, y_pred) order to match the scikit-learn signature;
# accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.3815789473684211

In [None]:
# Ground truth first: with the arguments swapped, precision and recall are
# exchanged and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.19      0.53      0.28        45
           1       0.26      0.54      0.35        57
           2       0.62      0.32      0.42       365
           3       0.24      0.57      0.34        53
           4       0.43      0.36      0.40       240

    accuracy                           0.38       760
   macro avg       0.35      0.47      0.36       760
weighted avg       0.48      0.38      0.40       760



# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
nc = KNeighborsClassifier(n_neighbors=3)

In [None]:
nc.fit(X_train,y_train)

  return self._fit(X, y)


In [None]:
y_pred = nc.predict(X_test)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions.  With the arguments swapped the matrix comes out transposed.
confusion_matrix(y_test, y_pred)

array([[52, 26, 59, 28, 49],
       [13, 41, 35, 17, 46],
       [34, 29, 59, 33, 55],
       [16, 12, 16, 27, 15],
       [ 9, 12, 22, 20, 35]], dtype=int64)

In [None]:
# Arguments in (y_true, y_pred) order to match the scikit-learn signature;
# accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.28157894736842104

In [None]:
# Ground truth first: with the arguments swapped, precision and recall are
# exchanged and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.42      0.24      0.31       214
           1       0.34      0.27      0.30       152
           2       0.31      0.28      0.29       210
           3       0.22      0.31      0.26        86
           4       0.17      0.36      0.23        98

    accuracy                           0.28       760
   macro avg       0.29      0.29      0.28       760
weighted avg       0.32      0.28      0.29       760



# SVC

In [None]:
from sklearn.svm import SVC

In [None]:
classifier_svc = SVC(kernel='linear')

In [None]:
classifier_svc.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [None]:
y_pred = classifier_svc.predict(X_test)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions.  With the arguments swapped the matrix comes out transposed.
confusion_matrix(y_test, y_pred)

array([[ 29,   3,  10,   1,   3],
       [  2,  43,   6,   1,  21],
       [ 78,  23, 121,  46,  75],
       [  4,   1,  10,  37,  15],
       [ 11,  50,  44,  40,  86]], dtype=int64)

In [None]:
# Arguments in (y_true, y_pred) order to match the scikit-learn signature;
# accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.41578947368421054

In [None]:
# Ground truth first: with the arguments swapped, precision and recall are
# exchanged and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.23      0.63      0.34        46
           1       0.36      0.59      0.45        73
           2       0.63      0.35      0.45       343
           3       0.30      0.55      0.39        67
           4       0.43      0.37      0.40       231

    accuracy                           0.42       760
   macro avg       0.39      0.50      0.40       760
weighted avg       0.49      0.42      0.42       760



# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()

In [None]:
nb.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [None]:
y_pred = nb.predict(X_test)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predictions.  With the arguments swapped the matrix comes out transposed.
confusion_matrix(y_test, y_pred)

array([[34, 13, 39, 21, 37],
       [12, 41, 28, 16, 46],
       [46, 31, 57, 20, 46],
       [12,  9, 26, 50, 20],
       [20, 26, 41, 18, 51]], dtype=int64)

In [None]:
# Arguments in (y_true, y_pred) order to match the scikit-learn signature;
# accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.30657894736842106

In [None]:
# Ground truth first: with the arguments swapped, precision and recall are
# exchanged and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.24      0.25       144
           1       0.34      0.29      0.31       143
           2       0.30      0.28      0.29       200
           3       0.40      0.43      0.41       117
           4       0.26      0.33      0.29       156

    accuracy                           0.31       760
   macro avg       0.31      0.31      0.31       760
weighted avg       0.31      0.31      0.31       760

