### NLP Twitter Sentiment Analysis

In [1]:
import warnings; warnings.filterwarnings('ignore')
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TreebankWordTokenizer, WordPunctTokenizer, BlanklineTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

import re

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv('Twitter.csv',encoding = "ISO-8859-1")
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Evim
[nltk_data]     Preschool\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

In [5]:
df.rename(columns={'tweet':'text'},inplace=True)

In [6]:
#cleaning text
df['text']=df['text'].str.lower()

In [7]:
df['text']=df['text'].str.replace('[^\@\w\s]','')

In [8]:
df['text']=df['text'].str.replace('\d+','')

In [9]:
df['text']=df['text'].str.replace('\n',' ')

In [10]:
df['text']=df['text'].str.replace('\s+',' ')

In [11]:
df['text']=df['text'].str.replace('rt','')

In [12]:
#remove http
df['text']=df['text'].str.replace('http\S+|www.\S+','')


In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,text
0,0,3,0,0,3,2,!!! @mayasolovely: as a woman you shouldn't c...
1,1,3,0,3,0,1,!!!!! @mleew17: boy dats cold...tyga dwn bad ...
2,2,3,0,3,0,1,!!!!!!! @urkindofbrand dawg!!!! @80sbaby4lif...
3,3,3,0,2,1,1,!!!!!!!!! @c_g_anderson: @viva_based she look...
4,4,6,0,6,0,1,!!!!!!!!!!!!! @shenikarobes: the shit you hea...


In [14]:
#remove stop words
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,text
0,0,3,0,0,3,2,!!! @mayasolovely: woman complain cleaning hou...
1,1,3,0,3,0,1,!!!!! @mleew17: boy dats cold...tyga dwn bad c...
2,2,3,0,3,0,1,!!!!!!! @urkindofbrand dawg!!!! @80sbaby4life:...
3,3,3,0,2,1,1,!!!!!!!!! @c_g_anderson: @viva_based look like...
4,4,6,0,6,0,1,!!!!!!!!!!!!! @shenikarobes: shit hear might t...


In [15]:
df['text'][5]

'!!!!!!!!!!!!!!!!!!"@t_madison_x: shit blows me..claim faithful somebody still fucking hoes! &#128514;&#128514;&#128514;"'

In [16]:
def clean_tweet(tweet):
    """
    This function removes everything up to and including the first colon,
    all @ mentions, and emojis in the tweet.
    
    Parameters:
    tweet (str): The input tweet as a string.
    
    Returns:
    str: The cleaned tweet.
    """
    # Remove everything up to and including the first colon
    colon_pos = tweet.find(':')
    if (colon_pos != -1):
        tweet = tweet[colon_pos + 1:].strip()
    
    # Remove @ mentions
    tweet = re.sub(r'@\w+', '', tweet).strip()
    
    # Remove emojis
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002500-\U00002BEF"  # chinese characters
                           u"\U00002702-\U000027B0"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           u"\u2640-\u2642" 
                           u"\u2600-\u2B55"
                           u"\u200d"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\ufe0f"  # dingbats
                           u"\u3030"
                           "]+", flags=re.UNICODE)
    
    tweet = emoji_pattern.sub(r'', tweet)
    
    return tweet


In [17]:
df['text']=df['text'].apply(clean_tweet)
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,text
0,0,3,0,0,3,2,woman complain cleaning house. &amp; man alway...
1,1,3,0,3,0,1,boy dats cold...tyga dwn bad cuffin dat hoe 1s...
2,2,3,0,3,0,1,ever fuck bitch sta cry? confused shit
3,3,3,0,2,1,1,look like tranny
4,4,6,0,6,0,1,shit hear might true might faker bitch told ya...


In [18]:
#remove $amp
df['text']=df['text'].str.replace('&amp','')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,text
0,0,3,0,0,3,2,woman complain cleaning house. ; man always ta...
1,1,3,0,3,0,1,boy dats cold...tyga dwn bad cuffin dat hoe 1s...
2,2,3,0,3,0,1,ever fuck bitch sta cry? confused shit
3,3,3,0,2,1,1,look like tranny
4,4,6,0,6,0,1,shit hear might true might faker bitch told ya...


In [19]:
from nltk.stem import PorterStemmer
pr = PorterStemmer()

In [20]:
def tokenize_and_stem(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [pr.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [21]:
df['text'] = df['text'].apply(tokenize_and_stem)


In [22]:
def lemma(text):
    words=TextBlob(text).words
    return [pr.stem(word) for word in words]

In [23]:
def polarity(text):
    return TextBlob(text).sentiment.polarity

In [24]:
def subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

In [25]:
df['polarity']=df['text'].apply(polarity)
df['subjectivity']=df['text'].apply(subjectivity)

In [26]:
df['sentiment']=np.where(df['polarity']>0,'positive',np.where(df['polarity']<0,'negative','neutral'))
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,text,polarity,subjectivity,sentiment
0,0,3,0,0,3,2,woman complain clean hous . ; man alway take t...,0.366667,0.7,positive
1,1,3,0,3,0,1,boy dat cold ... tyga dwn bad cuffin dat hoe 1...,-0.8,0.833333,negative
2,2,3,0,3,0,1,ever fuck bitch sta cri ? confus shit,-0.3,0.7,negative
3,3,3,0,2,1,1,look like tranni,0.0,0.0,neutral
4,4,6,0,6,0,1,shit hear might true might faker bitch told ya...,0.075,0.725,positive


In [27]:
x=sum(df['sentiment']=='positive')
y=sum(df['sentiment']=='negative')
z=sum(df['sentiment']=='neutral')
print(x,y,z)

6044 6174 12565


In [28]:
def sentiment_score(a, b, c):
    if (a>b) and (a>c):
        print("Positive 😊 ")
    elif (b>a) and (b>c):
        print("Negative 😠 ")
    else:
        print("Neutral 🙂 ")
sentiment_score(x, y, z)

Neutral 🙂 
