# NECESSARY LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
#!pip install emoji --upgrade
import emoji
#!pip install contractions
import contractions
#!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yakup\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yakup\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Data Cleaning and Tokenization Function

In [4]:
def process_tweet(tweet, verbose=False):
    
    tweet = re.sub('RT\s+', '', tweet)
   
    tweet = re.sub('\B@\w+','twitteruser', tweet)
    
    tweet = re.sub('(http|https):/\/\S+', '', tweet)
    
    tweet = re.sub('#+', '', tweet)
    
    tweet = tweet.lower()
    
    tweet = contractions.fix(tweet)
    
    tweet = re.sub(r'[\?\.\!]+(?=[\?\.\!])', '', tweet)
    
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
    
    tweet = emoji.demojize(tweet)
    
    token_list = word_tokenize(tweet)
    
    token_list = [token for token in token_list
                  if token not in string.punctuation]
    
    token_list = [token for token in token_list
                  if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    stop_words.discard('not')
    tokens = [token for token in token_list
                  if not token in stop_words] 
    
    token_list = []
    for token in tokens:
        token_list.append(porter_stemmer.stem(token))
  
    return token_list

# Tokenization of the Tweet Data

In [5]:
df = pd.read_csv('tweet_data.csv')

In [6]:
df.shape

(18727, 3)

In [7]:
df.head(3)

Unnamed: 0,textID,tweet_text,sentiment
0,1956967666,Layin n bed with a headache ughhhh...waitin o...,negative
1,1956967696,Funeral ceremony...gloomy friday...,negative
2,1956967789,wants to hang out with friends SOON!,positive


## Tokenization and Binary Transformation of Sentiment Values

In [8]:
df['tokens'] = df['tweet_text'].apply(process_tweet)
df['tweet_sentiment'] = df['sentiment'].apply(lambda i:1 
                                             if i== 'positive' else 0)

df.head()

Unnamed: 0,textID,tweet_text,sentiment,tokens,tweet_sentiment
0,1956967666,Layin n bed with a headache ughhhh...waitin o...,negative,"[layin, n, bed, headach, call]",0
1,1956967696,Funeral ceremony...gloomy friday...,negative,"[funer, friday]",0
2,1956967789,wants to hang out with friends SOON!,positive,"[want, hang, friend, soon]",1
3,1956968477,Re-pinging @ghostridah14: why didn't you go to...,negative,"[twitterus, not, go, prom, bf, not, like, friend]",0
4,1956968636,Hmmm. http://www.djhero.com/ is down,negative,[hmm],0


In [9]:
df.head(3)

Unnamed: 0,textID,tweet_text,sentiment,tokens,tweet_sentiment
0,1956967666,Layin n bed with a headache ughhhh...waitin o...,negative,"[layin, n, bed, headach, call]",0
1,1956967696,Funeral ceremony...gloomy friday...,negative,"[funer, friday]",0
2,1956967789,wants to hang out with friends SOON!,positive,"[want, hang, friend, soon]",1


In [10]:
X = df['tokens'].tolist()
y = df['tweet_sentiment'].tolist()

We extracted tokens and sentiment values for our machine learning model


Building TF-IDF functions which vectirizes our tweets

In [11]:
def build_freqs(tweet_list, sentiment_list):
  freqs = {}
  for tweet, sentiment in zip(tweet_list, sentiment_list):
    for word in tweet:
      pair = (word, sentiment)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1
  return freqs

In [12]:
def fit_tfidf(tweet_corpus):
  tf_vect = TfidfVectorizer(preprocessor = lambda x : x,
                            tokenizer  = lambda x : x)
  tf_vect.fit(tweet_corpus)
  return tf_vect

Partitioning of the data for train and test 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 0,
                                                    train_size = 0.80)

## Logistic Regression Model

In [14]:
def fit_lr(X_train, y_train):
  model = LogisticRegression()
  model.fit(X_train, y_train)
  return model

In [15]:
tf = fit_tfidf(X_train)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)



In [16]:
model_lr_tf = fit_lr(X_train_tf, y_train)

# Accuracy of the Model

In [17]:
y_pred_lr_tf = model_lr_tf.predict(X_test_tf)

In [18]:
accuracy_score(y_test, y_pred_lr_tf)

0.8830752802989856

# Pipiline function of all above 

In [19]:
def predict_tweet(tweet):
  processed_tweet = process_tweet(tweet)#will call the preprocess tweet we defined above
  transformed_tweet = tf.transform([processed_tweet])#will vectorize our twett with tf-idf 
  prediction = model_lr_tf.predict(transformed_tweet)

  if prediction == 0:
    return 'Prediction sentiment is negative'
  else:
    return 'Prediction sentiment is positive'

In [20]:
your_tweet = ' noooooooooo! I just got the worst news of this year😬'

In [21]:
predict_tweet(your_tweet)

'Prediction sentiment is negative'

In [22]:
your_tweet = '@TheEllenShow If only Bradley''s arm was longer. Best photo ever. #oscars😃'

In [23]:
predict_tweet(your_tweet)

'Prediction sentiment is positive'

Now that the model is trained and is capable of predicting the sentiment values of unseen observations,

Now i will write a function to predict the sentiment values when we have an aggregate data like a csv file, or a data frame

In [24]:
def predict_tweet(tweet):
  processed_tweet = process_tweet(tweet)
  transformed_tweet = tf.transform([processed_tweet])
  prediction = model_lr_tf.predict(transformed_tweet)
  return prediction

In [25]:
a = predict_tweet(your_tweet)

In [26]:
a

array([1])

there are two points here,

we need to use a for loop for our function to get to every tweet and implement the predict tweet pipeline

also, we need to get rid of the array structure

In [27]:
b = int(a)

In [28]:
b

1

In [29]:
b?

[1;31mType:[0m        int
[1;31mString form:[0m 1
[1;31mDocstring:[0m  
int([x]) -> integer
int(x, base=10) -> integer

Convert a number or string to an integer, or return 0 if no arguments
are given.  If x is a number, return x.__int__().  For floating point
numbers, this truncates towards zero.

If x is not a number or if base is given, then x must be a string,
bytes, or bytearray instance representing an integer literal in the
given base.  The literal can be preceded by '+' or '-' and be surrounded
by whitespace.  The base defaults to 10.  Valid bases are 0 and 2-36.
Base 0 means to interpret the base from the string as an integer literal.
>>> int('0b100', base=0)
4


we can get rid if the array structure of the predict_tweet function this way simply,

now let get a bunch of observations for testing

In [30]:
df.tail(15)

Unnamed: 0,textID,tweet_text,sentiment,tokens,tweet_sentiment
18712,1753904518,@Rtib happy birthday,positive,"[twitterus, happi, birthday]",1
18713,1753904626,@acchanosaurus good luck chan! gue kmrn bawa b...,positive,"[twitterus, good, luck, chan, gue, kmrn, bawa,...",1
18714,1753904668,good morning/midday nation! FORMULA ONE IN ON...,positive,"[good, nation, formula, one, one, hour]",1
18715,1753904674,to my pretty lady @nikkiwoods HAPPY MOTHER'S D...,positive,"[pretti, ladi, twitterus, happi, mother, day, ...",1
18716,1753904868,"@givemestrength bloody Feds, they lost last st...",negative,"[twitterus, bloodi, fed, lost, last, statement...",0
18717,1753904911,@prinsezha awesome. Wha'dya get her?,positive,"[twitterus, awesom, get]",1
18718,1753904912,Sitting in Gatwick- going home for a week! can...,positive,"[sit, go, home, week, not, wait, see, famili]",1
18719,1753904919,@maynaseric good luck with your auction,positive,"[twitterus, good, luck, auction]",1
18720,1753905153,going to watch boy in the striped pj's hope i ...,positive,"[go, watch, boy, stripe, pj, hope, not, cri]",1
18721,1753918809,"gave the bikes a thorough wash, degrease it an...",positive,"[gave, bike, thorough, wash, degreas, greas, t...",1


In [31]:
df.iloc[18172:18176, 1:2]

Unnamed: 0,tweet_text
18172,Good morning Tweeple of the sun! What you all ...
18173,@TheSimsHub I'm going to kill the person who s...
18174,Happy Mother's Day!
18175,Good Morning Everyone


In [32]:
test = df.iloc[18172:18176, 1:2]

In [33]:
test

Unnamed: 0,tweet_text
18172,Good morning Tweeple of the sun! What you all ...
18173,@TheSimsHub I'm going to kill the person who s...
18174,Happy Mother's Day!
18175,Good Morning Everyone


our relevant function, which is also a pipeline for aggregate data, which will add the sentiment scores on a new column

In [34]:
l = []
def sentiment_score(test):
    for i in test['tweet_text']:
        l.append(int(predict_tweet(i)))
    test['score'] = l
         

In [35]:
sentiment_score(test)

In [36]:
l

[1, 0, 1, 1]

In [37]:
test

Unnamed: 0,tweet_text,score
18172,Good morning Tweeple of the sun! What you all ...,1
18173,@TheSimsHub I'm going to kill the person who s...,0
18174,Happy Mother's Day!,1
18175,Good Morning Everyone,1


Now lets create some data which was not in the training, for testing our model:

In [38]:
test2 ={'tweet_text':[' not only good but the best in town','## this was a bad idea from the beginng @you','this movie is sooooooooo gloomy','what a splendid dayyyyyyy!!!!!!']}

In [39]:
test2 = pd.DataFrame(test2) 

In [85]:
test2?

[1;31mType:[0m        DataFrame
[1;31mString form:[0m
   tweet
0  Karan
1  Rohit
2  Sahil
3  Aryan
[1;31mLength:[0m      4
[1;31mFile:[0m        c:\users\yakup\anaconda3\lib\site-packages\pandas\core\frame.py
[1;31mDocstring:[0m  
Two-dimensional, size-mutable, potentially heterogeneous tabular data.

Data structure also contains labeled axes (rows and columns).
Arithmetic operations align on both row and column labels. Can be
thought of as a dict-like container for Series objects. The primary
pandas data structure.

Parameters
----------
data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
    Dict can contain Series, arrays, constants, dataclass or list-like objects. If
    data is a dict, column order follows insertion-order.

    .. versionchanged:: 0.25.0
       If data is a list of dicts, column order follows insertion-order.

index : Index or array-like
    Index to use for resulting frame. Will default to RangeIndex if
    no indexing information p

In [40]:
test2

Unnamed: 0,tweet_text
0,not only good but the best in town
1,## this was a bad idea from the beginng @you
2,this movie is sooooooooo gloomy
3,what a splendid dayyyyyyy!!!!!!


In [41]:
l = []
def sentiment_score(test2):
    for i in test2['tweet_text']:
        l.append(int(predict_tweet(i)))
    test2['score'] = l
         

In [42]:
sentiment_score(test2)

In [43]:
test2

Unnamed: 0,tweet_text,score
0,not only good but the best in town,1
1,## this was a bad idea from the beginng @you,0
2,this movie is sooooooooo gloomy,0
3,what a splendid dayyyyyyy!!!!!!,1


However, I want to mention a situation about the data. In some cases, the data which we used for training does not produce the results we wish as the vectorization process combined with the context of the tokens may result in unforeseen consequences. For example, when we think of the word sunny, we would expect it to have a positive sentiment,lets chech:

In [47]:
predict_tweet('sunny')

array([0])

however, sunny has a negative value in our model apparently. Why would that be? Lets chech out data for some answers. Lets chechk the tweets which the word sunny is included:

In [48]:
sunny = df[df['tweet_text'].str.contains("sunny")]

In [49]:
sunny.groupby('sentiment').count()

Unnamed: 0_level_0,textID,tweet_text,tokens,tweet_sentiment
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,23,23,23,23
positive,20,20,20,20


as can be seen, because of the context, the word 'sunny' is considered as negative by the model


the difference between negative and positive 'sunny' usages are three

therefore I will add 5 positive 'sunny' values to the data and rebuild the model

first I will reread the csv data

In [50]:
df_ = pd.read_csv('tweet_data.csv')

In [51]:
df_.tail(3)

Unnamed: 0,textID,tweet_text,sentiment
18724,1753918900,Succesfully following Tayla!!,positive
18725,1753919001,Happy Mothers Day All my love,positive
18726,1753919005,Happy Mother's Day to all the mommies out ther...,positive


In [52]:
df_.shape

(18727, 3)

In [78]:
df2 = pd.DataFrame({'textID': ['1838943234', '1838943235', '1838943236', '1838943237', '1838943238', '1838943239', '1838943240', '1838943240', '1838943240', '1838943240'],
                    'tweet_text' : ['sunny days make me so happy!','dont get sad !sunny days are ahaed of us', 'sunny and awesome', 'sun is good for your health', 'beach is sunny and funny!!!', 'sun is beatiful', 'sun is so bright', 'sunny means funny!', 'a sunny garden iss all I want', 'The weather was surprisingly warm and sunny'], 
                    'sentiment' : ['positive', 'positive','positive','positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive']})

In [79]:
df2

Unnamed: 0,textID,tweet_text,sentiment
0,1838943234,sunny days make me so happy!,positive
1,1838943235,dont get sad !sunny days are ahaed of us,positive
2,1838943236,sunny and awesome,positive
3,1838943237,sun is good for your health,positive
4,1838943238,beach is sunny and funny!!!,positive
5,1838943239,sun is beatiful,positive
6,1838943240,sun is so bright,positive
7,1838943240,sunny means funny!,positive
8,1838943240,a sunny garden iss all I want,positive
9,1838943240,The weather was surprisingly warm and sunny,positive


In [80]:
df = pd.concat([df_, df2], ignore_index = True, axis = 0)

In [81]:
df.tail()

Unnamed: 0,textID,tweet_text,sentiment
18732,1838943239,sun is beatiful,positive
18733,1838943240,sun is so bright,positive
18734,1838943240,sunny means funny!,positive
18735,1838943240,a sunny garden iss all I want,positive
18736,1838943240,The weather was surprisingly warm and sunny,positive


Now lets rebuild the model

In [82]:
df['tokens'] = df['tweet_text'].apply(process_tweet)
df['tweet_sentiment'] = df['sentiment'].apply(lambda i:1 
                                             if i== 'positive' else 0)

X = df['tokens'].tolist()
y = df['tweet_sentiment'].tolist()

def build_freqs(tweet_list, sentiment_list):
  freqs = {}
  for tweet, sentiment in zip(tweet_list, sentiment_list):
    for word in tweet:
      pair = (word, sentiment)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1
  return freqs

def fit_tfidf(tweet_corpus):
  tf_vect = TfidfVectorizer(preprocessor = lambda x : x,
                            tokenizer  = lambda x : x)
  tf_vect.fit(tweet_corpus)
  return tf_vect

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 0,
                                                    train_size = 0.80)

In [83]:
def fit_lr(X_train, y_train):
  model = LogisticRegression()
  model.fit(X_train, y_train)
  return model

tf = fit_tfidf(X_train)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)

model_lr_tf = fit_lr(X_train_tf, y_train)



In [84]:
y_pred_lr_tf = model_lr_tf.predict(X_test_tf)

accuracy_score(y_test, y_pred_lr_tf)

0.891675560298826

In [85]:
def predict_tweet(tweet):
  processed_tweet = process_tweet(tweet)#will call the preprocess tweet we defined above
  transformed_tweet = tf.transform([processed_tweet])#will vectorize our twett with tf-idf 
  prediction = model_lr_tf.predict(transformed_tweet)

  if prediction == 0:
    return 'Prediction sentiment is negative'
  else:
    return 'Prediction sentiment is positive'

In [86]:
your_tweet = 'sun day'

In [87]:
predict_tweet(your_tweet)

'Prediction sentiment is positive'

In [88]:
your_tweet = 'sunny'

In [89]:
predict_tweet(your_tweet)

'Prediction sentiment is positive'

As can be seen, as we increased the frequency of positive sentiments in the train data for the word sunny, the machine learned that 'sunny' is positive, just to see before finishing, lets check and see that our new data has a greater frequency of positive usage of sunny and sun:

In [90]:
sunny2 = df[df['tweet_text'].str.contains("sunny")]

In [91]:
sunny2.groupby('sentiment').count()

Unnamed: 0_level_0,textID,tweet_text,tokens,tweet_sentiment
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,23,23,23,23
positive,27,27,27,27


In [92]:
sun = df[df['tweet_text'].str.contains("sun")]

In [93]:
sun.groupby('sentiment').count()

Unnamed: 0_level_0,textID,tweet_text,tokens,tweet_sentiment
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,138,138,138,138
positive,142,142,142,142


The tweet data is retreived from Benjamin Termonia's 'Applied Text Mining and Sentiment Analysis with Python' Course on UDEMY
URL: https://www.udemy.com/course/applied-text-mining-and-sentiment-analysis-with-python/?src=sac&kw=applied+text+m