<a href="https://colab.research.google.com/github/i-ganza007/ML_Exercises/blob/main/Breakout9_twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install tweet-preprocessor



## Import Libraries

In [2]:
""" Importing all libraries """

import re
import sklearn
import numpy as np
import pandas as pd
import preprocessor as p

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


## Import Datasets

In [5]:
""" Specify the paths to the dataset """
train_path = "/content/train.csv"
test_path = "/content/test.csv"

In [6]:
# training data
train = pd.read_csv(train_path)

# testing data
test = pd.read_csv(test_path)

## Understanding the Dataset

In [7]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [8]:
test.tail()

Unnamed: 0,id,tweet
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."
17196,49159,"my song ""so glad"" free download! #shoegaze ..."


In [9]:
# good sentiment related tweets
sum(train["label"] == 0)

29720

In [10]:
# bad sentiment related tweets
sum(train["label"] == 1)

2242

In [11]:
# check if there are any missing values
train.isnull().sum()

Unnamed: 0,0
id,0
label,0
tweet,0


## Data Cleaning

In [14]:
# Set up punctuations we want to be replaced
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

  REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
  REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")


In [15]:
def clean_tweets(df):
    tempArr = []
    for line in df:
        # clean using tweet_preprocessor
        tmpL = p.clean(line)
        # remove all punctuation
        tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower())
        tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
        tempArr.append(tmpL)
    return tempArr

In [16]:
# clean training data
train_tweet = clean_tweets(train["tweet"])
train_tweet = pd.DataFrame(train_tweet)

In [17]:
# append cleaned tweets to the training dataset
train["clean_tweet"] = train_tweet

# display the new dataset
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now


In [18]:
# clean training data
test_tweet = clean_tweets(test["tweet"])
test_tweet = pd.DataFrame(test_tweet)

In [19]:
# append cleaned tweets to the test dataset
test["clean_tweet"] = test_tweet

# display the new dataset
test.tail()

Unnamed: 0,id,tweet,clean_tweet
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation &gt3
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in omg &amp used words like assets&ampli...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...
17196,49159,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download


## Train and Test Split

In [20]:
# extract the labels from the train data
y = train.label.values

# use 70% for the training and 30% for the test
x_train, x_test, y_train, y_test = train_test_split(train.clean_tweet.values, y,
                                                   stratify = y,
                                                   random_state = 1,
                                                   test_size = 0.3,
                                                   shuffle = True)

## Vectorize tweets using CountVectorizer()

In [22]:
# Implement the Bag of Words vectorization using CountVectorizer()
vectorizer = CountVectorizer(binary = True, stop_words = "english")

# learn a vocabulary dictionary of all tokens in the raw documents
vectorizer.fit(list(x_train) + list(x_test))

# transfrom documents to document-term matrix
x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)

## Model Building

In [23]:
""" Train a classical machine learning model (like SVC, etc) """
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score
# Define your model
model = SVC(kernel="linear",probability=True)

# fit the SVC model based on the given training data
model.fit(x_train_vec, y_train).predict_proba(x_test_vec)

# perform classification and prediction on samples in x_test
y_pred = model.predict(x_test_vec)

## Accuracy score for SVC with CountVectorizer

In [25]:
print("Accuracy score for SVC is:", accuracy_score(y_test, y_pred) * 100, "%")

Accuracy score for SVC is: 94.86912086766085 %


## Vectorize tweets using Tf-IDF

In [26]:
# initilizing the TFID Vectorizer()
tf_idf_vectorizer = TfidfVectorizer(use_idf = True,
                                    smooth_idf = True,
                                    sublinear_tf = True,
                                    stop_words = "english")

# learn a vocabulary dictionary of all tokens in the raw documents
tf_idf_vectorizer.fit(list(x_train) + list(x_test))

# transfrom documents to document-term matrix
x_train_vec_tf = tf_idf_vectorizer.transform(x_train)
x_test_vec_tf = tf_idf_vectorizer.transform(x_test)

In [27]:
# Define your model
model = SVC(kernel="linear",probability=True)

# fit the SVC model based on the given training data
model.fit(x_train_vec_tf, y_train).predict_proba(x_test_vec_tf)

# perform classification and prediction on samples in x_test
y_pred_svm = model.predict(x_test_vec_tf)

In [28]:
print("Accuracy score for SVC is:", accuracy_score(y_test, y_pred_svm) * 100, "%")

Accuracy score for SVC is: 94.96297841276463 %
