<a href="https://colab.research.google.com/github/krystianjarmul/tweet-sentiment-classification/blob/main/tweet-sentiment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
# Download data
!gdown --id 1BnUfDWvSGooNDgNZFAO54TLllU-WubPE

Downloading...
From: https://drive.google.com/uc?id=1BnUfDWvSGooNDgNZFAO54TLllU-WubPE
To: /content/Data_tweets.csv
0.00B [00:00, ?B/s]4.30MB [00:00, 137MB/s]


In [40]:
from __future__ import annotations
from string import punctuation
import os

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
)

np.random.seed(42)

In [41]:
# Load English pipeline optimized for CPU
os.system("python -m spacy download en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
stopwords = list(STOP_WORDS)

In [42]:
class Preprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_ = X_.map(lambda row: self.preprocess(row))
        return X_

    def preprocess(self, tweet):
        doc = nlp(tweet)
        tokens = []
        for token in doc:
            if token.lemma_ != '-PRON-':
                temp = token.lemma_.lower().strip()
            else:
                temp = token.lower_
            tokens.append(temp)

        clean_tokens = []
        for token in tokens:
            if token not in punctuation and token not in stopwords:
                clean_tokens.append(token)

        return ' '.join(tokens)


In [43]:
# Load raw data
data = pd.read_csv('/content/Data_tweets.csv', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,2068921155,Sun Jun 07 14:56:42 PDT 2009,NO_QUERY,smiley_sophie,my arm still hurts from when i pulled it yeste...
1,1,4,2065871668,Sun Jun 07 09:27:21 PDT 2009,NO_QUERY,ImmaChocoholic,I have so much to do outside! Been looking at ...
2,2,0,1835774749,Mon May 18 06:43:27 PDT 2009,NO_QUERY,drmomentum,"@AbsolutSara Yes, I knew about the clusterfark..."
3,3,0,1967121891,Fri May 29 19:00:46 PDT 2009,NO_QUERY,sweetsheilx,Just woke up and i feel relieved Haha now i ha...
4,4,4,1695846172,Mon May 04 07:04:29 PDT 2009,NO_QUERY,monmariej,LOVING the hot weather forecast for the rest o...


In [44]:
# Select important columns from raw data and name them
data = pd.read_csv('/content/Data_tweets.csv', header=None)
df = data[[6, 1]].rename(columns={6: "Tweet", 1: "Sentiment"})
df.head()

Unnamed: 0,Tweet,Sentiment
0,my arm still hurts from when i pulled it yeste...,0
1,I have so much to do outside! Been looking at ...,4
2,"@AbsolutSara Yes, I knew about the clusterfark...",0
3,Just woke up and i feel relieved Haha now i ha...,0
4,LOVING the hot weather forecast for the rest o...,4


In [45]:
# Split data into train and test sets
X = df['Tweet']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape

((24000,), (24000,))

In [46]:
# Create pipeline
pipeline = Pipeline(
    [
        ('preprocessor', Preprocessor()),
        ('tfidf', TfidfVectorizer()),
        ('svc_rbf', SVC(kernel='rbf'))
    ],
    verbose=True
)

In [47]:
# Train model
pipeline.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total= 4.0min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........... (step 3 of 3) Processing svc_rbf, total= 3.6min


Pipeline(memory=None,
         steps=[('preprocessor', Preprocessor()),
                ('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svc_rbf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
            

In [48]:
# Evaluate model
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("\n\n")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.77      0.78      3004
           4       0.77      0.78      0.78      2996

    accuracy                           0.78      6000
   macro avg       0.78      0.78      0.78      6000
weighted avg       0.78      0.78      0.78      6000




[[2326  678]
 [ 665 2331]]


In [50]:
def predict(text, pipeline=pipeline):
  y_pred = pipeline.predict(pd.Series(text))
  if not y_pred:
    print('NEGATIVE')
  else:
    print('POSITIVE')

In [51]:
predict("I feel tired this morning.")

NEGATIVE


In [52]:
predict("He is my best friend.")

POSITIVE


In [49]:
# Save pipeline
joblib.dump(pipeline, '/content/pipeline.pkl')

['/content/pipeline.pkl']