<a href="https://colab.research.google.com/github/krystianjarmul/tweet-sentiment-classification/blob/main/tweet_sentiment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download data
!gdown --id 1BnUfDWvSGooNDgNZFAO54TLllU-WubPE

Downloading...
From: https://drive.google.com/uc?id=1BnUfDWvSGooNDgNZFAO54TLllU-WubPE
To: /content/Data_tweets.csv
4.30MB [00:00, 16.9MB/s]


In [None]:
from __future__ import annotations
import os
from string import punctuation

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
)

try:
    # scikit-learn deprecated its vendored joblib in 0.21 and removed it in 0.23;
    # the standalone package is the supported import.
    import joblib
except ImportError:
    # Fallback for very old environments that only ship the vendored copy.
    from sklearn.externals import joblib

# Seed NumPy's global random number generator so repeated runs are repeatable
np.random.seed(42)

In [None]:
# Load English pipeline optimized for CPU, downloading it only when it is missing
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # spacy.cli.download installs it for the interpreter running this kernel and
    # aborts on failure, whereas the previous os.system("python -m ...") call
    # could target a different interpreter and silently ignored a failed download.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Stop words that Preprocessor removes from every tweet
stopwords = list(STOP_WORDS)

In [None]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that tokenizes and cleans tweets.

    Every tweet is parsed with the module-level spaCy pipeline ``nlp``. Each
    token is lemmatized and lower-cased; empty tokens, tokens made up only of
    punctuation characters, and tokens found in the module-level ``stopwords``
    collection are discarded. The remaining tokens are joined with single
    spaces.
    """

    # Characters that mark a token as pure punctuation.
    _PUNCTUATION = frozenset(punctuation)

    def fit(self, X: pd.Series, y: pd.Series = None) -> Preprocessor:
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def transform(self, X: pd.Series, y: pd.Series = None) -> pd.Series:
        """Clean every tweet in ``X``.

        Args:
            X: Tweets to clean. A ``pd.Series`` keeps its index and name; any
                other iterable of strings (list, array, ...) is accepted too,
                and a single string is treated as one tweet.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            A ``pd.Series`` of cleaned tweets aligned with ``X``.
        """
        if isinstance(X, str):
            # Iterating a bare string would yield characters, not tweets.
            X = [X]
        if not isinstance(X, pd.Series):
            # Pipelines are frequently fed plain lists/arrays at predict time.
            X = pd.Series(list(X), dtype=object)
        # nlp.pipe processes the texts in batches, which is considerably
        # faster than calling nlp() once per tweet.
        cleaned = [self._clean_doc(doc) for doc in nlp.pipe(X.tolist())]
        return pd.Series(cleaned, index=X.index, name=X.name, dtype=object)

    def preprocess(self, tweet: str) -> str:
        """Clean a single tweet and return it as a space-separated string."""
        return self._clean_doc(nlp(tweet))

    def _clean_doc(self, doc) -> str:
        """Turn a parsed spaCy ``Doc`` into the cleaned, space-joined text."""
        kept = []
        for token in doc:
            if token.lemma_ == '-PRON-':
                # spaCy v2 lemmatizes every pronoun to the '-PRON-' placeholder;
                # keep the lower-cased surface form instead.
                text = token.lower_
            else:
                text = token.lemma_.lower().strip()
            if self._is_punctuation(text) or text in stopwords:
                continue
            kept.append(text)
        return ' '.join(kept)

    def _is_punctuation(self, text: str) -> bool:
        """Return True for empty strings and strings made only of punctuation.

        A plain ``text in punctuation`` is a substring test against the
        punctuation string, so multi-character tokens such as '...' or '!?'
        would slip through; checking every character closes that gap.
        """
        return all(char in self._PUNCTUATION for char in text)

In [None]:
# Read the raw tweets CSV; header=None makes pandas label the columns 0, 1, 2, ...
data = pd.read_csv(filepath_or_buffer='/content/Data_tweets.csv', header=None)
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,2068921155,Sun Jun 07 14:56:42 PDT 2009,NO_QUERY,smiley_sophie,my arm still hurts from when i pulled it yeste...
1,1,4,2065871668,Sun Jun 07 09:27:21 PDT 2009,NO_QUERY,ImmaChocoholic,I have so much to do outside! Been looking at ...
2,2,0,1835774749,Mon May 18 06:43:27 PDT 2009,NO_QUERY,drmomentum,"@AbsolutSara Yes, I knew about the clusterfark..."
3,3,0,1967121891,Fri May 29 19:00:46 PDT 2009,NO_QUERY,sweetsheilx,Just woke up and i feel relieved Haha now i ha...
4,4,4,1695846172,Mon May 04 07:04:29 PDT 2009,NO_QUERY,monmariej,LOVING the hot weather forecast for the rest o...


In [None]:
# Select the tweet text (column 6) and its sentiment label (column 1) and name them.
# `data` is the raw frame loaded in the previous cell, so the CSV is not read a second time.
df = data[[6, 1]].rename(columns={6: "Tweet", 1: "Sentiment"})
df.head()

Unnamed: 0,Tweet,Sentiment
0,my arm still hurts from when i pulled it yeste...,0
1,I have so much to do outside! Been looking at ...,4
2,"@AbsolutSara Yes, I knew about the clusterfark...",0
3,Just woke up and i feel relieved Haha now i ha...,0
4,LOVING the hot weather forecast for the rest o...,4


In [None]:
# Split data into train (80%) and test (20%) sets
X = df['Tweet']
y = df['Sentiment']
# An explicit random_state pins the split on its own, so it no longer depends on
# how much of NumPy's global random state earlier (or re-run) cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, y_train.shape

((24000,), (24000,))

In [None]:
# Assemble the model: tweet cleaning -> TF-IDF features -> SVM with an RBF kernel
pipeline_steps = [
    ('preprocessor', Preprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('svc_rbf', SVC(kernel='rbf')),
]
# verbose=True makes the pipeline report progress for each step while fitting
pipeline = Pipeline(steps=pipeline_steps, verbose=True)

In [None]:
# Train model: fit the whole pipeline on the training tweets and their labels
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate model on the held-out test set
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
# Per-class metrics first, a few blank lines, then the raw confusion matrix
print(report, "\n\n", matrix, sep="\n")

In [None]:
# Save pipeline: serialize the pipeline object to disk with joblib
# NOTE(review): the pickle presumably references Preprocessor by name, so the class
# (and the `nlp` / `stopwords` globals it uses) must be defined before loading — verify.
joblib.dump(pipeline, '/content/pipeline.pkl')