## Sentiment analysis in PyTorch

In [32]:
import os
import re
import pandas as pd
from pathlib import Path

import nltk
from nltk.tokenize import TweetTokenizer

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [8]:
# Location of the sentiment dataset on the author's machine; the train/test
# CSV paths are derived from this single directory.
data_path = Path(r"C:\Users\bamilosin\Documents\dataset\nlp\sentiment data")
train_path = data_path.joinpath("train.csv")
test_path = data_path.joinpath("test.csv")

In [9]:
from charset_normalizer import detect

# Guess the training CSV's text encoding from its raw bytes. The printed
# `result` exposes an 'encoding' entry, which the read_csv cell below uses.
with open(train_path, 'rb') as file:
    # The whole file is read into memory and handed to the detector.
    result = detect(file.read())
    print(result)

{'encoding': 'windows-1250', 'language': 'English', 'confidence': 0.9676666666666667}


In [60]:
# Load the training CSV with the detected encoding, keep only the two columns
# we need ('text' and 'sentiment'), and drop rows containing null values.
train_data = (
    pd.read_csv(train_path, encoding=result['encoding'])
    [['text', 'sentiment']]
    .dropna()
)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       27480 non-null  object
 1   sentiment  27480 non-null  object
dtypes: object(2)
memory usage: 644.1+ KB


In [62]:
import random

# Pick one row at random to eyeball.
# - randrange() excludes the upper bound, whereas randint() includes it and
#   could return len(train_data), which is one past the last row.
# - .iloc indexes by position; dropna() leaves gaps in the index labels, so a
#   label lookup (train_data['text'][idx]) can raise KeyError.
idx = random.randrange(len(train_data))
text = train_data['text'].iloc[idx]
text

'is looking at that person from a very different pov.  never thought i`d see this day arrive!'

In [None]:
def clean_data(text):
    """Return a lower-cased copy of *text* with markup symbols and URLs removed.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Whitespace that surrounded removed pieces is left
        in place.
    """
    text_lower = text.lower()
    # Drop symbol characters that carry no sentiment signal.
    clean_text = re.sub('[<>{};@#$%^&*()>]', '', text_lower)
    # Drop http(s):// and www. links (same pattern as Vocabulary.clean_text).
    clean_text = re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', '', clean_text)
    return clean_text

In [64]:
def tokenize_data(text):
    """Split *text* into a list of tokens using NLTK's TweetTokenizer."""
    return nltk.TweetTokenizer().tokenize(text)

In [65]:
# Stop words: words that don't add semantic meaning to the dataset.
stopwords = nltk.corpus.stopwords.words('english')
# Tokenize the sample text chosen above, then drop every token that is a
# stop word.
tokens = tokenize_data(text)
filtered_tokens = [token for token in tokens if token not in stopwords]

# Show both lists so the effect of the filter is visible.
print(f"unfiltered tokens: {tokens}")
print(f"filtered tokens: {filtered_tokens}")

unfiltered tokens: ['is', 'looking', 'at', 'that', 'person', 'from', 'a', 'very', 'different', 'pov', '.', 'never', 'thought', 'i', '`', 'd', 'see', 'this', 'day', 'arrive', '!']
filtered tokens: ['looking', 'person', 'different', 'pov', '.', 'never', 'thought', '`', 'see', 'day', 'arrive', '!']


In [71]:
class Vocabulary(Dataset):
    """Cleaned, tokenized and stop-word-filtered view of a sentiment DataFrame.

    Attributes:
        data: the 'text' and 'sentiment' columns of the input frame.
        texts: the 'text' column as a plain list.
        all_tokens: one list of tokens per entry of ``texts``, in the same
            order.
    """

    # Compiled once for the class instead of being looked up for every text.
    _SYMBOLS_RE = re.compile('[<>{};@#$%^&*()>]')
    _URL_RE = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')

    def __init__(self, df, csv_encoding: str = "windows-1250"):
        """Tokenize every text in *df*.

        Args:
            df: DataFrame containing 'text' and 'sentiment' columns.
            csv_encoding: not used by this class; retained so existing
                callers that pass it keep working.
        """
        super().__init__()

        # get relevant columns
        self.data = df[['text', 'sentiment']]
        self.texts = list(df['text'])
        self.all_tokens = self.create_tokenized_data(self.texts)

    def clean_text(self, text):
        """Return *text* stripped, lower-cased, and free of symbols and URLs."""
        lowered = text.strip().lower()
        # Symbols are removed before URLs, so a link is still matched (and
        # removed) as one whitespace-delimited run.
        without_symbols = self._SYMBOLS_RE.sub('', lowered)
        return self._URL_RE.sub('', without_symbols)

    def tokenize_data(self, text):
        """Split *text* into a list of tokens using NLTK's TweetTokenizer."""
        # Build the tokenizer on first use and reuse it afterwards rather
        # than constructing a new one for every text.
        tokenizer = getattr(self, "_tokenizer", None)
        if tokenizer is None:
            tokenizer = self._tokenizer = nltk.TweetTokenizer()
        return tokenizer.tokenize(text)

    def create_tokenized_data(self, texts):
        """Clean, tokenize and stop-word-filter every string in *texts*.

        Args:
            texts: iterable of raw strings.

        Returns:
            A list with one token list per input text, in input order. A text
            made up entirely of stop words yields an empty list.
        """
        # A set gives O(1) membership tests; the NLTK word list is a plain
        # list, which would be scanned once per token.
        stopwords = frozenset(nltk.corpus.stopwords.words('english'))
        all_tokens = []
        for text in texts:
            tokens = self.tokenize_data(self.clean_text(text))
            # filter tokens
            all_tokens.append(
                [token for token in tokens if token not in stopwords]
            )

        return all_tokens
            

In [72]:
vocab = Vocabulary(train_data)

In [73]:
vocab.all_tokens

[['`', 'responded', ',', 'going'],
 ['sooo', 'sad', 'miss', 'san', 'diego', '!', '!', '!'],
 ['boss', 'bullying', '...'],
 ['interview', '!', 'leave', 'alone'],
 ['sons', ',', '`', 'put', 'releases', 'already', 'bought'],
 ['-', 'shameless', 'plugging', 'best', 'rangers', 'forum', 'earth'],
 ['2am', 'feedings', 'baby', 'fun', 'smiles', 'coos'],
 ['soooo', 'high'],
 [],
 ['journey',
  '!',
  '?',
  'wow',
  '...',
  'u',
  'became',
  'cooler',
  '.',
  'hehe',
  '...',
  'possible',
  '!',
  '?'],
 ['much',
  'love',
  'hopeful',
  ',',
  'reckon',
  'chances',
  'minimal',
  '=p',
  '`',
  'never',
  'gonna',
  'get',
  'cake',
  'stuff'],
 ['really', 'really', 'like', 'song', 'love', 'story', 'taylor', 'swift'],
 ['sharpie', 'running', 'dangerously', 'low', 'ink'],
 ['want', 'go', 'music', 'tonight', 'lost', 'voice', '.'],
 ['test', 'test', 'lg', 'env', '2'],
 ['uh', 'oh', ',', 'sunburned'],
 ['`', 'ok', ',', 'trying', 'plot', 'alternatives', 'speak', 'sigh'],
 ['`',
  'sick',
  'pas