In [62]:

import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Prepare data

## Read input

In [30]:
input_file = './data/data.csv'
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
# Column labels are supplied through `names`; only id, target and text are
# kept before 100 rows are drawn at random.
# A fixed random_state makes the subsample, and everything derived from it,
# identical on every Restart & Run All.
input_df = (
    pd.read_csv(input_file, names=column_names, encoding='latin-1')[["id", "target", "text"]]
    .sample(100, random_state=2)
)

## Download stopwords

In [20]:
import nltk
# Only call the downloader when the stop-word corpus is not already present
# in one of NLTK's data directories, so re-running this cell is a no-op.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karpi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Clean input

In [31]:
# Drop incomplete rows, then recode the sentiment label from {0, 4} to {0, 1}.
# Series.map produces NaN for any label that is not a key of the mapping, so
# those rows are dropped afterwards and the column is cast back to int64;
# otherwise NaN labels would end up in Y.
transformed_df = (
    input_df
    .dropna()
    .assign(target=lambda frame: frame['target'].map({0: 0, 4: 1}))  # Normalize target values
    .dropna(subset=['target'])
    .astype({'target': 'int64'})
)

## Stemming

In [32]:
def steeming_transform(text):
    """Normalise a piece of text into a space-joined string of stemmed tokens.

    Processing steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. tokens found in the English stop-word list are discarded;
      4. the remaining tokens are stemmed with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once per call. Constructing it inside the
    # comprehension's filter would repeat the lookup for every single token.
    english_stopwords = set(stopwords.words('english'))
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in english_stopwords
    )

In [33]:
# Run the stemming pipeline over every text and store the result in a new
# 'processed_text' column.
transformed_df['processed_text'] = transformed_df['text'].map(steeming_transform)

## Feature extraction and train/test split

In [42]:
# Pull the model inputs (processed text) and the labels out of the frame
# as plain arrays.
X, Y = (
    transformed_df[column].values
    for column in ('processed_text', 'target')
)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Hold out 20% of the samples; the split is stratified on Y and seeded.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

# Convert the textual data to numerical data. The vectorizer is fitted on the
# training texts only and then applied unchanged to the test texts; both
# results are converted with .toarray().
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

## Dataset

In [55]:
class TweetDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Parameters
    ----------
    x : sequence
        Indexable collection of feature rows; must support ``len``.
    y : sequence
        Indexable collection of labels, one per entry of ``x``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same length.
    """

    def __init__(self, x, y):
        # Fail at construction time: a length mismatch would otherwise only
        # surface as an IndexError part-way through iteration.
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

In [61]:
# Wrap each split's feature matrix and label vector in a TweetDataset.
train_dataset, test_dataset = (
    TweetDataset(features, labels)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)

In [63]:
# One loader per split, each yielding a single sample per batch.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=1)
    for split in (train_dataset, test_dataset)
)