In [2]:
from csv_loader import tweets, test_tweets
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable

import numpy as np

In [3]:
stemmer = PorterStemmer()


def check_base_word(base_word):
	if base_word and len(base_word) > 1 and base_word.isalpha():
		stemmed_word = stemmer.stem(base_word)
		if stemmed_word not in base_vector and stemmed_word not in stopwords.words('english'):
			return True
	return False


# Create base vector
base_vector = []
for tweet in tweets:
	if tweet['text']:
		words = tweet['text'].split()
		for word in words:
			if check_base_word(word):
				base_vector.append(stemmer.stem(word))
                

# Transform tweets into vectors
tweet_vectors = []
category_labels = []
subcategory_labels = []
for tweet in tweets:
    category_labels.append(tweet['category'])
    subcategory_labels.append(tweet['subcategory'])
    words = list(set(tweet['text'].split()))
    stemmed_words = [stemmer.stem(word) for word in words if word and len(word) > 1 and word.isalpha()]
    tweet_vector = []
    for ele in base_vector:
        tweet_vector.append(1 if ele in stemmed_words else 0)
    tweet_vectors.append(tweet_vector)
        
# Transform tweets into vectors
test_tweet_vectors = []
test_category_labels = []
test_subcategory_labels = []
for tweet in test_tweets:
    test_category_labels.append(tweet['category'])
    test_subcategory_labels.append(tweet['subcategory'])
    words = list(set(tweet['text'].split()))
    stemmed_words = [stemmer.stem(word) for word in words if word and len(word) > 1 and word.isalpha()]
    test_tweet_vector = []
    for ele in base_vector:
        test_tweet_vector.append(1 if ele in stemmed_words else 0)
    test_tweet_vectors.append(test_tweet_vector)
        


In [4]:
tensor_tweet_vectors = torch.LongTensor(tweet_vectors)
tensor_category_labels = torch.LongTensor(category_labels)
tensor_subcategory_labels = torch.LongTensor(subcategory_labels)


tensor_test_tweet_vectors = torch.LongTensor(test_tweet_vectors)
tensor_test_category_labels = torch.LongTensor(test_category_labels)
tensor_test_subcategory_labels = torch.LongTensor(test_subcategory_labels)


In [None]:
# Hyper Parameters
input_size = len(base_vector)
num_classes = 5
num_epochs = 1000
batch_size = 100
learning_rate = 0.001

# Model
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, 100)
        self.linear2 = nn.Linear(100, num_classes)
    
    def forward(self, x):
        out = self.linear(x)
        out = self.linear2(out)
        return out


model = LogisticRegression(input_size, num_classes)

# Loss and Optimizer
# Softmax is internally computed.
# Set parameters to be updated.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Training the Model
for epoch in range(num_epochs):
    for i, (tweet_vector) in enumerate(tweet_vectors):
        tensor_tweet_vector = Variable(torch.FloatTensor(tweet_vectors))
        labels = Variable(torch.LongTensor(category_labels))

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = model(tensor_tweet_vector)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f' % (
            epoch + 1, num_epochs, i + 1, len(tensor_tweet_vectors) // batch_size, loss.data[0]))

# Test the Model
correct = 0
total = 0
for test_tweet_vector in test_tweet_vectors:
    tensor_test_tweet_vector = Variable(torch.FloatTensor(test_tweet_vectors))
    outputs = model(tensor_test_tweet_vector)
    test_labels = torch.LongTensor(test_category_labels)
    _, predicted = torch.max(outputs.data, 1)
    total += test_labels.size(0)
    correct += (predicted == test_labels).sum()

print('Accuracy of the model on the 100 test: %d %%' % (100 * correct / total))

# Save the Model
torch.save(model.state_dict(), 'model.pkl')

Epoch: [1/1000], Step: [100/3], Loss: 1.5782
Epoch: [1/1000], Step: [200/3], Loss: 1.5378
Epoch: [1/1000], Step: [300/3], Loss: 1.4995
Epoch: [2/1000], Step: [100/3], Loss: 1.4290
Epoch: [2/1000], Step: [200/3], Loss: 1.3961
Epoch: [2/1000], Step: [300/3], Loss: 1.3649
Epoch: [3/1000], Step: [100/3], Loss: 1.3077
Epoch: [3/1000], Step: [200/3], Loss: 1.2814
Epoch: [3/1000], Step: [300/3], Loss: 1.2566
Epoch: [4/1000], Step: [100/3], Loss: 1.2122
Epoch: [4/1000], Step: [200/3], Loss: 1.1922
Epoch: [4/1000], Step: [300/3], Loss: 1.1737
Epoch: [5/1000], Step: [100/3], Loss: 1.1413
Epoch: [5/1000], Step: [200/3], Loss: 1.1271
Epoch: [5/1000], Step: [300/3], Loss: 1.1141
Epoch: [6/1000], Step: [100/3], Loss: 1.0918
Epoch: [6/1000], Step: [200/3], Loss: 1.0821
Epoch: [6/1000], Step: [300/3], Loss: 1.0733
Epoch: [7/1000], Step: [100/3], Loss: 1.0581
Epoch: [7/1000], Step: [200/3], Loss: 1.0514
Epoch: [7/1000], Step: [300/3], Loss: 1.0453
Epoch: [8/1000], Step: [100/3], Loss: 1.0346
Epoch: [8/

Epoch: [60/1000], Step: [300/3], Loss: 0.3461
Epoch: [61/1000], Step: [100/3], Loss: 0.3417
Epoch: [61/1000], Step: [200/3], Loss: 0.3395
Epoch: [61/1000], Step: [300/3], Loss: 0.3374
Epoch: [62/1000], Step: [100/3], Loss: 0.3332
Epoch: [62/1000], Step: [200/3], Loss: 0.3311
Epoch: [62/1000], Step: [300/3], Loss: 0.3290
Epoch: [63/1000], Step: [100/3], Loss: 0.3249
Epoch: [63/1000], Step: [200/3], Loss: 0.3229
Epoch: [63/1000], Step: [300/3], Loss: 0.3208
Epoch: [64/1000], Step: [100/3], Loss: 0.3169
Epoch: [64/1000], Step: [200/3], Loss: 0.3149
Epoch: [64/1000], Step: [300/3], Loss: 0.3129
Epoch: [65/1000], Step: [100/3], Loss: 0.3090
Epoch: [65/1000], Step: [200/3], Loss: 0.3071
Epoch: [65/1000], Step: [300/3], Loss: 0.3052
Epoch: [66/1000], Step: [100/3], Loss: 0.3015
Epoch: [66/1000], Step: [200/3], Loss: 0.2996
Epoch: [66/1000], Step: [300/3], Loss: 0.2977
Epoch: [67/1000], Step: [100/3], Loss: 0.2941
Epoch: [67/1000], Step: [200/3], Loss: 0.2923
Epoch: [67/1000], Step: [300/3], L

Epoch: [119/1000], Step: [300/3], Loss: 0.1001
Epoch: [120/1000], Step: [100/3], Loss: 0.0992
Epoch: [120/1000], Step: [200/3], Loss: 0.0988
Epoch: [120/1000], Step: [300/3], Loss: 0.0984
Epoch: [121/1000], Step: [100/3], Loss: 0.0976
Epoch: [121/1000], Step: [200/3], Loss: 0.0972
Epoch: [121/1000], Step: [300/3], Loss: 0.0968
Epoch: [122/1000], Step: [100/3], Loss: 0.0960
Epoch: [122/1000], Step: [200/3], Loss: 0.0956
Epoch: [122/1000], Step: [300/3], Loss: 0.0953
Epoch: [123/1000], Step: [100/3], Loss: 0.0945
Epoch: [123/1000], Step: [200/3], Loss: 0.0941
Epoch: [123/1000], Step: [300/3], Loss: 0.0937
Epoch: [124/1000], Step: [100/3], Loss: 0.0930
Epoch: [124/1000], Step: [200/3], Loss: 0.0926
Epoch: [124/1000], Step: [300/3], Loss: 0.0922
Epoch: [125/1000], Step: [100/3], Loss: 0.0915
Epoch: [125/1000], Step: [200/3], Loss: 0.0912
Epoch: [125/1000], Step: [300/3], Loss: 0.0908
Epoch: [126/1000], Step: [100/3], Loss: 0.0901
Epoch: [126/1000], Step: [200/3], Loss: 0.0897
Epoch: [126/1

Epoch: [178/1000], Step: [100/3], Loss: 0.0459
Epoch: [178/1000], Step: [200/3], Loss: 0.0458
Epoch: [178/1000], Step: [300/3], Loss: 0.0457
