In [67]:
from csv_loader import tweets, test_tweets
from db import tweets as db_tweets
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable

import numpy as np

In [68]:
stemmer = PorterStemmer()

# Built once: calling stopwords.words('english') per token produces a fresh
# list every time and then scans it linearly. A frozenset gives the same
# membership answers with a constant-time lookup.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def check_base_word(base_word):
    """Decide whether ``base_word`` should be added to the vocabulary.

    Returns True only when the word is non-empty, longer than one character,
    purely alphabetic, and its Porter stem is neither already contained in the
    module-level ``base_vector`` nor an English stopword. Returns False
    otherwise.

    NOTE(review): the stopword test is applied to the *stemmed* form, exactly
    as before; stopwords whose stem differs from the word itself presumably
    slip through - confirm whether that is intended before changing it, since
    it would alter the vocabulary size.
    """
    if not base_word or len(base_word) <= 1 or not base_word.isalpha():
        return False
    stemmed_word = stemmer.stem(base_word)
    return stemmed_word not in base_vector and stemmed_word not in _ENGLISH_STOPWORDS


# Build the vocabulary: one entry per distinct accepted word stem, in order of
# first appearance across the training tweets.
base_vector = []
for tweet in tweets:
    text = tweet['text']
    if not text:
        # Tweets without any text contribute nothing to the vocabulary.
        continue
    for token in text.split():
        # check_base_word() consults base_vector itself, so stems must be
        # appended one at a time while iterating.
        if check_base_word(token):
            base_vector.append(stemmer.stem(token))
                

def vectorize_tweet(text, vocabulary=None):
    """Encode one tweet as a binary bag-of-words vector.

    Args:
        text: raw tweet text. ``None`` or an empty string is treated as a
            tweet without words and produces an all-zero vector (the
            vocabulary-building loop skips such tweets, so they must not
            crash here either).
        vocabulary: ordered list of stems defining the vector positions.
            Defaults to the module-level ``base_vector``.

    Returns:
        A list of 0/1 ints with one entry per vocabulary stem; 1 means the
        stem of at least one alphabetic, multi-character word of the tweet
        equals that vocabulary entry.
    """
    if vocabulary is None:
        vocabulary = base_vector
    tokens = set(text.split()) if text else set()
    stems = {stemmer.stem(token) for token in tokens
             if len(token) > 1 and token.isalpha()}
    return [1 if stem in stems else 0 for stem in vocabulary]


def vectorize_tweets(tweet_records):
    """Vectorize a collection of tweet records.

    Each record is read through its 'text', 'category' and 'subcategory' keys.

    Returns:
        A ``(vectors, category_labels, subcategory_labels)`` tuple of three
        lists aligned by position with ``tweet_records``.
    """
    vectors = []
    categories = []
    subcategories = []
    for record in tweet_records:
        categories.append(record['category'])
        subcategories.append(record['subcategory'])
        vectors.append(vectorize_tweet(record['text']))
    return vectors, categories, subcategories


# Transform training tweets into vectors
tweet_vectors, category_labels, subcategory_labels = vectorize_tweets(tweets)

# Transform test tweets into vectors (same vocabulary as the training set)
test_tweet_vectors, test_category_labels, test_subcategory_labels = vectorize_tweets(test_tweets)
        


In [69]:
# Hyper Parameters
# The input width must equal the length of every tweet vector, i.e. the
# vocabulary size. Deriving it from base_vector keeps the first layer in sync
# with the data instead of relying on a hard-coded count.
input_size = len(base_vector)
num_classes = 5
# NOTE(review): the training cell runs a fixed number of full-batch steps;
# num_epochs and batch_size do not appear to control it - confirm whether
# they are still needed.
num_epochs = 5
batch_size = 100
learning_rate = 0.002

# Model
class LogisticRegression(nn.Module):
    """Small feed-forward classifier: Linear -> ReLU -> Linear.

    Despite its name this is a network with one hidden layer, not a plain
    logistic regression. ``forward`` returns the raw output of the last
    linear layer; no softmax is applied inside the module.

    Args:
        input_size: number of features per input sample.
        num_classes: number of scores produced per sample.
        hidden_size: width of the hidden layer. Defaults to 200; the value is
            a free choice and may be tuned.
    """

    def __init__(self, input_size, num_classes, hidden_size=200):
        super(LogisticRegression, self).__init__()
        # y_1 = W_1 x
        self.linear = nn.Linear(input_size, hidden_size)
        # Non-linearity: y = sigma(W_1 x)
        self.relu = nn.ReLU()
        # y = W_2 sigma(W_1 x)
        self.linear2 = nn.Linear(hidden_size, num_classes)
        # Possible extension: add one more layer, e.g.
        # input_size -> 200 -> 100 -> num_classes, followed by another
        # non-linearity such as ReLU.

    def forward(self, x):
        """Map a batch of feature vectors to per-class scores."""
        hidden = self.relu(self.linear(x))
        return self.linear2(hidden)


# Instantiate the network with the configured input width and class count.
model = LogisticRegression(input_size=input_size, num_classes=num_classes)

# Loss and Optimizer
# CrossEntropyLoss computes the softmax internally, so the model's raw scores
# are passed to it directly.
criterion = nn.CrossEntropyLoss()
# Plain SGD over all parameters of the model.
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)



In [70]:
# Training the Model
# Each iteration is one full-batch gradient step over the entire training set.
total_epochs = 15000

# The training data does not change between iterations, so the tensors are
# built once instead of re-converting the Python lists on every step.
# NOTE(review): assumes category_labels holds integer class indices in
# [0, num_classes) as CrossEntropyLoss requires - confirm against csv_loader.
tensor_tweet_vector = Variable(torch.FloatTensor(tweet_vectors))
labels = Variable(torch.LongTensor(category_labels))

for epoch in range(total_epochs):
    # Forward + Backward + Optimize
    optimizer.zero_grad()

    outputs = model(tensor_tweet_vector)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        # `.item()` is the scalar accessor on current PyTorch, where indexing
        # a 0-dim loss with `loss.data[0]` is an error; older releases without
        # `.item()` fall back to the original indexing.
        loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
        # Report progress against the real number of iterations.
        print('Epoch: [%d/%d], Loss: %.4f' % (
            epoch + 1, total_epochs, loss_value))

Epoch: [1/5], Loss: 1.6060
Epoch: [101/5], Loss: 1.5399
Epoch: [201/5], Loss: 1.4790
Epoch: [301/5], Loss: 1.4227
Epoch: [401/5], Loss: 1.3708
Epoch: [501/5], Loss: 1.3231
Epoch: [601/5], Loss: 1.2798
Epoch: [701/5], Loss: 1.2407
Epoch: [801/5], Loss: 1.2058
Epoch: [901/5], Loss: 1.1751
Epoch: [1001/5], Loss: 1.1484
Epoch: [1101/5], Loss: 1.1256
Epoch: [1201/5], Loss: 1.1062
Epoch: [1301/5], Loss: 1.0898
Epoch: [1401/5], Loss: 1.0760
Epoch: [1501/5], Loss: 1.0644
Epoch: [1601/5], Loss: 1.0544
Epoch: [1701/5], Loss: 1.0459
Epoch: [1801/5], Loss: 1.0385
Epoch: [1901/5], Loss: 1.0319
Epoch: [2001/5], Loss: 1.0259
Epoch: [2101/5], Loss: 1.0204
Epoch: [2201/5], Loss: 1.0153
Epoch: [2301/5], Loss: 1.0105
Epoch: [2401/5], Loss: 1.0058
Epoch: [2501/5], Loss: 1.0014
Epoch: [2601/5], Loss: 0.9970
Epoch: [2701/5], Loss: 0.9928
Epoch: [2801/5], Loss: 0.9886
Epoch: [2901/5], Loss: 0.9844
Epoch: [3001/5], Loss: 0.9803
Epoch: [3101/5], Loss: 0.9762
Epoch: [3201/5], Loss: 0.9721
Epoch: [3301/5], Loss:

In [76]:
# Test the Model
# `images` holds the test tweet vectors as one batch.
# NOTE(review): assumes test_category_labels are integer class indices - confirm.
images = Variable(torch.FloatTensor(test_tweet_vectors))
outputs = model(images)
test_labels = torch.LongTensor(test_category_labels)

# The predicted class is the index of the highest score in each row.
_, predicted = torch.max(outputs.data, 1)

total = test_labels.size(0)
# int() normalises the count to a plain Python number whether .sum() returns
# a number or a 0-dim tensor.
correct = int((predicted == test_labels).sum())

# Report the actual test-set size rather than a hard-coded "100".
print('Accuracy of the model on the %d test tweets: %d %%' % (total, 100 * correct / total))

from db import tweets as db

# Classify every tweet coming from the database with the trained model.
for tweet in db:
    text = tweet['text']
    # Print the current tweet's text (the loop variable, not the training list).
    print(text)

    # Same binary bag-of-words encoding as used for the training tweets;
    # missing text is treated as a tweet without words.
    words = set(text.split()) if text else set()
    stemmed_words = {stemmer.stem(word) for word in words
                     if len(word) > 1 and word.isalpha()}
    tweet_vector = [1 if ele in stemmed_words else 0 for ele in base_vector]

    # Wrap the vector in a list to form a batch of one, so the output has a
    # class dimension (dim 1) for torch.max to reduce over.
    images = Variable(torch.FloatTensor([tweet_vector]))
    output = model(images)
    # Use this tweet's own output, not the stale `outputs` of the test batch.
    _, predicted = torch.max(output.data, 1)
    print(int(predicted[0]))

# Save the Model
# Only the parameter dictionary is written to disk, not the module object.
torch.save(obj=model.state_dict(), f='model.pkl')

Accuracy of the model on the 100 test: 80 %
