In [17]:
import pandas as pd
import nltk
import torch.nn as nn
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [18]:

# Replace 'your_file.csv' with the actual path to your CSV file
file_path = '15topics.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

In [19]:
df.head()

Unnamed: 0,topic,hashtag,content,hashtag_related 1,hashtag_related 2,hashtag_related 3,hashtag_related 4,hashtag_related 5,author,create_at
0,Politics,NationalPolitics,"Now, a member of Minnesota's Congressional del...",#Business,#Hudson,#UAWStrike,,,strike,2023-10-06 11:52:06+00:00
1,Politics,NationalPolitics,Markus Söder’s crumbling empire https://www.po...,,,,,,politico_eu_bot,2023-10-06 02:18:11.689000+00:00
2,Politics,NationalPolitics,"Poland, Hungary, Slovakia impose own Ukraine g...",,,,,,politico_eu_bot,2023-09-16 09:34:38.494000+00:00
3,Politics,NationalPolitics,Vienna seeks to calm Selmayr ‘blood money’ fur...,,,,,,politico_eu_bot,2023-09-10 17:33:43.008000+00:00
4,Politics,NationalPolitics,Brexiters rage after crowd waves EU flags at R...,,,,,,politico_eu_bot,2023-09-10 15:13:45.314000+00:00


In [20]:
contents = df['content'].tolist()

In [21]:
len(df['topic'].unique())   

15

In [22]:


nltk.download('punkt')

tokenized_contents = [word_tokenize(content) for content in contents]


[nltk_data] Downloading package punkt to /Users/yulin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_sentences = [[stemmer.stem(word) for word in sentence] for sentence in tokenized_contents]
print(stemmed_sentences)



In [24]:
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=tokenized_contents, vector_size=100, window=5, min_count=1, workers=4)


In [25]:


def padding(data, max_length):
    padded_data = []
    for sentence in data:
        if len(sentence) >= max_length:
            padded_data.append(sentence[:max_length])
        else:
            padded_data.append(sentence + [[0]*100] * (max_length - len(sentence)))
    return padded_data

In [26]:


X = [[word2vec.wv[word] for word in sentence] for sentence in tokenized_contents]
X = padding(X, max([len(sentence) for sentence in tokenized_contents]))
y = df['topic'].tolist()

le = LabelEncoder()
y = le.fit_transform(y)

def one_hot(y):
    y_one_hot = []
    for label in y:
        one_hot = [0.] * len(le.classes_)
        one_hot[label] = 1
        y_one_hot.append(one_hot)
    return np.array(y_one_hot)

y = one_hot(y)


In [27]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]
    





In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

train_dataset = Dataset(X_train, y_train)
test_dataset = Dataset(X_test, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

In [29]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # Reshape the input to (batch_size, input_size)
        x = x.view(x.size(0), -1)  # Reshape to (2080, 2864*100)
        hidden_output = self.relu(self.hidden(x))
        output = self.output(hidden_output)
        return output




In [30]:
X = np.array(X)

In [15]:
X.shape

(2080, 2864, 100)

In [33]:
mlp = MLP(2864*100, 100, 15)
criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.0005)

for epoch in range(100):
    loss_train = 0
    precision_train = 0
    for i, (data, labels) in enumerate(train_loader):

        optimizer.zero_grad()
        outputs = mlp(data)
        loss = criteria(outputs, labels)
        loss.backward()
        optimizer.step()
        loss_train += loss.item()
        precision_train += (outputs.argmax(dim=1) == labels.argmax(dim=1)).float().mean()
    loss_train /= len(train_loader)
    precision_train /= len(train_loader)
    print(f'Epoch {epoch+1}: train loss: {loss_train}, train precision: {precision_train}')

    loss_test = 0
    precision_test = 0
    for i, (data, labels) in enumerate(test_loader):
        outputs = mlp(data)
        loss = criteria(outputs, labels)
        loss_test += loss.item()
        precision_test += (outputs.argmax(dim=1) == labels.argmax(dim=1)).float().mean()
    loss_test /= len(test_loader)
    precision_test /= len(test_loader)
    print(f'Epoch {epoch+1}: test loss: {loss_test}, test precision: {precision_test}')

    



Epoch 1: train loss: 2.6139599130703854, train precision: 0.15444710850715637
Epoch 1: test loss: 2.524296815578754, test precision: 0.1875
Epoch 2: train loss: 2.289715134180509, train precision: 0.2662259638309479
Epoch 2: test loss: 2.5074646839728723, test precision: 0.19230769574642181
Epoch 3: train loss: 2.08264346076892, train precision: 0.3413461446762085
Epoch 3: test loss: 2.5650548934936523, test precision: 0.20432692766189575
Epoch 4: train loss: 1.91094986979778, train precision: 0.3930288553237915
Epoch 4: test loss: 2.559135968868549, test precision: 0.24278846383094788
Epoch 5: train loss: 1.7431802176512206, train precision: 0.4447115361690521
Epoch 5: test loss: 2.556150197982788, test precision: 0.23798076808452606
Epoch 6: train loss: 1.6240859031677246, train precision: 0.49399039149284363
Epoch 6: test loss: 2.6992365763737607, test precision: 0.22355769574642181
Epoch 7: train loss: 1.4728673215095813, train precision: 0.5534855723381042
Epoch 7: test loss: 2.71

KeyboardInterrupt: 